Source code for fast_eda.fast_eda

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype


[docs]
def describe_function(df):
    """
    Generate summary statistics for numeric columns in the DataFrame.

    This function computes basic statistics such as mean, median, 
    standard deviation, minimum, and maximum for each numeric column 
    in the DataFrame, providing an overview of the central tendency 
    and spread of the data.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing numeric columns.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the calculated summary statistics for 
        each numeric column.
    
    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [1, 2, 3, 4, 5], 'B': [10, 20, 30, 40, 50]})
    >>> describe_function(df)
    """
    summary = df.describe()
    
    # Add median (50%) to the summary
    median = df.median()
    
    # Add the mean and standard deviation
    mean = df.mean()
    std = df.std()

    # Create a new DataFrame with the desired statistics
    summary.loc['mean'] = mean
    summary.loc['50%'] = median
    summary.loc['std'] = std

    return summary



[docs]
def distribution_plots(df, c, r,  figsize = (10,  6), col_ovr=None):
    """
    Plots distributions of columns from a DataFrame using Matplotlib subplots and Seaborn plots.

    This function creates a grid of subplots to visualize the distributions of specified columns from a DataFrame.
    Numeric columns are plotted using histograms, while string columns are plotted using bar plots.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame containing the data to be plotted. Must not be empty.

    c : int
        The number of columns in the subplot grid.

    r : int
        The number of rows in the subplot grid.

    figsize : tuple of int, optional, default=(10, 6)
        The size of the figure in inches (width, height).

    col_ovr : list of str, optional
        A list of column names to plot. If None, all columns in the DataFrame are used.
        Must be a subset of the DataFrame's columns.

    Returns
    -------
    fig : matplotlib.figure.Figure
        The Matplotlib figure object containing the subplots.

    axes : numpy.ndarray of matplotlib.axes._subplots.AxesSubplot
        An array of Axes objects corresponding to the subplots.

    Raises
    ------
    AssertionError
        If input validation fails for any of the parameters.

    Notes
    -----
    - The function handles both numeric and string columns differently:
        - Numeric columns: Plotted using Seaborn's `histplot`.
        - String columns: Plotted using Seaborn's `barplot` without error bars.
    - Any unused subplot axes are hidden to prevent empty plots.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({
    ...     'numeric_col': [1, 2, 3, 4, 5],
    ...     'string_col': ['a', 'b', 'a', 'c', 'b']
    ... })
    >>> fig, axes = distribution_plotting_function(df, c=2, r=1)
    """

    # user inpute validation
    assert isinstance(df, pd.DataFrame), 'df is not a pd.DataFrame'
    assert len(df) > 0, 'df is an empty DataFrame'
    assert isinstance(c, int) and c > 0, 'c is not an integer > 0'
    assert isinstance(r, int) and r > 0, 'r is not an integer > 0'
    assert isinstance(figsize, tuple), 'figsize is not a tuple of 2 integers'
    if isinstance(figsize, tuple):
        assert len(figsize) == 2, 'figsize is not a tuple of 2 integers'
        assert all(isinstance(x, int) for x in figsize), 'figsize is not a tuple of 2 integers'
    if col_ovr:
        assert isinstance(col_ovr, list), 'col_ovr is not a list of string'
        assert all(isinstance(x, str) for x in col_ovr), 'col_ovr is not a list of string'
        assert set(col_ovr).issubset(set(df.columns.tolist())), 'some columns are not in df'

    columns = col_ovr if col_ovr else df.columns.tolist()
    d = len(columns)
    
    fig, axes = plt.subplots(
        nrows=r, ncols=c, figsize=figsize, sharex=False, sharey=False, squeeze=False
    )
    axes_flat = axes.flatten()
    for i, col in enumerate(columns):
        series = df[col]
        if is_numeric_dtype(series):
            sns.histplot(series, ax=axes_flat[i])
        elif is_string_dtype(series):
            sns.barplot(series, errorbar=None, ax=axes_flat[i])
     # hide unuse Axes objects
    if d < len(axes_flat):
        for ax in axes_flat[d:]:
            ax.set_visible(False)
    fig.tight_layout()
    return fig, axes



[docs]
def count_nulls(df):
    """
    Count missing values in each column of the DataFrame.

    This function calculates the number of missing (NaN) values in each column 
    of the DataFrame, assisting in identifying columns that need cleaning or 
    imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame to be analyzed.

    Returns
    -------
    pandas.Series
        A Series with column names as the index and the count of missing values 
        in each column as the values.

    Raises
    ------
    ValueError
        If the input is not a pandas DataFrame.
    
    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [1, None, 3], 'B': [None, 2, 3]})
    >>> count_nulls(df)
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame")
    missing_counts = df.isnull().sum().astype('int64')
    
    return missing_counts



[docs]
def correlation_matrix_viz(df):
    """
    Generate a correlation matrix visualization for numeric columns in a DataFrame.

    This function computes the Spearman correlation coefficients between all numeric 
    columns in the provided DataFrame. The resulting correlation matrix is transformed 
    into a long-form DataFrame suitable for visualization, and an interactive Altair 
    scatter plot is created to display the correlations.

    The visualization includes:
    - **X-axis and Y-axis**: The pair of features being compared.
    - **Circle size**: The magnitude of the absolute correlation value, indicating the strength of the relationship.
    - **Color**: The direction and strength of the correlation (positive or negative), represented using a diverging color scale.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing numeric columns for correlation analysis.

    Returns
    -------
    alt.Chart
        An interactive Altair chart visualizing the correlation matrix.

    Notes
    -----
    - Self-correlations (diagonal values) are set to 0 to avoid cluttering the plot.
    - Non-numeric columns are ignored in the computation.
    
    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 4, 6], 'C': [5, 3, 1]})
    >>> correlation_matrix_viz(df)
    """
    import altair as alt
    corr_df = df.select_dtypes('number').corr('spearman', numeric_only=True).stack().reset_index(name='corr')
    corr_df.loc[corr_df['corr'] == 1, 'corr'] = 0  # Remove diagonal
    corr_df['abs'] = corr_df['corr'].abs()
    chart = alt.Chart(corr_df).mark_circle().encode(
        x='level_0',
        y='level_1',
        size=alt.Size('abs').scale(domain=(0, 1)),
        color=alt.Color('corr').scale(scheme='blueorange', domain=(-1, 1))
    )
    return chart