Source code for cobra.evaluation.pigs_tables


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib.ticker import FuncFormatter

import cobra.utils as utils

def generate_pig_tables(basetable: pd.DataFrame,
                        id_column_name: str,
                        target_column_name: str,
                        preprocessed_predictors: list) -> pd.DataFrame:
    """Compute PIG tables for all predictors in preprocessed_predictors.

    The output is a DataFrame with columns ``variable``, ``label``,
    ``pop_size``, ``global_avg_target`` and ``avg_target``. The per-predictor
    tables are stacked in alphabetical order of the predictor column names;
    the row index of each individual table is kept as-is, so index values
    repeat across predictors.

    Parameters
    ----------
    basetable : pd.DataFrame
        Basetable to compute PIG tables from.
    id_column_name : str
        Name of the basetable column containing the IDs of the basetable rows
        (e.g. customernumber).
    target_column_name : str
        Name of the basetable column containing the target values to predict.
    preprocessed_predictors: list
        List of basetable column names containing preprocessed predictors.
        The ID and target columns are skipped if they appear in this list.

    Returns
    -------
    pd.DataFrame
        DataFrame containing a PIG table for all predictors. When no
        predictor is left to process, an empty DataFrame with the columns
        listed above is returned.
    """
    # The ID and the target are never predictors themselves.
    excluded = {id_column_name, target_column_name}
    predictors = [column_name
                  for column_name in sorted(preprocessed_predictors)
                  if column_name not in excluded]

    if not predictors:
        # pd.concat() raises "No objects to concatenate" on an empty list;
        # hand back an empty table with the expected layout instead.
        return pd.DataFrame(columns=["variable", "label", "pop_size",
                                     "global_avg_target", "avg_target"])

    pigs = [
        compute_pig_table(basetable,
                          column_name,
                          target_column_name,
                          id_column_name)
        for column_name in predictors
    ]
    return pd.concat(pigs)


def compute_pig_table(basetable: pd.DataFrame,
                      predictor_column_name: str,
                      target_column_name: str,
                      id_column_name: str) -> pd.DataFrame:
    """Compute the PIG table of a given predictor for a given target.

    The basetable is grouped by the values of the predictor column; for each
    group (bin) the mean of the target and the number of rows are computed.

    Parameters
    ----------
    basetable : pd.DataFrame
        Input data from which to compute the pig table.
    predictor_column_name : str
        Predictor name of which to compute the pig table.
    target_column_name : str
        Name of the target variable.
    id_column_name : str
        Name of the id column (used to count population size).

    Returns
    -------
    pd.DataFrame
        PIG table as a DataFrame with one row per bin and the columns:

        - ``variable``: the predictor name, as returned by
          ``utils.clean_predictor_name``;
        - ``label``: the bin (a distinct value of the predictor column);
        - ``pop_size``: number of rows in the bin divided by the total
          number of rows in ``basetable``;
        - ``global_avg_target``: mean of the target over the whole
          ``basetable`` (same value on every row);
        - ``avg_target``: mean of the target within the bin.
    """
    # Mean of the target over the full basetable, independent of the bins.
    global_avg_target = basetable[target_column_name].mean()

    # Group by the binned variable, compute the incidence
    # (= mean of the target for the given bin) and the bin size
    # (= number of rows, counted on the id column). After that, rename
    # the columns to their PIG table names.
    res = (basetable.groupby(predictor_column_name)
           .agg({target_column_name: "mean", id_column_name: "size"})
           .reset_index()
           .rename(columns={predictor_column_name: "label",
                            target_column_name: "avg_target",
                            id_column_name: "pop_size"}))

    # Add the (cleaned) predictor name and the global average, and turn the
    # absolute bin size into a fraction of the total population.
    res["variable"] = utils.clean_predictor_name(predictor_column_name)
    res["global_avg_target"] = global_avg_target
    res["pop_size"] = res["pop_size"] / len(basetable.index)

    # Make sure to always return the data with the proper column order.
    column_order = ["variable", "label", "pop_size",
                    "global_avg_target", "avg_target"]

    return res[column_order]


def _pick_ticks_style() -> str:
    """Return the name of the seaborn "ticks" matplotlib style to use."""
    # matplotlib 3.6 renamed its bundled seaborn styles to
    # "seaborn-v0_8-<name>" and later releases dropped the old names, so
    # look up whichever spelling the installed version provides.
    for candidate in ("seaborn-v0_8-ticks", "seaborn-ticks"):
        if candidate in plt.style.available:
            return candidate
    return "default"


def _regression_ylim(avg_targets, global_avg_target, margin=0.25):
    """Return widened y-limits for a regression PIG, or None to keep defaults.

    Limits are returned only when both the highest and the lowest bin average
    lie within ``margin`` (relative to the magnitude of the global average)
    of the global average.
    """
    scale = abs(global_avg_target)
    # A zero (or NaN) global average gives no meaningful relative margin.
    if not scale > 0:
        return None

    highest_gap = abs(max(avg_targets) - global_avg_target) / scale
    lowest_gap = abs(min(avg_targets) - global_avg_target) / scale
    if highest_gap < margin and lowest_gap < margin:
        # Built from the magnitude so that lower < upper also holds for a
        # negative global average.
        return (global_avg_target - margin * scale,
                global_avg_target + margin * scale)
    return None


def plot_incidence(pig_tables: pd.DataFrame,
                   variable: str,
                   model_type: str,
                   column_order: list = None,
                   dim: tuple = (12, 8)):
    """Plots a Predictor Insights Graph (PIG), a graph in which the mean
    target value is plotted for a number of bins constructed from a predictor
    variable. When the target is a binary classification target,
    the plotted mean target value is a true incidence rate.

    Bins are ordered in descending order of mean target value
    unless specified otherwise with the `column_order` list.

    The figure is displayed with ``plt.show()``; nothing is returned.

    Parameters
    ----------
    pig_tables: pd.DataFrame
        Dataframe with cleaned, binned, partitioned and prepared data,
        as created by generate_pig_tables() from this module.
    variable: str
        Name of the predictor variable for which the PIG will be plotted.
    model_type: str
        Type of model (either "classification" or "regression").
    column_order: list, default=None
        Explicit order of the value bins of the predictor variable to be used
        on the PIG.
    dim: tuple, default=(12, 8)
        Optional tuple to configure the width and height of the plot.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported values, if
        ``pig_tables`` has no rows for ``variable``, or if ``column_order``
        does not hold exactly the bin labels of ``variable``.
    """
    if model_type not in ("classification", "regression"):
        raise ValueError("An unexpected value was set for the model_type "
                         "parameter. Expected 'classification' or "
                         "'regression'.")
    is_classification = model_type == "classification"

    df_plot = pig_tables[pig_tables['variable'] == variable].copy()
    if df_plot.empty:
        # Fail before any figure is created, with a message that names the
        # actual problem.
        raise ValueError("No PIG table rows found for variable "
                         "'{}'.".format(variable))

    if column_order is not None:
        if set(df_plot['label']) != set(column_order):
            raise ValueError(
                'The column_order and pig_tables parameters do not contain '
                'the same set of variables.')

        # Sorting a categorical column follows the category order, which
        # gives the bins in the caller's requested order. The reordered
        # column is assigned back because reorder_categories() no longer
        # supports in-place modification in recent pandas versions.
        labels = df_plot['label'].astype('category')
        df_plot['label'] = labels.cat.reorder_categories(list(column_order))
        df_plot = df_plot.sort_values(by='label', ascending=True)
    else:
        df_plot = df_plot.sort_values(by='avg_target', ascending=False)
    df_plot = df_plot.reset_index(drop=True)

    with plt.style.context(_pick_ticks_style()):
        fig, ax = plt.subplots(figsize=dim)

        # --------------------------
        # Left axis - average target
        # --------------------------
        ax.plot(df_plot['label'], df_plot['avg_target'],
                color="#00ccff", marker=".",
                markersize=20, linewidth=3,
                label=('incidence rate per bin' if is_classification
                       else "mean target value per bin"),
                zorder=10)

        ax.plot(df_plot['label'], df_plot['global_avg_target'],
                color="#022252", linestyle='--', linewidth=4,
                label=('average incidence rate' if is_classification
                       else "global mean target value"),
                zorder=10)

        # Dummy line to have label on second axis from first
        ax.plot(np.nan, color="#939598", linewidth=6, label='bin size')

        # Set labels & ticks
        ax.set_ylabel('incidence' if is_classification
                      else "mean target value",
                      fontsize=16)
        ax.set_xlabel('{} bins'.format(variable), fontsize=16)
        ax.xaxis.set_tick_params(labelsize=14)
        plt.setp(ax.get_xticklabels(),
                 rotation=45, ha="right", rotation_mode="anchor")
        ax.yaxis.set_tick_params(labelsize=14)

        if is_classification:
            # Mean target values are between 0 and 1 (target incidence rate),
            # so format them as percentages
            ax.set_yticks(np.arange(0, max(df_plot['avg_target']) + 0.05,
                                    0.05))
            ax.yaxis.set_major_formatter(
                FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
        else:
            # If the difference between the highest avg. target of all bins
            # versus the global avg. target AND the difference between the
            # lowest avg. target versus the global avg. target are both
            # smaller than 25% of the global avg. target itself, we increase
            # the y-axis range, to avoid that the minor avg. target
            # differences are spread out over the configured figure height,
            # suggesting incorrectly that there are big differences in avg.
            # target across the bins and versus the global avg. target.
            # (Motivation for the AND above: if on one end there IS enough
            # difference, the effect that we discuss here does not occur.)
            # The global average is the same number on every row.
            y_limits = _regression_ylim(df_plot['avg_target'],
                                        df_plot['global_avg_target'].max())
            if y_limits is not None:
                ax.set_ylim(*y_limits)

        # Remove ticks but keep the labels
        ax.tick_params(axis='both', which='both', length=0)
        ax.tick_params(axis='y', colors="#00ccff")
        ax.yaxis.label.set_color('#00ccff')

        # -----------------
        # Right Axis - bins
        # -----------------
        ax2 = ax.twinx()

        ax2.bar(df_plot['label'], df_plot['pop_size'],
                align='center', color="#939598", zorder=1)

        # Set labels & ticks
        ax2.set_xlabel('{} bins'.format(variable), fontsize=16)
        ax2.xaxis.set_tick_params(rotation=45, labelsize=14)

        ax2.yaxis.set_tick_params(labelsize=14)
        ax2.yaxis.set_major_formatter(
            FuncFormatter(lambda y, _: '{:.1%}'.format(y)))
        ax2.set_ylabel('population size', fontsize=16)
        ax2.tick_params(axis='y', colors="#939598")
        ax2.yaxis.label.set_color('#939598')

        # Despine & prettify
        sns.despine(ax=ax, right=True, left=True)
        sns.despine(ax=ax2, left=True, right=False)
        ax2.spines['right'].set_color('white')

        ax2.grid(False)

        # Title & legend
        if is_classification:
            title = "Incidence plot - " + variable
        else:
            title = "Mean target plot - " + variable
        fig.suptitle(title, fontsize=22)
        ax.legend(frameon=False, bbox_to_anchor=(0., 1.01, 1., .102),
                  loc=3, ncol=1, mode="expand", borderaxespad=0.,
                  prop={"size": 14})

        # Set order of layers: draw the line axis on top of the bar axis and
        # hide its background so the bars stay visible.
        ax.set_zorder(1)
        ax.patch.set_visible(False)

        plt.tight_layout()
        plt.margins(0.01)

        # Show
        plt.show()