import pandas as pd
from sklearn.metrics import roc_auc_score, mean_squared_error
from numpy import sqrt
import cobra.utils as utils
def compute_univariate_preselection(target_enc_train_data: pd.DataFrame,
target_enc_selection_data: pd.DataFrame,
predictors: list,
target_column: str,
model_type: str = "classification",
preselect_auc_threshold: float = 0.053,
preselect_rmse_threshold: float = 5,
preselect_overtrain_threshold: float = 0.05
) -> pd.DataFrame:
"""Perform a preselection of predictors based on an AUC (in case of
classification) or a RMSE (in case of regression) threshold of
a univariate model on a train and selection dataset and return a DataFrame
containing for each variable the train and selection AUC or RMSE along with a
boolean "preselection" column.
As the AUC just calculates the quality of a ranking, all monotonous
transformations of a given ranking (i.e. transformations that do not alter
the ranking itself) will lead to the same AUC.
Hence, pushing a categorical variable (incl. a binned continuous variable)
through a logistic regression will produce exactly the same ranking as
pushing it through incidence replacement (i.e. target encoding),
as it will produce the exact same output: a ranking of the categories on
the training set.
Therefore, no univariate model is trained here as the target encoded train
and selection data is/must be used as inputs for this function. These will
be used as predicted scores to compute the AUC with against the target.
Parameters
----------
model_type : str
Model type ("classification" or "regression").
target_enc_train_data : pd.DataFrame
Train data.
target_enc_selection_data : pd.DataFrame
Selection data.
predictors : list
List of predictor column names present in both the train and selection
data sets.
target_column : str
Name of the target column.
preselect_auc_threshold : float, optional
Threshold on min. AUC to select predictor. Ignored if model_type is "regression".
preselect_rmse_threshold : float, optional
Threshold on max. RMSE to select predictor. Ignored if model_type is "classification".
It is important to note that the threshold depends heavily on the scale of
the target variable, and should be modified accordingly.
preselect_overtrain_threshold : float, optional
Threshold on the difference between train and selection AUC (for
classification) or between selection and train RMSE (for regression).
In both cases the raw difference is compared to the threshold, not a
proportion.
Returns
-------
pd.DataFrame
DataFrame containing for each variable the train AUC or RMSE and
selection AUC or RMSE along with a boolean indicating whether or not it is
selected based on the criteria.
"""
result = []
if model_type == "classification":
for predictor in predictors:
cleaned_predictor = utils.clean_predictor_name(predictor)
auc_train = roc_auc_score(
y_true=target_enc_train_data[target_column],
y_score=target_enc_train_data[predictor])
auc_selection = roc_auc_score(
y_true=target_enc_selection_data[target_column],
y_score=target_enc_selection_data[predictor])
result.append({"predictor": cleaned_predictor,
"AUC train": auc_train,
"AUC selection": auc_selection})
df_auc = pd.DataFrame(result)
# Filter based on min. AUC
auc_thresh = df_auc.loc[:, "AUC selection"] > preselect_auc_threshold
# Identify those variables for which the difference between train
# and selection AUC stays below a user-defined threshold
auc_overtrain = ((df_auc["AUC train"] - df_auc["AUC selection"])
< preselect_overtrain_threshold)
df_auc["preselection"] = auc_thresh & auc_overtrain
df_out = df_auc.sort_values(by="AUC selection", ascending=False).reset_index(drop=True)
elif model_type == "regression":
for predictor in predictors:
cleaned_predictor = utils.clean_predictor_name(predictor)
rmse_train = sqrt(mean_squared_error(
y_true=target_enc_train_data[target_column],
y_pred=target_enc_train_data[predictor]))
rmse_selection = sqrt(mean_squared_error(
y_true=target_enc_selection_data[target_column],
y_pred=target_enc_selection_data[predictor]))
result.append({"predictor": cleaned_predictor,
"RMSE train": rmse_train,
"RMSE selection": rmse_selection})
df_rmse = pd.DataFrame(result)
# Filter based on max. RMSE
rmse_thresh = df_rmse.loc[:, "RMSE selection"] < preselect_rmse_threshold
# Identify those variables for which the difference between selection
# and train RMSE stays below a user-defined threshold
rmse_overtrain = ((df_rmse["RMSE selection"] - df_rmse["RMSE train"])  # subtraction flipped vs. AUC: a higher selection RMSE signals overfitting
< preselect_overtrain_threshold)
df_rmse["preselection"] = rmse_thresh & rmse_overtrain
df_out = df_rmse.sort_values(by="RMSE selection", ascending=True).reset_index(drop=True) # lower is better
return df_out
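# Illustrative notes (not part of the original module):
#
# For intuition on the ranking argument in the docstring above: roc_auc_score
# depends only on the ordering of the scores, so it is invariant under any
# strictly increasing transformation, e.g.
#
#     roc_auc_score(y, s) == roc_auc_score(y, numpy.log1p(s))
#
# for non-negative scores s. This is why the target-encoded values can be used
# directly as scores instead of first fitting a univariate logistic regression.
#
# A minimal usage sketch, assuming hypothetical DataFrames `basetable_train`
# and `basetable_selection` that already contain the target-encoded columns
# "age_enc" and "income_enc" plus a binary "target" column:
#
#     df_metric = compute_univariate_preselection(
#         target_enc_train_data=basetable_train,
#         target_enc_selection_data=basetable_selection,
#         predictors=["age_enc", "income_enc"],
#         target_column="target",
#         model_type="classification")
#     df_metric[["predictor", "AUC train", "AUC selection", "preselection"]]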
def get_preselected_predictors(df_metric: pd.DataFrame) -> list:
"""Wrapper function to extract a list of predictors from df_metric.
Parameters
----------
df_metric : pd.DataFrame
DataFrame containing for each variable the train AUC or RMSE and
selection AUC or RMSE along with a boolean indicating whether or not it is selected
based on the criteria.
Returns
-------
list
List of preselected predictors.
"""
if "AUC selection" in df_metric.columns:
predictor_list = (df_metric[df_metric["preselection"]]
.sort_values(by="AUC selection", ascending=False)
.predictor.tolist())
elif "RMSE selection" in df_metric.columns:
predictor_list = (df_metric[df_metric["preselection"]]
.sort_values(by="RMSE selection", ascending=True) # lower is better
.predictor.tolist())
return [col + "_enc" for col in predictor_list]
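# A minimal usage sketch, continuing the hypothetical example above: take the
# output of compute_univariate_preselection and turn it into the list of
# target-encoded predictor columns to keep (the "_enc" suffix is re-appended
# because the metric DataFrame holds the cleaned predictor names):
#
#     preselected = get_preselected_predictors(df_metric)
#     # e.g. ["age_enc", "income_enc"] if both passed the preselection criteria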
def compute_correlations(target_enc_train_data: pd.DataFrame,
predictors: list) -> pd.DataFrame:
"""Given a DataFrame and a list of predictors, compute the correlations
amongst the predictors in the DataFrame.
Parameters
----------
target_enc_train_data : pd.DataFrame
Data to compute correlation.
predictors : list
List of column names of the DataFrame between which to compute
the correlation matrix.
Returns
-------
pd.DataFrame
The correlation matrix of the training set.
"""
correlations = target_enc_train_data[predictors].corr()
predictors_cleaned = [utils.clean_predictor_name(predictor)
for predictor in predictors]
# Change index and columns with the cleaned version of the predictors
# e.g. change "var1_enc" with "var1"
correlations.columns = predictors_cleaned
correlations.index = predictors_cleaned
return correlations
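# A minimal usage sketch, again with the hypothetical `basetable_train`:
# compute pairwise correlations between the target-encoded predictors
# (pandas' DataFrame.corr() defaults to the Pearson method).
#
#     df_corr = compute_correlations(basetable_train, ["age_enc", "income_enc"])
#     # df_corr is indexed by the cleaned predictor names, e.g. "age" and "income"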