Source code for cobra.preprocessing.target_encoder


import logging
import numbers

import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from tqdm.auto import tqdm

log = logging.getLogger(__name__)

class TargetEncoder(BaseEstimator):
    """Target encoding for categorical features, inspired by
    http://contrib.scikit-learn.org/category_encoders/targetencoder.html.

    Replace each value of the categorical feature with the average of the
    target values (in case of a binary target, this is the incidence of the
    group). This encoding scheme is also called Mean encoding.

    Note that, when applying this target encoding, values of the categorical
    feature that have not been seen during fit will be imputed according to
    the configured imputation strategy (replacement with the mean, minimum or
    maximum value of the categorical variable).

    The main problem with Target encoding is overfitting; the fact that we
    are encoding the feature based on target classes may lead to data
    leakage, rendering the feature biased. This can be solved using some type
    of regularization. A popular way to handle this is to use
    cross-validation and compute the means in each out-of-fold. However, the
    approach implemented here makes use of additive smoothing
    (https://en.wikipedia.org/wiki/Additive_smoothing).

    In summary:

    - with a binary classification target, a value of a categorical variable
      is replaced with:

        [count(variable=value) * P(target=1|variable=value)
         + weight * P(target=1)] / [count(variable=value) + weight]

    - with a regression target, a value of a categorical variable is
      replaced with:

        [count(variable=value) * E(target|variable=value)
         + weight * E(target)] / [count(variable=value) + weight]

    Attributes
    ----------
    imputation_strategy : str
        In case there is a particular column which contains new categories,
        the encoding will lead to NULL values which should be imputed.
        Valid strategies then are to replace the NULL values with the global
        mean of the train set or the min (resp. max) incidence of the
        categories of that particular variable.
    weight : float
        Smoothing parameter (non-negative). The higher the value of the
        parameter, the bigger the contribution of the overall mean of targets
        learnt from all training data (prior) and the smaller the
        contribution of the mean target learnt from data with the current
        categorical value (posterior), so the bigger the smoothing
        (regularization) effect. When set to zero, there is no smoothing
        (e.g. the mean target of the current categorical value is used).
    """

    valid_imputation_strategies = ("mean", "min", "max")

    def __init__(self, weight: float = 0.0,
                 imputation_strategy: str = "mean"):
        """Validate and store the encoder's hyperparameters.

        Parameters
        ----------
        weight : float
            Non-negative additive smoothing weight.
        imputation_strategy : str
            One of ``valid_imputation_strategies``.

        Raises
        ------
        ValueError
            If ``weight`` is negative or ``imputation_strategy`` is not one
            of the valid options.
        """
        if weight < 0:
            raise ValueError("The value of weight cannot be smaller than zero.")
        if imputation_strategy not in self.valid_imputation_strategies:
            raise ValueError("Valid options for 'imputation_strategy' are {}."
                             " Got imputation_strategy={!r} instead."
                             .format(self.valid_imputation_strategies,
                                     imputation_strategy))

        if weight == 0:
            log.warning("The target encoder's additive smoothing weight is "
                        "set to 0. This disables smoothing and may make the "
                        "encoding prone to overfitting. Increase the weight "
                        "if needed.")

        self.weight = weight
        self.imputation_strategy = imputation_strategy

        self._mapping = {}  # placeholder for fitted output
        # placeholder for the global incidence of the data used for fitting
        self._global_mean = None

    @staticmethod
    def _is_real_number(value) -> bool:
        """Tell whether ``value`` is a real number (bools excluded).

        ``isinstance`` against ``numbers.Real`` is used instead of an exact
        ``type(...) == float`` comparison so that Python ints and NumPy
        scalars (such as the value ``fit`` stores in ``_global_mean``) are
        recognised too.
        """
        return (isinstance(value, numbers.Real)
                and not isinstance(value, bool))

    def attributes_to_dict(self) -> dict:
        """Return the attributes of TargetEncoder in a dictionary.

        Returns
        -------
        dict
            Contains the attributes of TargetEncoder instance with the names
            as keys.
        """
        params = self.get_params()

        # Each fitted mapping is a pd.Series; store it as a plain dict.
        params["_mapping"] = {
            key: value.to_dict()
            for key, value in self._mapping.items()
        }
        params["_global_mean"] = self._global_mean

        return params

    def set_attributes_from_dict(self, params: dict):
        """Set instance attributes from a dictionary of values with key the
        name of the attribute.

        Entries that are missing or of an unexpected type are ignored, except
        for ``_mapping``: the fitted mapping is always replaced, and becomes
        empty when ``params`` holds no valid ``_mapping`` dictionary.

        Parameters
        ----------
        params : dict
            Contains the attributes of TargetEncoder with their names as key.

        Returns
        -------
        TargetEncoder
            This instance, to allow call chaining.
        """
        if "weight" in params and self._is_real_number(params["weight"]):
            self.weight = params["weight"]

        if ("imputation_strategy" in params and
                params["imputation_strategy"]
                in self.valid_imputation_strategies):
            self.imputation_strategy = params["imputation_strategy"]

        if ("_global_mean" in params
                and self._is_real_number(params["_global_mean"])):
            self._global_mean = params["_global_mean"]

        _mapping = {}
        if "_mapping" in params and isinstance(params["_mapping"], dict):
            _mapping = params["_mapping"]

        def dict_to_series(key, value):
            # Rebuild the Series and name its index after the column.
            s = pd.Series(value)
            s.index.name = key
            return s

        self._mapping = {
            key: dict_to_series(key, value)
            for key, value in _mapping.items()
        }

        return self

    def fit(self, data: pd.DataFrame, column_names: list,
            target_column: str):
        """Fit the TargetEncoder to the data.

        Columns listed in ``column_names`` that are absent from ``data`` are
        skipped with a warning.

        Parameters
        ----------
        data : pd.DataFrame
            Data used to compute the mapping to encode the categorical
            variables with.
        column_names : list
            Columns of data to be encoded.
        target_column : str
            Column name of the target.
        """
        # compute global mean (target incidence in case of binary target)
        y = data[target_column]
        self._global_mean = y.sum() / y.count()

        for column in tqdm(column_names, desc="Fitting target encoding..."):
            if column not in data.columns:
                log.warning("DataFrame has no column '%s', so it will be "
                            "skipped in fitting", column)
                continue

            self._mapping[column] = self._fit_column(data[column], y)

    def _fit_column(self, X: pd.Series, y: pd.Series) -> pd.Series:
        """Compute, for each distinct value of a categorical column, the
        (optionally smoothed) mean target value, following the formulas
        mentioned in the docstring of this class.

        Parameters
        ----------
        X : pd.Series
            Data used to compute the encoding mapping for an individual
            categorical variable.
        y : pd.Series
            Series containing the targets for each observation (value) of
            this categorical variable.

        Returns
        -------
        pd.Series
            Mapping containing the new value to replace each distinct value
            of the categorical variable with.
        """
        stats = y.groupby(X).agg(["mean", "count"])

        # Note: if self.weight = 0, we have the ordinary incidence replacement
        numerator = (stats["count"] * stats["mean"]
                     + self.weight * self._global_mean)
        denominator = stats["count"] + self.weight

        return numerator / denominator

    def transform(self, data: pd.DataFrame,
                  column_names: list) -> pd.DataFrame:
        """Replace (e.g. encode) values of each categorical column with a
        new value (reflecting the corresponding average target value,
        optionally smoothed by a regularization weight), which was computed
        when the fit method was called.

        The encoded columns are added to ``data`` itself, which is then
        returned. Columns that are missing from ``data`` or were not fitted
        are skipped with a warning.

        Parameters
        ----------
        data : pd.DataFrame
            Data to encode.
        column_names : list
            Name of the categorical columns in the data to be encoded.

        Returns
        -------
        pd.DataFrame
            The resulting transformed data.

        Raises
        ------
        NotFittedError
            Exception when TargetEncoder was not fitted before calling this
            method.
        """
        if not self._mapping or self._global_mean is None:
            msg = ("This {} instance is not fitted yet. Call 'fit' with "
                   "appropriate arguments before using this method.")
            raise NotFittedError(msg.format(self.__class__.__name__))

        for column in tqdm(column_names, desc="Applying target encoding..."):
            if column not in data.columns:
                log.warning("Unknown column '%s' will be skipped.", column)
                continue
            if column not in self._mapping:
                log.warning("Column '%s' is not in fitted output "
                            "and will be skipped.", column)
                continue

            data = self._transform_column(data, column)

        return data

    def _transform_column(self, data: pd.DataFrame,
                          column_name: str) -> pd.DataFrame:
        """Replace (e.g. encode) values of a categorical column with a new
        value (reflecting the corresponding average target value, optionally
        smoothed by a regularization weight), which was computed when the fit
        method was called.

        Parameters
        ----------
        data : pd.DataFrame
            Data to encode.
        column_name : str
            Name of the column in the data to be encoded.

        Returns
        -------
        pd.DataFrame
            Resulting transformed data.
        """
        new_column = TargetEncoder._clean_column_name(column_name)

        # Convert dtype to float, because when the original dtype
        # is of type "category", the resulting dtype would otherwise also be
        # of type "category":
        encoded = (data[column_name].map(self._mapping[column_name])
                   .astype("float"))

        # In case of categorical data, it could be that new categories will
        # emerge which were not present in the train set, so this will result
        # in missing values, which should be replaced according to the
        # configured imputation strategy.
        # The filled Series is assigned back explicitly: an in-place fillna
        # on ``data[new_column]`` acts on a temporary object and does not
        # update the DataFrame when pandas Copy-on-Write is enabled.
        if encoded.isnull().any():
            if self.imputation_strategy == "mean":
                encoded = encoded.fillna(self._global_mean)
            elif self.imputation_strategy == "min":
                encoded = encoded.fillna(encoded.min())
            elif self.imputation_strategy == "max":
                encoded = encoded.fillna(encoded.max())

        data[new_column] = encoded

        return data

    def fit_transform(self, data: pd.DataFrame, column_names: list,
                      target_column: str) -> pd.DataFrame:
        """Fit the encoder and transform the data.

        Parameters
        ----------
        data : pd.DataFrame
            Data to be encoded.
        column_names : list
            Columns of data to be encoded.
        target_column : str
            Column name of the target.

        Returns
        -------
        pd.DataFrame
            Data with additional columns, holding the target-encoded
            variables.
        """
        self.fit(data, column_names, target_column)
        return self.transform(data, column_names)

    @staticmethod
    def _clean_column_name(column_name: str) -> str:
        """Generate a name for the new column that this target encoder
        generates in the given data, by removing "_bin", "_processed" or
        "_cleaned" from the original categorical column, and adding "_enc".

        Only the first of these markers found (checked in the order listed)
        is removed; every occurrence of that marker is stripped.

        Parameters
        ----------
        column_name : str
            Column name to be cleaned.

        Returns
        -------
        str
            Cleaned column name.
        """
        for marker in ("_bin", "_processed", "_cleaned"):
            if marker in column_name:
                return column_name.replace(marker, "") + "_enc"
        return column_name + "_enc"