# Source code for cobra.model_building.forward_selection


import logging
from typing import Callable, Optional

import pandas as pd
from tqdm.auto import tqdm

from cobra.model_building import LogisticRegressionModel, LinearRegressionModel

log = logging.getLogger(__name__)

class ForwardFeatureSelection:
    """Perform forward feature selection for a given dataset using a given
    algorithm.

    Predictors are sequentially added to the model, starting with the one that
    has the highest univariate predictive power, and then proceeding with
    those that jointly lead to the best fit, optimizing for selection AUC or
    RMSE. Interaction effects are not explicitly modeled, yet they are
    implicitly present given the feature selection and the underlying feature
    correlation structure.

    Attributes
    ----------
    model_type : str
        Model type (``classification`` or ``regression``).
    MLModel : Cobra model
        LogisticRegressionModel or LinearRegressionModel.
    max_predictors : int
        Maximum number of predictors allowed in any model. This corresponds
        more or less with the maximum number of steps in the forward feature
        selection.
    pos_only : bool
        Whether or not the model coefficients should all be positive
        (no sign flips).
    _fitted_models : list
        List of fitted models.
    """

    def __init__(self,
                 model_type: str = "classification",
                 max_predictors: int = 50,
                 pos_only: bool = True):
        self.model_type = model_type

        if model_type == "classification":
            self.MLModel = LogisticRegressionModel
        elif model_type == "regression":
            self.MLModel = LinearRegressionModel
        else:
            # Fail early: without this, MLModel would stay undefined and the
            # first call to fit() would die with an opaque AttributeError.
            raise ValueError("model_type must be either 'classification' or "
                             f"'regression', got {model_type!r}.")

        self.max_predictors = max_predictors
        self.pos_only = pos_only

        self._fitted_models = []

    def get_model_from_step(self, step: int):
        """Get fitted model from a particular step.

        Parameters
        ----------
        step : int
            Particular step in the forward selection.

        Returns
        -------
        self.MLModel
            Fitted model from the given step.

        Raises
        ------
        ValueError
            In case step is larger than the number of available models.
        """
        if len(self._fitted_models) <= step:
            raise ValueError(f"No model available for step {step}. "
                             "The first step starts from index 0.")

        return self._fitted_models[step]

    def compute_model_performances(self, data: pd.DataFrame,
                                   target_column_name: str,
                                   splits: Optional[list] = None,
                                   metric: Optional[Callable] = None,
                                   ) -> pd.DataFrame:
        """Compute for each model the performance for different sets (e.g.
        train-selection-validation) and return them along with a list of
        predictors used in the model.

        Note that the computation of the performance for each split is cached
        inside the model itself, so it is inexpensive to perform it multiple
        times!

        Parameters
        ----------
        data : pd.DataFrame
            Dataset for which to compute performance of each model. Must
            contain a "split" column.
        target_column_name : str
            Name of the target column.
        splits : list, optional
            List of splits to compute performance on. Defaults to
            ``["train", "selection", "validation"]``.
        metric : Callable (function), optional
            Function that computes an evaluation metric to evaluate the
            model's performances, instead of the default metric (AUC for
            classification, RMSE for regression). The function should require
            y_true and y_pred arguments. Metric functions from sklearn can be
            used, for example, see
            https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

        Returns
        -------
        pd.DataFrame
            Contains for each model the performance for the requested splits
            as well as the set of predictors used in this model. The
            ``last_added_predictor`` column holds the first predictor (in the
            model's own predictor order) that no earlier model used, or None
            when a model adds no new predictor.
        """
        if splits is None:
            splits = ["train", "selection", "validation"]

        # Slice the data once per split instead of twice per model and split.
        # Skipped when there are no models, so that `data` is not touched.
        split_data = ({split: data[data["split"] == split]
                       for split in splits}
                      if self._fitted_models else {})

        results = []
        seen_predictors = set()

        for model in self._fitted_models:
            # Keep the model's own ordering so that the reported predictor is
            # deterministic even when a model adds several predictors at once
            # (e.g. a single model trained on all forced predictors).
            new_predictors = [pred for pred in model.predictors
                              if pred not in seen_predictors]

            row = {
                "predictors": model.predictors,
                "last_added_predictor": (new_predictors[0]
                                         if new_predictors else None),
            }

            # Evaluate model on each dataset split,
            # e.g. train-selection-validation
            for split in splits:
                subset = split_data[split]
                row[f"{split}_performance"] = model.evaluate(
                    subset,
                    subset[target_column_name],
                    split=split,  # parameter used for caching
                    metric=metric
                )

            results.append(row)
            seen_predictors.update(model.predictors)

        df = pd.DataFrame(results)
        df["model_type"] = self.model_type

        return df

    def fit(self, train_data: pd.DataFrame, target_column_name: str,
            predictors: list,
            forced_predictors: Optional[list] = None,
            excluded_predictors: Optional[list] = None):
        """Fit the forward feature selection estimator.

        Any models stored by a previous call to ``fit`` are replaced.

        Parameters
        ----------
        train_data : pd.DataFrame
            Data on which to fit the model. Should include a "train" and
            "selection" split for correct model selection! The "train" split
            is used to train a model, the "selection" split is used to
            evaluate which model to include in the actual forward feature
            selection.
        target_column_name : str
            Name of the target column.
        predictors : list
            List of predictors on which to train the estimator.
        forced_predictors : list, optional
            List of predictors to force in the estimator.
        excluded_predictors : list, optional
            List of predictors to exclude from the estimator.

        Raises
        ------
        AssertionError
            In case train_data has no "split" column or lacks the "train" or
            "selection" split.
        ValueError
            In case the number of forced predictors is larger than the maximum
            number of allowed predictors in the model.
        """
        if forced_predictors is None:
            forced_predictors = []
        if excluded_predictors is None:
            excluded_predictors = []

        # NOTE: kept as asserts so that callers relying on AssertionError are
        # not broken; be aware that these checks vanish under ``python -O``.
        assert "split" in train_data.columns, "The train_data input df does not include a split column."
        assert len(set(["train", "selection"]).difference(set(train_data["split"].unique()))) == 0, \
            "The train_data input df does not include a 'train' and 'selection' split."

        # remove excluded predictors from predictor lists
        filtered_predictors = [var for var in predictors
                               if (var not in excluded_predictors and
                                   var not in forced_predictors)]

        # checks on predictor lists and self.max_predictors attr
        if len(forced_predictors) > self.max_predictors:
            raise ValueError("Size of forced_predictors cannot be bigger than "
                             "max_predictors.")
        elif len(forced_predictors) == self.max_predictors:
            log.info("Size of forced_predictors equals max_predictors "
                     "only one model will be trained...")
            # train model with all forced_predictors (only)
            model = self._train_model(
                train_data[train_data["split"] == "train"],
                target_column_name,
                forced_predictors)
            # Replace rather than append, so that calling fit() again does not
            # pile up models from earlier fits.
            self._fitted_models = [model]
        else:
            self._fitted_models = self._forward_selection(train_data,
                                                          target_column_name,
                                                          filtered_predictors,
                                                          forced_predictors)

    def _forward_selection(self, train_data: pd.DataFrame,
                           target_column_name: str, predictors: list,
                           forced_predictors: Optional[list] = None) -> list:
        """Perform the forward feature selection algorithm to compute a list
        of models (with increasing performance).

        The length of the list, i.e. the number of models, is bounded by the
        max_predictors class attribute.

        Parameters
        ----------
        train_data : pd.DataFrame
            Data on which to fit the model.
        target_column_name : str
            Name of the target column.
        predictors : list
            List of predictors on which to train the models.
        forced_predictors : list, optional
            List of predictors to force in the models.

        Returns
        -------
        list
            List of fitted models where the index of the list indicates the
            number of predictors minus one (as indices start from 0).
        """
        if forced_predictors is None:
            forced_predictors = []

        fitted_models = []
        current_predictors = []

        max_steps = 1 + min(self.max_predictors,
                            len(predictors) + len(forced_predictors))

        for step in tqdm(range(1, max_steps),
                         desc="Sequentially adding best predictor..."):
            if step <= len(forced_predictors):
                # first, we go through the forced predictors
                pool = forced_predictors
            else:
                pool = predictors + forced_predictors

            candidate_predictors = [var for var in pool
                                    if var not in current_predictors]

            model = self._find_next_best_model(train_data,
                                               target_column_name,
                                               candidate_predictors,
                                               current_predictors)

            # When no model is found, the loop deliberately keeps running (no
            # break) so that the tqdm progress bar finishes cleanly.
            if model is not None:
                # Add new model predictors to the list of current predictors.
                # Appending in order (instead of a set union) keeps the
                # predictor order stable and reproducible across runs.
                current_predictors = current_predictors + [
                    pred for pred in model.predictors
                    if pred not in current_predictors
                ]
                fitted_models.append(model)

        if not fitted_models:
            log.error("No models found in forward selection.")

        return fitted_models

    def _find_next_best_model(self, train_data: pd.DataFrame,
                              target_column_name: str,
                              candidate_predictors: list,
                              current_predictors: list):
        """Given a list of current predictors which are already selected to
        be included in the model, find amongst a list of candidate predictors
        the predictor to add to the selected list so that the resulting model
        has the best performance.

        Parameters
        ----------
        train_data : pd.DataFrame
            Data on which to fit the model.
        target_column_name : str
            Name of the target column.
        candidate_predictors : list
            List of candidate predictors to test.
        current_predictors : list
            List of predictors on which to train the models.

        Returns
        -------
        self.MLModel
            Best performing model, or None when no candidate yields an
            acceptable model.

        Raises
        ------
        ValueError
            In case self.MLModel is neither LogisticRegressionModel nor
            LinearRegressionModel.
        """
        # placeholders
        best_model = None
        if self.MLModel == LogisticRegressionModel:
            best_performance = -1  # AUC metric is used
            higher_is_better = True
        elif self.MLModel == LinearRegressionModel:
            best_performance = float("inf")  # RMSE metric is used
            higher_is_better = False
        else:
            raise ValueError("No metric comparison method has been configured "
                             "for the given model_type specified as "
                             "ForwardFeatureSelection argument.")

        # data to fit the models with
        fit_data = train_data[train_data["split"] == "train"]
        # data to compare the models with
        sel_data = train_data[train_data["split"] == "selection"]

        for pred in candidate_predictors:
            trial_predictors = current_predictors + [pred]

            # Train a model with an additional predictor
            model = self._train_model(fit_data, target_column_name,
                                      trial_predictors)

            # Discard models with sign flips before spending time on an
            # evaluation whose result would be thrown away anyway.
            if self.pos_only and (not (model.get_coef() >= 0).all()):
                continue

            # Evaluate the model
            performance = model.evaluate(sel_data[trial_predictors],
                                         sel_data[target_column_name],
                                         split="selection")

            # Check if the model is better than the current best model
            # and if it is, replace the current best.
            if higher_is_better:
                improved = performance > best_performance
            else:
                improved = performance < best_performance

            if improved:
                best_performance = performance
                best_model = model

        return best_model

    def _train_model(self, train_data: pd.DataFrame, target_column_name: str,
                     predictors: list):
        """Train the model with a given set of predictors.

        Parameters
        ----------
        train_data : pd.DataFrame
            Data on which to fit the model.
        target_column_name : str
            Name of the target column.
        predictors : list
            List of predictors on which to train the models.

        Returns
        -------
        self.MLModel
            Trained model.
        """
        model = self.MLModel()

        model.fit(train_data[predictors], train_data[target_column_name])

        return model