Source code for cobra.model_building.models


from typing import Callable, Optional

# third party imports
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression

# custom imports
import cobra.utils as utils
from cobra.evaluation import ClassificationEvaluator

class LogisticRegressionModel:
    """Wrapper around the LogisticRegression class, with additional methods
    implemented such as evaluation (using AUC), getting a list of
    coefficients, a dictionary of coefficients per predictor, ... for
    convenience.

    Attributes
    ----------
    logit : LogisticRegression
        scikit-learn logistic regression model.
    predictors : list
        List of predictors used in the model.
    """

    def __init__(self):
        self.logit = LogisticRegression(fit_intercept=True, C=1e9,
                                        solver='liblinear', random_state=42)
        self._is_fitted = False
        # placeholder to keep track of a list of predictors
        self.predictors = []
        self._eval_metrics_by_split = {}
    def serialize(self) -> dict:
        """Serialize model as JSON.

        Returns
        -------
        dict
            Dictionary containing the serialized JSON.
        """
        serialized_model = {
            "meta": "logistic-regression",
            "predictors": self.predictors,
            "_eval_metrics_by_split": self._eval_metrics_by_split,
            "params": self.logit.get_params()
        }

        if self._is_fitted:
            serialized_model.update({
                "classes_": self.logit.classes_.tolist(),
                "coef_": self.logit.coef_.tolist(),
                "intercept_": self.logit.intercept_.tolist(),
                "n_iter_": self.logit.n_iter_.tolist(),
            })

        return serialized_model
    def deserialize(self, model_dict: dict):
        """Deserialize a model previously stored as JSON.

        Parameters
        ----------
        model_dict : dict
            Serialized JSON file as a dict.

        Raises
        ------
        ValueError
            In case the dict is not a valid serialized model.
        """
        if not self._is_valid_dict(model_dict):
            raise ValueError("No valid serialized model")

        self.logit = LogisticRegression()
        self.logit.set_params(**model_dict["params"])
        self.logit.classes_ = np.array(model_dict["classes_"])
        self.logit.coef_ = np.array(model_dict["coef_"])
        self.logit.intercept_ = np.array(model_dict["intercept_"])
        self.logit.n_iter_ = np.array(model_dict["n_iter_"])
        self._is_fitted = True

        self.predictors = model_dict["predictors"]
        self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
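    # A minimal persistence sketch (illustrative, not part of the original
    # module); "model.json" and the `model` variable are hypothetical:
    #
    #     import json
    #     with open("model.json", "w") as f:
    #         json.dump(model.serialize(), f)
    #     restored = LogisticRegressionModel()
    #     with open("model.json") as f:
    #         restored.deserialize(json.load(f))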
    def get_coef(self) -> np.ndarray:
        """Returns the model coefficients.

        Returns
        -------
        np.ndarray
            Array of model coefficients.
        """
        return self.logit.coef_[0]
    def get_intercept(self) -> float:
        """Returns the intercept of the model.

        Returns
        -------
        float
            Intercept of the model.
        """
        return self.logit.intercept_[0]
    def get_coef_by_predictor(self) -> dict:
        """Returns a dictionary mapping predictor (key) to coefficient
        (value).

        Returns
        -------
        dict
            A map ``{predictor: coefficient}``.
        """
        return dict(zip(self.predictors, self.logit.coef_[0]))
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Fit the model.

        Parameters
        ----------
        X_train : pd.DataFrame
            Predictors of train data.
        y_train : pd.Series
            Target of train data.
        """
        self.predictors = list(X_train.columns)
        self.logit.fit(X_train, y_train)
        self._is_fitted = True
    def score_model(self, X: pd.DataFrame) -> np.ndarray:
        """Score a model on a (new) dataset.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset of predictors to score the model.

        Returns
        -------
        np.ndarray
            Score (i.e. predicted probabilities) of the model for each
            observation.
        """
        # We select predictor columns (self.predictors) here to
        # ensure we have the proper predictors and the proper order
        return self.logit.predict_proba(X[self.predictors])[:, 1]
    def evaluate(self, X: pd.DataFrame, y: pd.Series,
                 split: Optional[str] = None,
                 metric: Optional[Callable] = None) -> float:
        """Evaluate the model on a given dataset (X, y). The optional split
        parameter indicates which split the dataset belongs to (train,
        selection, or validation), so that the metric computed on that split
        can be cached.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset containing the predictor values for each observation.
        y : pd.Series
            Dataset containing the target of each observation.
        split : str, optional
            Split name of the dataset (e.g. "train", "selection", or
            "validation").
        metric : Callable, optional
            Function that computes an evaluation metric to evaluate the
            model's performance, instead of the default metric (AUC).
            The function should accept y_true and y_pred (binary output)
            arguments. Metric functions from scikit-learn can be used,
            see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

        Returns
        -------
        float
            The performance score of the model (AUC by default).
        """
        if metric is not None:  # decouple from _eval_metrics_by_split attribute
            y_pred = self.score_model(X)
            fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred)
            cutoff = ClassificationEvaluator._compute_optimal_cutoff(
                fpr, tpr, thresholds)
            y_pred_b = np.where(y_pred > cutoff, 1, 0)
            return metric(y_true=y, y_pred=y_pred_b)

        if (split is None) or (split not in self._eval_metrics_by_split):
            y_pred = self.score_model(X)
            performance = roc_auc_score(y_true=y, y_score=y_pred)

            if split is None:
                return performance

            self._eval_metrics_by_split[split] = performance

        return self._eval_metrics_by_split[split]
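    # Illustrative call patterns for evaluate() (hypothetical variable names;
    # any scikit-learn metric taking y_true and binary y_pred works):
    #
    #     auc = model.evaluate(X_sel, y_sel, split="selection")  # AUC, cached
    #     from sklearn.metrics import f1_score
    #     f1 = model.evaluate(X_sel, y_sel, metric=f1_score)     # never cached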
    def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
        """Compute the importance of each predictor in the model and return
        it as a DataFrame.

        Parameters
        ----------
        data : pd.DataFrame
            Data to score the model.

        Returns
        -------
        pd.DataFrame
            DataFrame containing columns predictor and importance.
        """
        y_pred = self.score_model(data)

        importance_by_variable = {
            utils.clean_predictor_name(predictor): stats.pearsonr(
                data[predictor], y_pred
            )[0]
            for predictor in self.predictors
        }

        df = pd.DataFrame.from_dict(importance_by_variable,
                                    orient="index").reset_index()
        df.columns = ["predictor", "importance"]

        return (df.sort_values(by="importance", ascending=False)
                .reset_index(drop=True))
    def _is_valid_dict(self, model_dict: dict) -> bool:
        if ("meta" not in model_dict
                or model_dict["meta"] != "logistic-regression"):
            return False

        attr = ["classes_", "coef_", "intercept_", "n_iter_", "predictors"]
        for key in attr:
            if key not in model_dict or not isinstance(model_dict[key], list):
                return False

        if ("params" not in model_dict
                or "_eval_metrics_by_split" not in model_dict):
            return False

        return True
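
# A minimal fit-and-inspect sketch for LogisticRegressionModel (illustrative
# only; the variable and column names below are hypothetical):
#
#     model = LogisticRegressionModel()
#     model.fit(X_train, y_train)                  # X_train: pd.DataFrame
#     probs = model.score_model(X_test)            # predicted P(y=1)
#     print(model.get_coef_by_predictor())         # {predictor: coefficient}
#     print(model.compute_variable_importance(X_test))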
class LinearRegressionModel:
    """Wrapper around the LinearRegression class, with additional methods
    implemented such as evaluation (using RMSE), getting a list of
    coefficients, a dictionary of coefficients per predictor, ... for
    convenience.

    Attributes
    ----------
    linear : LinearRegression
        scikit-learn linear regression model.
    predictors : list
        List of predictors used in the model.
    """

    def __init__(self):
        # The `normalize` argument was removed from LinearRegression in
        # scikit-learn 1.2; its former default was False.
        self.linear = LinearRegression(fit_intercept=True)
        self._is_fitted = False
        # placeholder to keep track of a list of predictors
        self.predictors = []
        self._eval_metrics_by_split = {}
    def serialize(self) -> dict:
        """Serialize model as JSON.

        Returns
        -------
        dict
            Dictionary containing the serialized JSON.
        """
        serialized_model = {
            "meta": "linear-regression",
            "predictors": self.predictors,
            "_eval_metrics_by_split": self._eval_metrics_by_split,
            "params": self.linear.get_params()
        }

        if self._is_fitted:
            serialized_model.update({
                "coef_": self.linear.coef_.tolist(),
                # np.ravel ensures a list even when the fitted intercept
                # is a scalar (the case for a 1-D target)
                "intercept_": np.ravel(self.linear.intercept_).tolist()
            })

        return serialized_model
    def deserialize(self, model_dict: dict):
        """Deserialize a model previously stored as JSON.

        Parameters
        ----------
        model_dict : dict
            Serialized JSON file as a dict.

        Raises
        ------
        ValueError
            In case the dict is not a valid serialized model.
        """
        if not self._is_valid_dict(model_dict):
            raise ValueError("No valid serialized model")

        self.linear = LinearRegression()
        self.linear.set_params(**model_dict["params"])
        self.linear.coef_ = np.array(model_dict["coef_"])
        self.linear.intercept_ = np.array(model_dict["intercept_"])
        self._is_fitted = True

        self.predictors = model_dict["predictors"]
        self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
    def get_coef(self) -> np.ndarray:
        """Returns the model coefficients.

        Returns
        -------
        np.ndarray
            Array of model coefficients.
        """
        return self.linear.coef_
    def get_intercept(self) -> float:
        """Returns the intercept of the model.

        Returns
        -------
        float
            Intercept of the model.
        """
        # LinearRegression stores a scalar intercept for a 1-D target;
        # np.ravel handles both the scalar and the array case.
        return float(np.ravel(self.linear.intercept_)[0])
    def get_coef_by_predictor(self) -> dict:
        """Returns a dictionary mapping predictor (key) to coefficient
        (value).

        Returns
        -------
        dict
            A map ``{predictor: coefficient}``.
        """
        return dict(zip(self.predictors, self.linear.coef_))
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Fit the model.

        Parameters
        ----------
        X_train : pd.DataFrame
            Predictors of train data.
        y_train : pd.Series
            Target of train data.
        """
        self.predictors = list(X_train.columns)
        self.linear.fit(X_train, y_train)
        self._is_fitted = True
    def score_model(self, X: pd.DataFrame) -> np.ndarray:
        """Score a model on a (new) dataset.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset of predictors to score the model.

        Returns
        -------
        np.ndarray
            Score of the model for each observation.
        """
        # We select predictor columns (self.predictors) here to
        # ensure we have the proper predictors and the proper order
        return self.linear.predict(X[self.predictors])
    def evaluate(self, X: pd.DataFrame, y: pd.Series,
                 split: Optional[str] = None,
                 metric: Optional[Callable] = None) -> float:
        """Evaluate the model on a given dataset (X, y). The optional split
        parameter indicates which split the dataset belongs to (train,
        selection, or validation), so that the metric computed on that split
        can be cached.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset containing the predictor values for each observation.
        y : pd.Series
            Dataset containing the target of each observation.
        split : str, optional
            Split name of the dataset (e.g. "train", "selection", or
            "validation").
        metric : Callable, optional
            Function that computes an evaluation metric to evaluate the
            model's performance, instead of the default metric (RMSE).
            The function should accept y_true and y_pred arguments.
            Metric functions from scikit-learn can be used,
            see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

        Returns
        -------
        float
            The performance score of the model (RMSE by default).
        """
        if metric is not None:  # decouple from _eval_metrics_by_split attribute
            y_pred = self.score_model(X)
            return metric(y_true=y, y_pred=y_pred)

        if (split is None) or (split not in self._eval_metrics_by_split):
            y_pred = self.score_model(X)
            performance = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))

            if split is None:
                return performance

            self._eval_metrics_by_split[split] = performance

        return self._eval_metrics_by_split[split]
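    # Illustrative call patterns (hypothetical variable names; any
    # scikit-learn regression metric taking y_true and y_pred works):
    #
    #     rmse = model.evaluate(X_val, y_val, split="validation")  # cached
    #     from sklearn.metrics import mean_absolute_error
    #     mae = model.evaluate(X_val, y_val, metric=mean_absolute_error)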
    def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
        """Compute the importance of each predictor in the model and return
        it as a DataFrame.

        Parameters
        ----------
        data : pd.DataFrame
            Data to score the model.

        Returns
        -------
        pd.DataFrame
            DataFrame containing columns predictor and importance.
        """
        y_pred = self.score_model(data)

        importance_by_variable = {
            utils.clean_predictor_name(predictor): stats.pearsonr(
                data[predictor], y_pred
            )[0]
            for predictor in self.predictors
        }

        df = pd.DataFrame.from_dict(importance_by_variable,
                                    orient="index").reset_index()
        df.columns = ["predictor", "importance"]

        return (df.sort_values(by="importance", ascending=False)
                .reset_index(drop=True))
    def _is_valid_dict(self, model_dict: dict) -> bool:
        if ("meta" not in model_dict
                or model_dict["meta"] != "linear-regression"):
            return False

        attr = ["coef_", "intercept_", "predictors"]
        for key in attr:
            if key not in model_dict or not isinstance(model_dict[key], list):
                return False

        if ("params" not in model_dict
                or "_eval_metrics_by_split" not in model_dict):
            return False

        return True
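
# A runnable demonstration sketch (not part of the original module). The
# synthetic data, column names, and seed below are made up for illustration;
# the "_enc" suffix mimics cobra's preprocessed-column naming convention.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    X = pd.DataFrame({"x1_enc": rng.uniform(size=200),
                      "x2_enc": rng.uniform(size=200)})

    # Linear regression: y is a noisy linear combination of the predictors.
    y_lin = pd.Series(3.0 * X["x1_enc"] - 1.5 * X["x2_enc"]
                      + rng.normal(scale=0.1, size=200))
    lin_model = LinearRegressionModel()
    lin_model.fit(X, y_lin)
    print("RMSE:", lin_model.evaluate(X, y_lin, split="train"))
    print(lin_model.compute_variable_importance(X))

    # Logistic regression: binary target derived from one predictor.
    y_bin = pd.Series((X["x1_enc"] > 0.5).astype(int))
    log_model = LogisticRegressionModel()
    log_model.fit(X, y_bin)
    print("AUC:", log_model.evaluate(X, y_bin, split="train"))

    # JSON round-trip: the restored model should score identically.
    restored = LogisticRegressionModel()
    restored.deserialize(log_model.serialize())
    assert np.allclose(log_model.score_model(X), restored.score_model(X))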