Source code for cobra.model_building.models


from typing import Callable, Optional

# third party imports
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression

# custom imports
import cobra.utils as utils
from cobra.evaluation import ClassificationEvaluator

class LogisticRegressionModel:
    """Wrapper around the LogisticRegression class, with additional methods
    implemented such as evaluation (using AUC), getting a list of
    coefficients, a dictionary of coefficients per predictor, ... for
    convenience.

    Attributes
    ----------
    logit : LogisticRegression
        scikit-learn logistic regression model.
    predictors : list
        List of predictors used in the model.
    """

    def __init__(self):
        self.logit = LogisticRegression(fit_intercept=True, C=1e9,
                                        solver='liblinear', random_state=42)
        self._is_fitted = False
        # placeholder to keep track of a list of predictors
        self.predictors = []
        self._eval_metrics_by_split = {}
    def serialize(self) -> dict:
        """Serialize model as JSON.

        Returns
        -------
        dict
            Dictionary containing the serialized JSON.
        """
        serialized_model = {
            "meta": "logistic-regression",
            "predictors": self.predictors,
            "_eval_metrics_by_split": self._eval_metrics_by_split,
            "params": self.logit.get_params()
        }

        if self._is_fitted:
            serialized_model.update({
                "classes_": self.logit.classes_.tolist(),
                "coef_": self.logit.coef_.tolist(),
                "intercept_": self.logit.intercept_.tolist(),
                "n_iter_": self.logit.n_iter_.tolist(),
            })

        return serialized_model
    def deserialize(self, model_dict: dict):
        """Deserialize a model previously stored as JSON.

        Parameters
        ----------
        model_dict : dict
            Serialized JSON file as a dict.

        Raises
        ------
        ValueError
            In case the dict is not a valid serialized model.
        """
        if not self._is_valid_dict(model_dict):
            raise ValueError("No valid serialized model")

        self.logit = LogisticRegression()
        self.logit.set_params(**model_dict["params"])
        self.logit.classes_ = np.array(model_dict["classes_"])
        self.logit.coef_ = np.array(model_dict["coef_"])
        self.logit.intercept_ = np.array(model_dict["intercept_"])
        self.logit.n_iter_ = np.array(model_dict["n_iter_"])
        self._is_fitted = True

        self.predictors = model_dict["predictors"]
        self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
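    # A minimal persistence sketch (illustrative, not part of the original
    # module); "model.json" and the `model` variable are hypothetical:
    #
    #     import json
    #     with open("model.json", "w") as f:
    #         json.dump(model.serialize(), f)
    #     restored = LogisticRegressionModel()
    #     with open("model.json") as f:
    #         restored.deserialize(json.load(f))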
    def get_coef(self) -> np.ndarray:
        """Returns the model coefficients.

        Returns
        -------
        np.ndarray
            Array of model coefficients.
        """
        return self.logit.coef_[0]
    def get_intercept(self) -> float:
        """Returns the intercept of the model.

        Returns
        -------
        float
            Intercept of the model.
        """
        return self.logit.intercept_[0]
    def get_coef_by_predictor(self) -> dict:
        """Returns a dictionary mapping predictor (key) to coefficient
        (value).

        Returns
        -------
        dict
            A map ``{predictor: coefficient}``.
        """
        return dict(zip(self.predictors, self.logit.coef_[0]))
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Fit the model.

        Parameters
        ----------
        X_train : pd.DataFrame
            Predictors of train data.
        y_train : pd.Series
            Target of train data.
        """
        self.predictors = list(X_train.columns)
        self.logit.fit(X_train, y_train)
        self._is_fitted = True
    def score_model(self, X: pd.DataFrame) -> np.ndarray:
        """Score a model on a (new) dataset.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset of predictors to score the model.

        Returns
        -------
        np.ndarray
            Score (i.e. predicted probabilities) of the model for each
            observation.
        """
        # We select predictor columns (self.predictors) here to
        # ensure we have the proper predictors and the proper order
        return self.logit.predict_proba(X[self.predictors])[:, 1]
    def evaluate(self, X: pd.DataFrame, y: pd.Series,
                 split: Optional[str] = None,
                 metric: Optional[Callable] = None) -> float:
        """Evaluate the model on a given dataset (X, y). The optional split
        parameter indicates which split the dataset belongs to (train,
        selection, or validation), so that the metric computed on that split
        can be cached.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset containing the predictor values for each observation.
        y : pd.Series
            Dataset containing the target of each observation.
        split : str, optional
            Split name of the dataset (e.g. "train", "selection", or
            "validation").
        metric : Callable, optional
            Function that computes an evaluation metric to evaluate the
            model's performance, instead of the default metric (AUC).
            The function should accept y_true and y_pred (binary output)
            arguments. Metric functions from scikit-learn can be used,
            see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

        Returns
        -------
        float
            The performance score of the model (AUC by default).
        """
        if metric is not None:  # decouple from _eval_metrics_by_split attribute
            y_pred = self.score_model(X)
            fpr, tpr, thresholds = roc_curve(y_true=y, y_score=y_pred)
            cutoff = ClassificationEvaluator._compute_optimal_cutoff(
                fpr, tpr, thresholds)
            y_pred_b = np.where(y_pred > cutoff, 1, 0)
            return metric(y_true=y, y_pred=y_pred_b)

        if (split is None) or (split not in self._eval_metrics_by_split):
            y_pred = self.score_model(X)
            performance = roc_auc_score(y_true=y, y_score=y_pred)

            if split is None:
                return performance

            self._eval_metrics_by_split[split] = performance

        return self._eval_metrics_by_split[split]
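    # Illustrative call patterns for evaluate() (hypothetical variable names;
    # any scikit-learn metric taking y_true and binary y_pred works):
    #
    #     auc = model.evaluate(X_sel, y_sel, split="selection")  # AUC, cached
    #     from sklearn.metrics import f1_score
    #     f1 = model.evaluate(X_sel, y_sel, metric=f1_score)     # never cached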
    def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
        """Compute the importance of each predictor in the model and return
        it as a DataFrame.

        Parameters
        ----------
        data : pd.DataFrame
            Data to score the model.

        Returns
        -------
        pd.DataFrame
            DataFrame containing columns predictor and importance.
        """
        y_pred = self.score_model(data)

        importance_by_variable = {
            utils.clean_predictor_name(predictor): stats.pearsonr(
                data[predictor], y_pred
            )[0]
            for predictor in self.predictors
        }

        df = pd.DataFrame.from_dict(importance_by_variable,
                                    orient="index").reset_index()
        df.columns = ["predictor", "importance"]

        return (df.sort_values(by="importance", ascending=False)
                .reset_index(drop=True))
    def _is_valid_dict(self, model_dict: dict) -> bool:
        if ("meta" not in model_dict
                or model_dict["meta"] != "logistic-regression"):
            return False

        attr = ["classes_", "coef_", "intercept_", "n_iter_", "predictors"]
        for key in attr:
            if key not in model_dict or not isinstance(model_dict[key], list):
                return False

        if ("params" not in model_dict
                or "_eval_metrics_by_split" not in model_dict):
            return False

        return True
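
# A minimal fit-and-inspect sketch for LogisticRegressionModel (illustrative
# only; the variable and column names below are hypothetical):
#
#     model = LogisticRegressionModel()
#     model.fit(X_train, y_train)                  # X_train: pd.DataFrame
#     probs = model.score_model(X_test)            # predicted P(y=1)
#     print(model.get_coef_by_predictor())         # {predictor: coefficient}
#     print(model.compute_variable_importance(X_test))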
class LinearRegressionModel:
    """Wrapper around the LinearRegression class, with additional methods
    implemented such as evaluation (using RMSE), getting a list of
    coefficients, a dictionary of coefficients per predictor, ... for
    convenience.

    Attributes
    ----------
    linear : LinearRegression
        scikit-learn linear regression model.
    predictors : list
        List of predictors used in the model.
    """

    def __init__(self):
        # The `normalize` argument was removed from LinearRegression in
        # scikit-learn 1.2; its former default was False.
        self.linear = LinearRegression(fit_intercept=True)
        self._is_fitted = False
        # placeholder to keep track of a list of predictors
        self.predictors = []
        self._eval_metrics_by_split = {}
    def serialize(self) -> dict:
        """Serialize model as JSON.

        Returns
        -------
        dict
            Dictionary containing the serialized JSON.
        """
        serialized_model = {
            "meta": "linear-regression",
            "predictors": self.predictors,
            "_eval_metrics_by_split": self._eval_metrics_by_split,
            "params": self.linear.get_params()
        }

        if self._is_fitted:
            serialized_model.update({
                "coef_": self.linear.coef_.tolist(),
                # np.ravel ensures a list even when the fitted intercept
                # is a scalar (the case for a 1-D target)
                "intercept_": np.ravel(self.linear.intercept_).tolist()
            })

        return serialized_model
    def deserialize(self, model_dict: dict):
        """Deserialize a model previously stored as JSON.

        Parameters
        ----------
        model_dict : dict
            Serialized JSON file as a dict.

        Raises
        ------
        ValueError
            In case the dict is not a valid serialized model.
        """
        if not self._is_valid_dict(model_dict):
            raise ValueError("No valid serialized model")

        self.linear = LinearRegression()
        self.linear.set_params(**model_dict["params"])
        self.linear.coef_ = np.array(model_dict["coef_"])
        self.linear.intercept_ = np.array(model_dict["intercept_"])
        self._is_fitted = True

        self.predictors = model_dict["predictors"]
        self._eval_metrics_by_split = model_dict["_eval_metrics_by_split"]
    def get_coef(self) -> np.ndarray:
        """Returns the model coefficients.

        Returns
        -------
        np.ndarray
            Array of model coefficients.
        """
        return self.linear.coef_
    def get_intercept(self) -> float:
        """Returns the intercept of the model.

        Returns
        -------
        float
            Intercept of the model.
        """
        # LinearRegression stores a scalar intercept for a 1-D target;
        # np.ravel handles both the scalar and the array case.
        return float(np.ravel(self.linear.intercept_)[0])
    def get_coef_by_predictor(self) -> dict:
        """Returns a dictionary mapping predictor (key) to coefficient
        (value).

        Returns
        -------
        dict
            A map ``{predictor: coefficient}``.
        """
        return dict(zip(self.predictors, self.linear.coef_))
    def fit(self, X_train: pd.DataFrame, y_train: pd.Series):
        """Fit the model.

        Parameters
        ----------
        X_train : pd.DataFrame
            Predictors of train data.
        y_train : pd.Series
            Target of train data.
        """
        self.predictors = list(X_train.columns)
        self.linear.fit(X_train, y_train)
        self._is_fitted = True
    def score_model(self, X: pd.DataFrame) -> np.ndarray:
        """Score a model on a (new) dataset.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset of predictors to score the model.

        Returns
        -------
        np.ndarray
            Score of the model for each observation.
        """
        # We select predictor columns (self.predictors) here to
        # ensure we have the proper predictors and the proper order
        return self.linear.predict(X[self.predictors])
    def evaluate(self, X: pd.DataFrame, y: pd.Series,
                 split: Optional[str] = None,
                 metric: Optional[Callable] = None) -> float:
        """Evaluate the model on a given dataset (X, y). The optional split
        parameter indicates which split the dataset belongs to (train,
        selection, or validation), so that the metric computed on that split
        can be cached.

        Parameters
        ----------
        X : pd.DataFrame
            Dataset containing the predictor values for each observation.
        y : pd.Series
            Dataset containing the target of each observation.
        split : str, optional
            Split name of the dataset (e.g. "train", "selection", or
            "validation").
        metric : Callable, optional
            Function that computes an evaluation metric to evaluate the
            model's performance, instead of the default metric (RMSE).
            The function should accept y_true and y_pred arguments.
            Metric functions from scikit-learn can be used,
            see https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.

        Returns
        -------
        float
            The performance score of the model (RMSE by default).
        """
        if metric is not None:  # decouple from _eval_metrics_by_split attribute
            y_pred = self.score_model(X)
            return metric(y_true=y, y_pred=y_pred)

        if (split is None) or (split not in self._eval_metrics_by_split):
            y_pred = self.score_model(X)
            performance = np.sqrt(mean_squared_error(y_true=y, y_pred=y_pred))

            if split is None:
                return performance

            self._eval_metrics_by_split[split] = performance

        return self._eval_metrics_by_split[split]
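    # Illustrative call patterns (hypothetical variable names; any
    # scikit-learn regression metric taking y_true and y_pred works):
    #
    #     rmse = model.evaluate(X_val, y_val, split="validation")  # cached
    #     from sklearn.metrics import mean_absolute_error
    #     mae = model.evaluate(X_val, y_val, metric=mean_absolute_error)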
    def compute_variable_importance(self, data: pd.DataFrame) -> pd.DataFrame:
        """Compute the importance of each predictor in the model and return
        it as a DataFrame.

        Parameters
        ----------
        data : pd.DataFrame
            Data to score the model.

        Returns
        -------
        pd.DataFrame
            DataFrame containing columns predictor and importance.
        """
        y_pred = self.score_model(data)

        importance_by_variable = {
            utils.clean_predictor_name(predictor): stats.pearsonr(
                data[predictor], y_pred
            )[0]
            for predictor in self.predictors
        }

        df = pd.DataFrame.from_dict(importance_by_variable,
                                    orient="index").reset_index()
        df.columns = ["predictor", "importance"]

        return (df.sort_values(by="importance", ascending=False)
                .reset_index(drop=True))
    def _is_valid_dict(self, model_dict: dict) -> bool:
        if ("meta" not in model_dict
                or model_dict["meta"] != "linear-regression"):
            return False

        attr = ["coef_", "intercept_", "predictors"]
        for key in attr:
            if key not in model_dict or not isinstance(model_dict[key], list):
                return False

        if ("params" not in model_dict
                or "_eval_metrics_by_split" not in model_dict):
            return False

        return True
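
# A runnable demonstration sketch (not part of the original module). The
# synthetic data, column names, and seed below are made up for illustration;
# the "_enc" suffix mimics cobra's preprocessed-column naming convention.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    X = pd.DataFrame({"x1_enc": rng.uniform(size=200),
                      "x2_enc": rng.uniform(size=200)})

    # Linear regression: y is a noisy linear combination of the predictors.
    y_lin = pd.Series(3.0 * X["x1_enc"] - 1.5 * X["x2_enc"]
                      + rng.normal(scale=0.1, size=200))
    lin_model = LinearRegressionModel()
    lin_model.fit(X, y_lin)
    print("RMSE:", lin_model.evaluate(X, y_lin, split="train"))
    print(lin_model.compute_variable_importance(X))

    # Logistic regression: binary target derived from one predictor.
    y_bin = pd.Series((X["x1_enc"] > 0.5).astype(int))
    log_model = LogisticRegressionModel()
    log_model.fit(X, y_bin)
    print("AUC:", log_model.evaluate(X, y_bin, split="train"))

    # JSON round-trip: the restored model should score identically.
    restored = LogisticRegressionModel()
    restored.deserialize(log_model.serialize())
    assert np.allclose(log_model.score_model(X), restored.score_model(X))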