# Source code for cobra.evaluation.plotting_utils


# third party imports
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

def plot_univariate_predictor_quality(df_metric: pd.DataFrame,
                                      dim: tuple=(12, 8),
                                      path: str=None):
    """Plot univariate quality of the predictors.

    Parameters
    ----------
    df_metric : pd.DataFrame
        DataFrame containing for each variable the train AUC or RMSE and
        test AUC or RMSE along with a boolean indicating whether or not it
        is selected based on the criteria.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure.

    Raises
    ------
    ValueError
        If ``df_metric`` contains neither an "AUC selection" nor an
        "RMSE selection" column.
    """
    if "AUC selection" in df_metric.columns:
        metric = "AUC"
        ascending = False  # higher AUC is better
    elif "RMSE selection" in df_metric.columns:
        metric = "RMSE"
        ascending = True  # lower RMSE is better
    else:
        # Fail fast with a clear message instead of hitting a NameError
        # on `metric` below.
        raise ValueError("df_metric must contain either an 'AUC selection' "
                         "or an 'RMSE selection' column")

    df = (df_metric[df_metric["preselection"]]
          .sort_values(by=metric + " selection", ascending=ascending))

    # Reshape to long format so seaborn can draw one bar per split.
    df = pd.melt(df, id_vars=["predictor"],
                 value_vars=[metric + " train", metric + " selection"],
                 var_name="split",
                 value_name=metric)

    # plot data
    with plt.style.context("seaborn-ticks"):
        fig, ax = plt.subplots(figsize=dim)

        ax = sns.barplot(x=metric, y="predictor", hue="split", data=df)
        ax.set_title("Univariate Quality of Predictors")

        # Set pretty axis
        sns.despine(ax=ax, right=True)

        # Remove white lines from the second axis
        ax.grid(False)

        if path is not None:
            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

        plt.show()
def plot_correlation_matrix(df_corr: pd.DataFrame,
                            dim: tuple=(12, 8),
                            path: str=None):
    """Render a heatmap of the correlation matrix amongst the predictors.

    Parameters
    ----------
    df_corr : pd.DataFrame
        Correlation matrix.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure.
    """
    # One figure with a single axes, sized to the requested dimensions.
    figure, heatmap_ax = plt.subplots(figsize=dim)

    heatmap_ax = sns.heatmap(df_corr, cmap='Blues')
    heatmap_ax.set_title('Correlation Matrix')

    # Persist the figure only when an output path was supplied.
    if path is not None:
        plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

    plt.show()
def plot_performance_curves(model_performance: pd.DataFrame,
                            dim: tuple=(12, 8),
                            path: str=None,
                            colors: dict=None,
                            metric_name: str=None):
    """Plot performance curves generated by the forward feature selection
    for the train-selection-validation sets.

    Parameters
    ----------
    model_performance : pd.DataFrame
        Contains train-selection-validation performance for each model
        trained in the forward feature selection.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure.
    colors : dict, optional
        Map with colors for train-selection-validation curves. Defaults to
        {"train": "#0099bf", "selection": "#ff9500",
        "validation": "#8064a2"}.
    metric_name : str, optional
        Name to indicate the metric used in model_performance.
        Defaults to RMSE in case of regression and AUC in case of
        classification.
    """
    # Avoid a mutable (dict) default argument; resolve the default palette
    # inside the function body instead.
    if colors is None:
        colors = {"train": "#0099bf",
                  "selection": "#ff9500",
                  "validation": "#8064a2"}

    model_type = model_performance["model_type"][0]

    if metric_name is None:
        if model_type == "classification":
            metric_name = "AUC"
        elif model_type == "regression":
            metric_name = "RMSE"

    # Highest value across the three splits, rounded to one decimal — used
    # to scale the y-axis below.
    max_metric = np.round(max(max(model_performance['train_performance']),
                              max(model_performance['selection_performance']),
                              max(model_performance['validation_performance'])),
                          1)

    with plt.style.context("seaborn-whitegrid"):
        fig, ax = plt.subplots(figsize=dim)

        plt.plot(model_performance['train_performance'], marker=".",
                 markersize=20, linewidth=3, label="train",
                 color=colors["train"])
        plt.plot(model_performance['selection_performance'], marker=".",
                 markersize=20, linewidth=3, label="selection",
                 color=colors["selection"])
        plt.plot(model_performance['validation_performance'], marker=".",
                 markersize=20, linewidth=3, label="validation",
                 color=colors["validation"])

        # Set x- and y-ticks: one tick per model, labelled with the
        # predictor added at that step.
        ax.set_xticks(np.arange(len(model_performance['last_added_predictor'])))
        ax.set_xticklabels(model_performance['last_added_predictor'].tolist(),
                           rotation=40, ha='right')

        if model_type == "classification":
            ax.set_yticks(np.arange(0.5, max_metric + 0.02, 0.05))
        elif model_type == "regression":
            # In regression, the scale of the y-axis can largely vary
            # depending on the dataset, it is easier to just set the y-axis
            # bounds, but not the tick distance.
            ax.set_ylim(0, max_metric*1.1)

        # Make pretty
        ax.legend(loc='lower right')
        fig.suptitle('Performance curves forward feature selection',
                     fontsize=20)
        plt.title("Metric: "+metric_name, fontsize=15, loc="left")
        plt.ylabel('Model performance')

        if path is not None:
            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

        plt.show()
def plot_variable_importance(df_variable_importance: pd.DataFrame,
                             title: str=None,
                             dim: tuple=(12, 8),
                             path: str=None):
    """Plot variable importance of a given model as a horizontal bar chart.

    Parameters
    ----------
    df_variable_importance : pd.DataFrame
        DataFrame containing columns predictor and importance.
    title : str, optional
        Title of the plot.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure.
    """
    # Fall back to a generic title when none was provided.
    chart_title = title if title else "Variable importance"

    with plt.style.context("seaborn-ticks"):
        fig, bar_ax = plt.subplots(figsize=dim)

        bar_ax = sns.barplot(x="importance", y="predictor",
                             data=df_variable_importance,
                             color="cornflowerblue")
        bar_ax.set_title(chart_title)

        # Set Axis - make them pretty
        sns.despine(ax=bar_ax, right=True)

        # Remove white lines from the second axis
        bar_ax.grid(False)

        if path is not None:
            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

        plt.show()