Source code for cobra.evaluation.plotting_utils
# third party imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def plot_univariate_predictor_quality(df_metric: pd.DataFrame,
                                      dim: tuple=(12, 8),
                                      path: str=None):
    """Plot univariate quality of the predictors.

    Draws a horizontal grouped bar chart showing, for every preselected
    predictor, its metric value on the train and on the selection set.

    Parameters
    ----------
    df_metric : pd.DataFrame
        DataFrame with one row per predictor. The following columns are
        read: ``predictor``, ``preselection`` (boolean mask of the rows to
        plot) and either ``AUC train``/``AUC selection`` or
        ``RMSE train``/``RMSE selection``.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure (written as a 300 dpi PNG).

    Raises
    ------
    ValueError
        If ``df_metric`` contains neither an ``AUC selection`` nor an
        ``RMSE selection`` column.
    """
    # The metric is detected from the column names. AUC is sorted from high
    # to low and RMSE from low to high.
    if "AUC selection" in df_metric.columns:
        metric, ascending = "AUC", False
    elif "RMSE selection" in df_metric.columns:
        metric, ascending = "RMSE", True
    else:
        raise ValueError("df_metric must contain either an 'AUC selection' "
                         "or an 'RMSE selection' column.")

    # Only keep the predictors flagged by the boolean preselection column.
    df = (df_metric[df_metric["preselection"]]
          .sort_values(by=metric + " selection", ascending=ascending))

    # Long format: one row per (predictor, split) so seaborn can group by hue.
    df = pd.melt(df, id_vars=["predictor"],
                 value_vars=[metric + " train", metric + " selection"],
                 var_name="split",
                 value_name=metric)

    # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and 3.8 removed the old names; fall back to the new name when the
    # legacy one is not available.
    style = "seaborn-ticks"
    if style not in plt.style.available:
        style = "seaborn-v0_8-ticks"

    # plot data
    with plt.style.context(style):
        fig, ax = plt.subplots(figsize=dim)

        ax = sns.barplot(x=metric, y="predictor", hue="split", data=df)
        ax.set_title("Univariate Quality of Predictors")

        # Set pretty axis
        sns.despine(ax=ax, right=True)

        # Remove white lines from the second axis
        ax.grid(False)

        if path is not None:
            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

        plt.show()
def plot_correlation_matrix(df_corr: pd.DataFrame,
                            dim: tuple=(12, 8),
                            path: str=None):
    """Render the correlation matrix of the predictors as a heatmap.

    Parameters
    ----------
    df_corr : pd.DataFrame
        Correlation matrix.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure. When given, the figure is written as a
        300 dpi PNG before it is shown.
    """
    # Open a new figure of the requested size before drawing.
    plt.subplots(figsize=dim)

    heatmap_axes = sns.heatmap(df_corr, cmap="Blues")
    heatmap_axes.set_title("Correlation Matrix")

    save_requested = path is not None
    if save_requested:
        plt.savefig(path, dpi=300, format="png", bbox_inches="tight")

    plt.show()
def plot_performance_curves(model_performance: pd.DataFrame,
                            dim: tuple=(12, 8),
                            path: str=None,
                            colors: dict=None,
                            metric_name: str=None):
    """Plot performance curves generated by the forward feature selection
    for the train-selection-validation sets.

    Parameters
    ----------
    model_performance : pd.DataFrame
        Contains train-selection-validation performance for each model trained
        in the forward feature selection. The following columns are read:
        ``model_type``, ``last_added_predictor``, ``train_performance``,
        ``selection_performance`` and ``validation_performance``.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure (written as a 300 dpi PNG).
    colors : dict, optional
        Map with colors for train-selection-validation curves. Keys that are
        left out fall back to the defaults: ``"train": "#0099bf"``,
        ``"selection": "#ff9500"`` and ``"validation": "#8064a2"``.
    metric_name : str, optional
        Name to indicate the metric used in model_performance.
        Defaults to RMSE in case of regression and AUC in case of
        classification.

    Raises
    ------
    ValueError
        If ``model_performance`` is empty, or if ``metric_name`` is not given
        and the model type is neither "classification" nor "regression".
    """
    if model_performance.empty:
        raise ValueError("model_performance is empty; there is nothing to "
                         "plot.")

    # Positional access: a label lookup with [0] fails as soon as the frame
    # does not carry a default 0..n-1 index (e.g. after filtering).
    model_type = model_performance["model_type"].iloc[0]

    if metric_name is None:
        if model_type == "classification":
            metric_name = "AUC"
        elif model_type == "regression":
            metric_name = "RMSE"
        else:
            raise ValueError("metric_name must be given when model_type is "
                             "neither 'classification' nor 'regression' "
                             "(got {!r}).".format(model_type))

    # Built per call (no shared mutable default); user colors override the
    # defaults, so a partial mapping is accepted as well.
    palette = {"train": "#0099bf",
               "selection": "#ff9500",
               "validation": "#8064a2"}
    if colors:
        palette.update(colors)

    splits = ("train", "selection", "validation")
    curves = {split: model_performance[split + "_performance"].to_numpy()
              for split in splits}
    max_metric = max(curve.max() for curve in curves.values())

    # Explicit x positions keep the curves aligned with the tick positions
    # below, whatever index the DataFrame carries.
    positions = np.arange(len(model_performance))

    # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and 3.8 removed the old names; fall back to the new name when the
    # legacy one is not available.
    style = "seaborn-whitegrid"
    if style not in plt.style.available:
        style = "seaborn-v0_8-whitegrid"

    with plt.style.context(style):
        fig, ax = plt.subplots(figsize=dim)

        for split in splits:
            ax.plot(positions, curves[split], marker=".",
                    markersize=20, linewidth=3, label=split,
                    color=palette[split])

        # Set x- and y-ticks
        ax.set_xticks(positions)
        ax.set_xticklabels(model_performance['last_added_predictor'].tolist(),
                           rotation=40, ha='right')

        if model_type == "classification":
            ax.set_yticks(np.arange(0.5, np.round(max_metric, 1) + 0.02,
                                    0.05))
        elif model_type == "regression" and max_metric > 0:
            # In regression, the scale of the y-axis can largely vary
            # depending on the dataset, it is easier to just set the y-axis
            # bounds, but not the tick distance. The unrounded maximum is
            # used: rounding to one decimal can round down and cut off the
            # top of the curves (or collapse the axis to 0 for small values).
            ax.set_ylim(0, max_metric * 1.1)

        # Make pretty
        ax.legend(loc='lower right')
        fig.suptitle('Performance curves forward feature selection',
                     fontsize=20)
        ax.set_title("Metric: " + metric_name, fontsize=15, loc="left")
        ax.set_ylabel('Model performance')

        if path is not None:
            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

        plt.show()
def plot_variable_importance(df_variable_importance: pd.DataFrame,
                             title: str=None,
                             dim: tuple=(12, 8),
                             path: str=None):
    """Plot variable importance of a given model.

    Draws a horizontal bar chart with one bar per predictor.

    Parameters
    ----------
    df_variable_importance : pd.DataFrame
        DataFrame containing columns predictor and importance.
    title : str, optional
        Title of the plot. When omitted (or empty), "Variable importance"
        is used.
    dim : tuple, optional
        Width and length of the plot.
    path : str, optional
        Path to store the figure (written as a 300 dpi PNG).
    """
    # matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and 3.8 removed the old names; fall back to the new name when the
    # legacy one is not available.
    style = "seaborn-ticks"
    if style not in plt.style.available:
        style = "seaborn-v0_8-ticks"

    with plt.style.context(style):
        fig, ax = plt.subplots(figsize=dim)

        ax = sns.barplot(x="importance", y="predictor",
                         data=df_variable_importance,
                         color="cornflowerblue")

        ax.set_title(title if title else "Variable importance")

        # Set Axis - make them pretty
        sns.despine(ax=ax, right=True)

        # Remove white lines from the second axis
        ax.grid(False)

        if path is not None:
            plt.savefig(path, format="png", dpi=300, bbox_inches="tight")

        plt.show()