From 7fa2aa629aaba1fc57b0831eb2d7fc7fdecf7fde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Franz=20Kir=C3=A1ly?=
Date: Sun, 27 Aug 2023 02:12:54 +0100
Subject: [PATCH] [ENH] diagnostic plots part 1 - cross-plots (#46)

Moves the diagnostic plots to the new interface. Part 1 - cross-plots.
---
 examples/utils.py               |  51 -------
 skpro/utils/plotting.py         | 236 ++++++++++++++++++++++++++++++++
 skpro/utils/tests/__init__.py   |   2 +
 skpro/utils/tests/test_plots.py |  87 ++++++++++++
 4 files changed, 325 insertions(+), 51 deletions(-)
 delete mode 100644 examples/utils.py
 create mode 100644 skpro/utils/plotting.py
 create mode 100644 skpro/utils/tests/__init__.py
 create mode 100644 skpro/utils/tests/test_plots.py

diff --git a/examples/utils.py b/examples/utils.py
deleted file mode 100644
index 257dbd7f4..000000000
--- a/examples/utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# -*- coding: utf-8 -*-
-import numpy as np
-from matplotlib import pyplot
-
-from skpro.metrics import log_loss
-
-
-def plot_performance(y_test, y_pred, filename=None):
-    """
-    Visualises the prediction performance
-
-    Parameters
-    ----------
-    y_test    Ground truth
-    y_pred    Predictions
-    filename  If string, figure will be saved to file
-
-    Returns
-    -------
-    Matplotlib plot
-    """
-
-    fig, ax1 = pyplot.subplots()
-
-    ax1.plot(y_test, y_test, "g.", label="Optimum")
-    sigma = np.std(y_pred)
-    ax1.errorbar(
-        y_test,
-        y_pred.point(),
-        yerr=sigma,
-        label="Predictions",
-        fmt="b.",
-        ecolor="r",
-        linewidth=0.5,
-    )
-    ax1.set_ylabel("Predicted $y_{pred}$")
-    ax1.set_xlabel("Correct label $y_{true}$")
-    ax1.legend(loc="best")
-
-    losses = log_loss(y_test, y_pred, sample=False)
-    ax2 = ax1.twinx()
-    overall = "{0:.2f}".format(np.mean(losses)) + " +/- {0:.2f}".format(np.std(losses))
-    ax2.set_ylabel("Loss")
-    ax2.plot(y_test, losses, "y_", label="Loss: " + overall)
-    ax2.tick_params(colors="y")
-    ax2.legend(loc=1)
-
-    if not isinstance(filename, str):
-        pyplot.show()
-    else:
-        pyplot.savefig(filename, transparent=True, bbox_inches="tight")
diff --git a/skpro/utils/plotting.py b/skpro/utils/plotting.py
new file mode 100644
index 000000000..4d89ef22e
--- /dev/null
+++ b/skpro/utils/plotting.py
@@ -0,0 +1,236 @@
+# -*- coding: utf-8 -*-
+"""Utility functions for plotting."""
+import numpy as np
+import pandas as pd
+
+from skpro.utils.validation._dependencies import _check_soft_dependencies
+
+__authors__ = ["fkiraly", "frthjf"]
+
+
+def plot_crossplot_interval(y_true, y_pred, coverage=None, ax=None):
+    """Probabilistic cross-plot for regression, truth vs prediction interval.
+
+    Plots:
+
+    * x-axis: ground truth value
+    * y-axis: median predictive value, with error bars being
+      the prediction interval at symmetric coverage ``coverage``
+
+    Parameters
+    ----------
+    y_true : array-like, [n_samples, n_targets]
+        Ground truth values
+    y_pred : skpro distribution, or predict_interval return, [n_samples, n_targets]
+        Predicted values.
+        If a distribution, symmetric prediction intervals at coverage ``coverage``
+        are obtained from ``y_pred`` via its quantiles.
+    coverage : float, optional, default=0.9
+        Coverage of the prediction interval.
+        Used only if ``y_pred`` is a distribution.
+    ax : matplotlib axes, optional
+        Axes to plot on, if None, a new figure is created and returned
+
+    Returns
+    -------
+    ax : matplotlib axes
+        Axes containing the plot
+        If ax was None, a new figure is created and returned
+        If ax was not None, the same ax is returned with plot added
+
+    Example
+    -------
+    >>> from skpro.utils.plotting import plot_crossplot_interval  # doctest: +SKIP
+    >>> from skpro.regression.residual import ResidualDouble  # doctest: +SKIP
+    >>> from sklearn.ensemble import RandomForestRegressor  # doctest: +SKIP
+    >>> from sklearn.linear_model import LinearRegression  # doctest: +SKIP
+    >>> from sklearn.datasets import load_diabetes  # doctest: +SKIP
+    >>>
+    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)  # doctest: +SKIP
+    >>> reg_mean = LinearRegression()  # doctest: +SKIP
+    >>> reg_resid = RandomForestRegressor()  # doctest: +SKIP
+    >>> reg_proba = ResidualDouble(reg_mean, reg_resid)  # doctest: +SKIP
+    >>>
+    >>> reg_proba.fit(X, y)  # doctest: +SKIP
+    ResidualDouble(...)
+    >>> y_pred = reg_proba.predict_proba(X)  # doctest: +SKIP
+    >>> plot_crossplot_interval(y, y_pred)  # doctest: +SKIP
+    """
+    _check_soft_dependencies("matplotlib")
+
+    from matplotlib import pyplot
+
+    if hasattr(y_pred, "quantile"):
+        if coverage is None:
+            coverage = 0.9
+        quantile_pts = [0.5 - coverage / 2, 0.5, 0.5 + coverage / 2]
+        y_quantiles = y_pred.quantile(quantile_pts)
+        y_mid = y_quantiles.iloc[:, 1]
+        y_quantiles = y_quantiles.iloc[:, [0, 2]]
+    else:
+        y_quantiles = y_pred
+        y_mid = y_quantiles.mean(axis=1)
+
+    y_mid_two = pd.DataFrame([y_mid, y_mid]).values
+    y_quantiles_np = y_quantiles.values.T
+    y_bars = np.abs(y_mid_two - y_quantiles_np)
+
+    if ax is None:
+        _, ax = pyplot.subplots()
+
+    ax.plot(y_true, y_true, "g.", label="Optimum")
+    ax.errorbar(
+        y_true.values,
+        y_mid,
+        yerr=y_bars,
+        label="Predictions",
+        fmt="b.",
+        ecolor="r",
+        linewidth=0.5,
+    )
+    ax.set_ylabel(r"Prediction interval $\widehat{y}_i$")
+    ax.set_xlabel(r"Correct label $y_i$")
+    ax.legend(loc="best")
+
+    return ax
+
+
+def plot_crossplot_std(y_true, y_pred, ax=None):
+    r"""Probabilistic cross-plot for regression, error vs predictive standard deviation.
+
+    Plots:
+
+    * x-axis: absolute error samples $|y_i - \widehat{y}_i.\mu|$
+    * y-axis: predictive standard deviation $\widehat{y}_i.\sigma$,
+      of the prediction $\widehat{y}_i$ corresponding to $y_i$
+
+    Parameters
+    ----------
+    y_true : array-like, [n_samples, n_targets]
+        Ground truth values
+    y_pred : skpro distribution, or predict_var return, [n_samples, n_targets]
+        Predicted values
+    ax : matplotlib axes, optional
+        Axes to plot on, if None, a new figure is created and returned
+
+    Returns
+    -------
+    ax : matplotlib axes
+        Axes containing the plot
+        If ax was None, a new figure is created and returned
+        If ax was not None, the same ax is returned with plot added
+
+    Example
+    -------
+    >>> from skpro.utils.plotting import plot_crossplot_std  # doctest: +SKIP
+    >>> from skpro.regression.residual import ResidualDouble  # doctest: +SKIP
+    >>> from sklearn.ensemble import RandomForestRegressor  # doctest: +SKIP
+    >>> from sklearn.linear_model import LinearRegression  # doctest: +SKIP
+    >>> from sklearn.datasets import load_diabetes  # doctest: +SKIP
+    >>>
+    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)  # doctest: +SKIP
+    >>> reg_mean = LinearRegression()  # doctest: +SKIP
+    >>> reg_resid = RandomForestRegressor()  # doctest: +SKIP
+    >>> reg_proba = ResidualDouble(reg_mean, reg_resid)  # doctest: +SKIP
+    >>>
+    >>> reg_proba.fit(X, y)  # doctest: +SKIP
+    ResidualDouble(...)
+    >>> y_pred = reg_proba.predict_proba(X)  # doctest: +SKIP
+    >>> plot_crossplot_std(y, y_pred)  # doctest: +SKIP
+    """
+    _check_soft_dependencies("matplotlib")
+
+    from matplotlib import pyplot
+
+    if hasattr(y_pred, "_tags"):
+        y_var = y_pred.var()
+
+    y_std = np.sqrt(y_var)
+
+    if ax is None:
+        _, ax = pyplot.subplots()
+
+    ax.plot(
+        np.abs(y_pred.mean().values.flatten() - y_true.values.flatten()),
+        y_std.values.flatten(),
+        "b.",
+    )
+    ax.set_ylabel(r"Predictive standard deviation of $\widehat{y}_i$")
+    ax.set_xlabel(r"Absolute errors $|y_i - \widehat{y}_i|$")
+    # ax.legend(loc="best")
+
+    return ax
+
+
+def plot_crossplot_loss(y_true, y_pred, metric, ax=None):
+    r"""Cross-loss-plot for probabilistic regression.
+
+    Plots:
+
+    * x-axis: ground truth values $y_i$
+    * y-axis: loss of the prediction $\widehat{y}_i$ corresponding to $y_i$,
+      as calculated by ``metric.evaluate_by_index``
+
+    Parameters
+    ----------
+    y_true : array-like, [n_samples, n_targets]
+        Ground truth values
+    y_pred : skpro distribution, or predict_var return, [n_samples, n_targets]
+        Predicted values
+    metric : skpro metric
+        Metric to calculate the loss
+    ax : matplotlib axes, optional
+        Axes to plot on, if None, a new figure is created and returned
+
+    Returns
+    -------
+    ax : matplotlib axes
+        Axes containing the plot
+        If ax was None, a new figure is created and returned
+        If ax was not None, the same ax is returned with plot added
+
+    Example
+    -------
+    >>> from skpro.utils.plotting import plot_crossplot_loss  # doctest: +SKIP
+    >>> from skpro.metrics import CRPS  # doctest: +SKIP
+    >>> from skpro.regression.residual import ResidualDouble  # doctest: +SKIP
+    >>> from sklearn.ensemble import RandomForestRegressor  # doctest: +SKIP
+    >>> from sklearn.linear_model import LinearRegression  # doctest: +SKIP
+    >>> from sklearn.datasets import load_diabetes  # doctest: +SKIP
+    >>>
+    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)  # doctest: +SKIP
+    >>> reg_mean = LinearRegression()  # doctest: +SKIP
+    >>> reg_resid = RandomForestRegressor()  # doctest: +SKIP
+    >>> reg_proba = ResidualDouble(reg_mean, reg_resid)  # doctest: +SKIP
+    >>>
+    >>> reg_proba.fit(X, y)  # doctest: +SKIP
+    ResidualDouble(...)
+    >>> y_pred = reg_proba.predict_proba(X)  # doctest: +SKIP
+    >>> crps_metric = CRPS()  # doctest: +SKIP
+    >>> plot_crossplot_loss(y, y_pred, crps_metric)  # doctest: +SKIP
+    """
+    _check_soft_dependencies("matplotlib")
+
+    from matplotlib import pyplot
+
+    losses = metric.evaluate_by_index(y_true, y_pred)
+    loss_vals = losses.values.flatten()
+    total_loss = np.mean(loss_vals).round(2)
+    total_loss_std = np.std(loss_vals) / np.sqrt(len(loss_vals))
+    total_loss_std = total_loss_std.round(2)
+
+    overall = f"{total_loss} +/- {total_loss_std} sterr of mean"
+
+    if ax is None:
+        _, ax = pyplot.subplots()
+
+    ax.plot(y_true, losses, "y_")
+
+    ax.set_title(f"mean {metric.name}: {overall}")
+
+    ax.set_xlabel(r"Correct label $y_i$")
+    ax.set_ylabel(metric.name + r"($y_i$, $\widehat{y}_i$)")
+
+    ax.tick_params(colors="y")
+
+    return ax
diff --git a/skpro/utils/tests/__init__.py b/skpro/utils/tests/__init__.py
new file mode 100644
index 000000000..165478ec6
--- /dev/null
+++ b/skpro/utils/tests/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Tests for utilities."""
diff --git a/skpro/utils/tests/test_plots.py b/skpro/utils/tests/test_plots.py
new file mode 100644
index 000000000..68b41c2a5
--- /dev/null
+++ b/skpro/utils/tests/test_plots.py
@@ -0,0 +1,87 @@
+# -*- coding: utf-8 -*-
+"""Test functionality of probabilistic regression plotting functions."""
+# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)
+
+import pytest
+
+from skpro.utils.validation._dependencies import _check_soft_dependencies
+
+
+@pytest.mark.skipif(
+    not _check_soft_dependencies("matplotlib", severity="none"),
+    reason="skip test if required soft dependency for matplotlib not available",
+)
+def test_plot_crossplot_interval():
+    """Test that plot_crossplot_interval runs without error."""
+    _check_soft_dependencies("matplotlib")
+
+    from sklearn.datasets import load_diabetes
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.linear_model import LinearRegression
+
+    from skpro.regression.residual import ResidualDouble
+    from skpro.utils.plotting import plot_crossplot_interval
+
+    X, y = load_diabetes(return_X_y=True, as_frame=True)
+    reg_mean = LinearRegression()
+    reg_resid = RandomForestRegressor()
+    reg_proba = ResidualDouble(reg_mean, reg_resid)
+
+    reg_proba.fit(X, y)
+    y_pred = reg_proba.predict_proba(X)
+
+    plot_crossplot_interval(y, y_pred)
+
+
+@pytest.mark.skipif(
+    not _check_soft_dependencies("matplotlib", severity="none"),
+    reason="skip test if required soft dependency for matplotlib not available",
+)
+def test_plot_crossplot_std():
+    """Test that plot_crossplot_std runs without error."""
+    _check_soft_dependencies("matplotlib")
+
+    from sklearn.datasets import load_diabetes
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.linear_model import LinearRegression
+
+    from skpro.regression.residual import ResidualDouble
+    from skpro.utils.plotting import plot_crossplot_std
+
+    X, y = load_diabetes(return_X_y=True, as_frame=True)
+    reg_mean = LinearRegression()
+    reg_resid = RandomForestRegressor()
+    reg_proba = ResidualDouble(reg_mean, reg_resid)
+
+    reg_proba.fit(X, y)
+    y_pred = reg_proba.predict_proba(X)
+
+    plot_crossplot_std(y, y_pred)
+
+
+@pytest.mark.skipif(
+    not _check_soft_dependencies("matplotlib", severity="none"),
+    reason="skip test if required soft dependency for matplotlib not available",
)
+def test_plot_crossplot_loss():
+    """Test that plot_crossplot_loss runs without error."""
+    _check_soft_dependencies("matplotlib")
+
+    from sklearn.datasets import load_diabetes
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.linear_model import LinearRegression
+
+    from skpro.metrics import CRPS
+    from skpro.regression.residual import ResidualDouble
+    from skpro.utils.plotting import plot_crossplot_loss
+
+    X, y = load_diabetes(return_X_y=True, as_frame=True)
+    reg_mean = LinearRegression()
+    reg_resid = RandomForestRegressor()
+    reg_proba = ResidualDouble(reg_mean, reg_resid)
+
+    reg_proba.fit(X, y)
+    y_pred = reg_proba.predict_proba(X)
+
+    crps_metric = CRPS()
+    plot_crossplot_loss(y, y_pred, crps_metric)
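
Usage sketch (not part of the patch itself): the snippet below mirrors the doctest examples in the docstrings above and additionally combines the three new cross-plots on one figure via the documented ``ax`` argument. It assumes skpro with the matplotlib soft dependency and scikit-learn installed, and that ``predict_proba`` returns an skpro distribution object, as in the examples.

    from matplotlib import pyplot as plt
    from sklearn.datasets import load_diabetes
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import LinearRegression

    from skpro.metrics import CRPS
    from skpro.regression.residual import ResidualDouble
    from skpro.utils.plotting import (
        plot_crossplot_interval,
        plot_crossplot_loss,
        plot_crossplot_std,
    )

    # fit a probabilistic regressor and obtain a distribution prediction,
    # exactly as in the docstring examples of the patch
    X, y = load_diabetes(return_X_y=True, as_frame=True)
    reg_proba = ResidualDouble(LinearRegression(), RandomForestRegressor())
    reg_proba.fit(X, y)
    y_pred = reg_proba.predict_proba(X)

    # draw the three diagnostic cross-plots side by side, passing
    # pre-created axes through the ax parameter of each function
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4))
    plot_crossplot_interval(y, y_pred, coverage=0.9, ax=ax1)
    plot_crossplot_std(y, y_pred, ax=ax2)
    plot_crossplot_loss(y, y_pred, CRPS(), ax=ax3)
    fig.tight_layout()
    plt.show()

Since each function returns the axes it drew on, the plots can also be styled further or saved through the parent figure afterwards.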