From 3acc3d6edf41a1eeabc2dcc1832b95cced51b4ea Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Wed, 13 Nov 2024 14:35:44 -0800 Subject: [PATCH 01/13] Implemented a simple sqrt for the scale of residual in residualdouble. --- skpro/regression/residual.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 143c76eb..1ea2e070 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import warnings from sklearn import clone from skpro.regression.base import BaseProbaRegressor @@ -229,6 +230,9 @@ def _fit(self, X, y): resids = (y - y_pred) ** 2 else: resids = residual_trafo.fit_transform(y - y_pred) + warnings.warn( + "Arbitrary transforms are not compatible with the predict_proba method." + ) resids = flatten_to_1D_if_colvector(resids) @@ -295,6 +299,7 @@ def _predict_proba(self, X): est = self.estimator_ est_r = self.estimator_resid_ use_y_pred = self.use_y_pred + residual_trafo = self.residual_trafo distr_type = self.distr_type distr_loc_scale_name = self.distr_loc_scale_name distr_params = self.distr_params @@ -325,6 +330,14 @@ def _predict_proba(self, X): X_r = prep_skl_df(X_r, copy_df=True) y_pred_scale = est_r.predict(X_r) + if residual_trafo == "absolute": + pass + elif residual_trafo == "squared": + y_pred_scale = np.sqrt(y_pred_scale) + else: + raise NotImplementedError( + f"residual_trafo {residual_trafo} not implemented" + ) y_pred_scale = y_pred_scale.clip(min=min_scale) y_pred_scale = y_pred_scale.reshape(-1, n_cols) From b1c2a42eef40663ecf3a55baa9f48e50825ae307 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Wed, 13 Nov 2024 20:54:17 -0800 Subject: [PATCH 02/13] method of moments corrections for ResidualDouble scale parameter estimates. 
--- skpro/regression/residual.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 1ea2e070..f953794f 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -7,12 +7,28 @@ import pandas as pd import warnings from sklearn import clone +from scipy.special import gamma from skpro.regression.base import BaseProbaRegressor from skpro.utils.numpy import flatten_to_1D_if_colvector from skpro.utils.sklearn import prep_skl_df +def half_t_correction(dof: float) -> float: + """Expected value of absolute value of t-distributed variable with mu=0 sigma=1. + + For X ~ t(dof, 0, sigma), the expected value of the absolute value is + ``2 * sigma * sqrt(dof) * gamma((dof + 1) / 2) / (sqrt(pi) * (dof - 1) * gamma(dof / 2))``. + So E[|X|] / half_t_correction(dof) is an estimate of sigma. + """ + return ( + 2 + * np.sqrt(dof) + * gamma((dof + 1) / 2) + / (np.sqrt(np.pi) * (dof - 1) * gamma(dof / 2)) + ) + + class ResidualDouble(BaseProbaRegressor): """Residual double regressor. 
@@ -348,17 +364,32 @@ def _predict_proba(self, X): distr_type = Normal distr_loc_scale_name = ("mu", "sigma") + if residual_trafo == "absolute": + y_pred_scale = y_pred_scale / np.sqrt(2 / np.pi) elif distr_type == "Laplace": from skpro.distributions.laplace import Laplace distr_type = Laplace distr_loc_scale_name = ("mu", "scale") + if residual_trafo == "absolute": + y_pred_scale = y_pred_scale / np.sqrt(2.0) elif distr_type in ["Cauchy", "t"]: from skpro.distributions.t import TDistribution distr_type = TDistribution distr_loc_scale_name = ("mu", "sigma") - + if "df" not in distr_params or distr_params["df"] <= 2: + raise ValueError("Degrees of freedom must be greater than 2 for t-distribution.") + # Extract degrees of freedom + df = distr_params["df"] + if residual_trafo == "absolute": + y_pred_scale = y_pred_scale / half_t_correction(df) + elif residual_trafo == "squared": + y_pred_scale = y_pred_scale / np.sqrt(df / (df - 2)) + else: + raise NotImplementedError( + f"distr_type {distr_type} not implemented" + ) # collate all parameters for the distribution constructor # distribution params, if passed params = distr_params From 8d581d2193a33d99145569ec40e898d1daa65d68 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Wed, 13 Nov 2024 21:19:52 -0800 Subject: [PATCH 03/13] Fixed a bug in scale normalization for laplace in ResidualDouble. 
--- skpro/regression/residual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index f953794f..b8a59c4c 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -371,7 +371,7 @@ def _predict_proba(self, X): distr_type = Laplace distr_loc_scale_name = ("mu", "scale") - if residual_trafo == "absolute": + if residual_trafo == "squared": y_pred_scale = y_pred_scale / np.sqrt(2.0) elif distr_type in ["Cauchy", "t"]: from skpro.distributions.t import TDistribution From f4f73bfb253ed790558ce71a78d2d39b9420a87f Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Thu, 14 Nov 2024 16:21:57 -0800 Subject: [PATCH 04/13] added sample weight to fit method for BaseProbaRegressor and ResidualDouble --- skpro/regression/base/_base.py | 47 ++++++++++++++++++++++++++-------- skpro/regression/residual.py | 17 ++++++++---- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/skpro/regression/base/_base.py b/skpro/regression/base/_base.py index c89232f6..53653ce0 100644 --- a/skpro/regression/base/_base.py +++ b/skpro/regression/base/_base.py @@ -72,7 +72,7 @@ def __rmul__(self, other): else: return NotImplemented - def fit(self, X, y, C=None): + def fit(self, X, y, C=None, sample_weight=None): """Fit regressor to training data. 
Writes to self: @@ -89,6 +89,8 @@ def fit(self, X, y, C=None): C : ignored, optional (default=None) censoring information for survival analysis All probabilistic regressors assume data to be uncensored + sample_weight : pandas DataFrame, same length as X, default=None + sample weights to fit regressor to Returns ------- @@ -112,13 +114,18 @@ def fit(self, X, y, C=None): # set fitted flag to True self._is_fitted = True - - if not capa_surv: - return self._fit(X_inner, y_inner) + if sample_weight is None: + if not capa_surv: + return self._fit(X_inner, y_inner) + else: + return self._fit(X_inner, y_inner, C=C_inner) else: - return self._fit(X_inner, y_inner, C=C_inner) + if not capa_surv: + return self._fit(X_inner, y_inner, sample_weight=sample_weight) + else: + return self._fit(X_inner, y_inner, C=C_inner, sample_weight=sample_weight) - def _fit(self, X, y, C=None): + def _fit(self, X, y, C=None, sample_weight=None): """Fit regressor to training data. Writes to self: @@ -130,6 +137,11 @@ def _fit(self, X, y, C=None): feature instances to fit regressor to y : pandas DataFrame, must be same length as X labels to fit regressor to + C : ignored, optional (default=None) + censoring information for survival analysis + All probabilistic regressors assume data to be uncensored + sample_weight : pandas DataFrame, same length as X, default=None + sample weights to fit regressor to Returns ------- @@ -137,7 +149,7 @@ def _fit(self, X, y, C=None): """ raise NotImplementedError - def update(self, X, y, C=None): + def update(self, X, y, C=None, sample_weight=None): """Update regressor with a new batch of training data. 
Only estimators with the ``capability:update`` tag (value ``True``) @@ -159,6 +171,8 @@ def update(self, X, y, C=None): C : ignored, optional (default=None) censoring information for survival analysis All probabilistic regressors assume data to be uncensored + sample_weight : pandas DataFrame, same length as X, default=None + sample weights to fit regressor to Returns ------- @@ -178,12 +192,18 @@ def update(self, X, y, C=None): if capa_surv: C_inner = check_ret["C_inner"] - if not capa_surv: - return self._update(X_inner, y_inner) + if sample_weight is None: + if not capa_surv: + return self._update(X_inner, y_inner) + else: + return self._update(X_inner, y_inner, C=C_inner) else: - return self._update(X_inner, y_inner, C=C_inner) + if not capa_surv: + return self._update(X_inner, y_inner, sample_weight=sample_weight) + else: + return self._update(X_inner, y_inner, C=C_inner, sample_weight=sample_weight) - def _update(self, X, y, C=None): + def _update(self, X, y, C=None, sample_weight=None): """Update regressor with a new batch of training data. State required: @@ -198,6 +218,11 @@ def _update(self, X, y, C=None): feature instances to fit regressor to y : pandas DataFrame, must be same length as X labels to fit regressor to + C : ignored, optional (default=None) + censoring information for survival analysis + All probabilistic regressors assume data to be uncensored + sample_weight : pandas DataFrame, same length as X, default=None + sample weights to fit regressor to Returns ------- diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index b8a59c4c..2df67e09 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -201,7 +201,7 @@ def _predict_residuals_cv(self, X, y, cv, est): return y_pred - def _fit(self, X, y): + def _fit(self, X, y, sample_weight=None): """Fit regressor to training data. 
Writes to self: @@ -213,6 +213,8 @@ def _fit(self, X, y): feature instances to fit regressor to y : pandas DataFrame, must be same length as X labels to fit regressor to + sample_weight : pandas DataFrame, same length as X, default=None + sample weights to fit regressor to Returns ------- @@ -232,8 +234,10 @@ def _fit(self, X, y): # flatten column vector to 1D array to avoid sklearn complaints y = y.values y = flatten_to_1D_if_colvector(y) - - est.fit(X, y) + if sample_weight is None: + est.fit(X, y) + else: + est.fit(X, y, sample_weight=sample_weight) if cv is None: y_pred = est.predict(X) @@ -261,7 +265,10 @@ def _fit(self, X, y): # coerce X to pandas DataFrame with string column names X_r = prep_skl_df(X_r, copy_df=True) - est_r.fit(X_r, resids) + if sample_weight is None: + est_r.fit(X_r, resids) + else: + est_r.fit(X_r, resids, sample_weight=sample_weight) return self @@ -373,7 +380,7 @@ def _predict_proba(self, X): distr_loc_scale_name = ("mu", "scale") if residual_trafo == "squared": y_pred_scale = y_pred_scale / np.sqrt(2.0) - elif distr_type in ["Cauchy", "t"]: + elif distr_type == "t": from skpro.distributions.t import TDistribution distr_type = TDistribution From 2badc43bfa543e1b49e6dd2297b54496af87a8a6 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 15 Nov 2024 16:01:55 -0800 Subject: [PATCH 05/13] in ResidualDouble, copy dist_params to avoid mutating argument, and add a warning for t dist df<3 trafo=squared, where we observe poor performance. 
--- skpro/regression/residual.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 2df67e09..3684b735 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -155,7 +155,10 @@ def __init__( self.residual_trafo = residual_trafo self.distr_type = distr_type self.distr_loc_scale_name = distr_loc_scale_name - self.distr_params = distr_params + if distr_params is not None: + self.distr_params = distr_params.copy() + else: + self.distr_params = None self.use_y_pred = use_y_pred self.cv = cv self.min_scale = min_scale @@ -392,6 +395,10 @@ def _predict_proba(self, X): if residual_trafo == "absolute": y_pred_scale = y_pred_scale / half_t_correction(df) elif residual_trafo == "squared": + if df <= 3: + warnings.warn( + "Degrees of freedom less than 3 tends to yield poor results for squared residuals." + ) y_pred_scale = y_pred_scale / np.sqrt(df / (df - 2)) else: raise NotImplementedError( From 818ed5c0352dfb07b0bff839a7eb39df5c215903 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 15 Nov 2024 16:53:51 -0800 Subject: [PATCH 06/13] Added validity test for ResidualDouble. Fixed formatting of regression base and residual. 
--- skpro/regression/base/_base.py | 8 +- skpro/regression/residual.py | 30 +++++--- skpro/regression/tests/test_residual.py | 98 +++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 12 deletions(-) create mode 100644 skpro/regression/tests/test_residual.py diff --git a/skpro/regression/base/_base.py b/skpro/regression/base/_base.py index 53653ce0..695d8be1 100644 --- a/skpro/regression/base/_base.py +++ b/skpro/regression/base/_base.py @@ -123,7 +123,9 @@ def fit(self, X, y, C=None, sample_weight=None): if not capa_surv: return self._fit(X_inner, y_inner, sample_weight=sample_weight) else: - return self._fit(X_inner, y_inner, C=C_inner, sample_weight=sample_weight) + return self._fit( + X_inner, y_inner, C=C_inner, sample_weight=sample_weight + ) def _fit(self, X, y, C=None, sample_weight=None): """Fit regressor to training data. @@ -201,7 +203,9 @@ def update(self, X, y, C=None, sample_weight=None): if not capa_surv: return self._update(X_inner, y_inner, sample_weight=sample_weight) else: - return self._update(X_inner, y_inner, C=C_inner, sample_weight=sample_weight) + return self._update( + X_inner, y_inner, C=C_inner, sample_weight=sample_weight + ) def _update(self, X, y, C=None, sample_weight=None): """Update regressor with a new batch of training data. diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 3684b735..246b3c63 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -3,11 +3,12 @@ __author__ = ["fkiraly"] +import warnings + import numpy as np import pandas as pd -import warnings -from sklearn import clone from scipy.special import gamma +from sklearn import clone from skpro.regression.base import BaseProbaRegressor from skpro.utils.numpy import flatten_to_1D_if_colvector @@ -15,10 +16,11 @@ def half_t_correction(dof: float) -> float: - """Expected value of absolute value of t-distributed variable with mu=0 sigma=1. 
+ """Get expected value of absolute value of t-distributed variable with mu=0 sigma=1. For X ~ t(dof, 0, sigma), the expected value of the absolute value is - ``2 * sigma * sqrt(dof) * gamma((dof + 1) / 2) / (sqrt(pi) * (dof - 1) * gamma(dof / 2))``. + ``2 * sigma * sqrt(dof) * gamma((dof + 1) / 2) / + (sqrt(pi) * (dof - 1) * gamma(dof / 2))``. So E[|X|] / half_t_correction(dof) is an estimate of sigma. """ return ( @@ -254,7 +256,11 @@ def _fit(self, X, y, sample_weight=None): else: resids = residual_trafo.fit_transform(y - y_pred) warnings.warn( - "Arbitrary transforms are not compatible with the predict_proba method." + ( + "Arbitrary transforms are not compatible with the " + "predict_proba method." + ), + stacklevel=2, ) resids = flatten_to_1D_if_colvector(resids) @@ -389,7 +395,9 @@ def _predict_proba(self, X): distr_type = TDistribution distr_loc_scale_name = ("mu", "sigma") if "df" not in distr_params or distr_params["df"] <= 2: - raise ValueError("Degrees of freedom must be greater than 2 for t-distribution.") + raise ValueError( + "Degrees of freedom must be greater than 2 for t-distribution." + ) # Extract degrees of freedom df = distr_params["df"] if residual_trafo == "absolute": @@ -397,13 +405,15 @@ def _predict_proba(self, X): elif residual_trafo == "squared": if df <= 3: warnings.warn( - "Degrees of freedom less than 3 tends to yield poor results for squared residuals." + ( + "Degrees of freedom less than 3 tends to yield poor" + " results for squared residuals." 
+ ), + stacklevel=2, ) y_pred_scale = y_pred_scale / np.sqrt(df / (df - 2)) else: - raise NotImplementedError( - f"distr_type {distr_type} not implemented" - ) + raise NotImplementedError(f"distr_type {distr_type} not implemented") # collate all parameters for the distribution constructor # distribution params, if passed params = distr_params diff --git a/skpro/regression/tests/test_residual.py b/skpro/regression/tests/test_residual.py new file mode 100644 index 00000000..cf797ad3 --- /dev/null +++ b/skpro/regression/tests/test_residual.py @@ -0,0 +1,98 @@ +"""Tests Generalized Linear Model regressor.""" + +from typing import Dict, Literal, Optional + +import numpy as np +import pandas as pd +import pytest +from scipy import stats +from sklearn.linear_model import LinearRegression + +from skpro.regression.residual import ResidualDouble +from skpro.tests.test_switch import run_test_for_class + + +def held_out_cdf( + nn: int = 25_000, + distr_type: Literal["Laplace", "Normal", "t"] = "Laplace", + model: Literal["linear", "constant"] = "linear", + trafo: Literal["absolute", "squared"] = "absolute", + distr_params: Optional[Dict[str, float]] = None, +) -> pd.Series: + np.random.seed(42) + if distr_params is None: + distr_params = {} + else: + distr_params = distr_params.copy() + x_df = pd.DataFrame( + {"a": np.random.randn(nn), "b": np.random.randn(nn), "c": np.random.randn(nn)} + ).clip(-2, 2) + # DGP + if model == "linear": + loc_param_vec = pd.Series({"a": -1, "b": 1, "c": 0}) + log_scale_param_vec = pd.Series({"a": 0, "b": 0.01, "c": 0.5}) + loc_vec = x_df.dot(loc_param_vec) + log_scale_vec = x_df.dot(log_scale_param_vec).round(1) + else: + loc_vec = pd.Series(3.0, index=x_df.index) + log_scale_vec = pd.Series(0.0, index=x_df.index) + + if distr_type == "Laplace": + dist_cls = stats.laplace + elif distr_type == "Normal": + dist_cls = stats.norm + elif distr_type == "t": + dist_cls = stats.t + else: + raise ValueError(f"Distribution {distr_type} not supported") + 
dist = dist_cls(loc=loc_vec, scale=np.exp(log_scale_vec), **distr_params) + y = pd.DataFrame(dist.rvs((2, nn)).T, index=x_df.index, columns=["y0", "y1"]) + reg = ResidualDouble( + estimator=LinearRegression(), + estimator_resid=LinearRegression(), + distr_params=distr_params, + distr_type=distr_type, + residual_trafo=trafo, + # cv=KFold(n_splits=3), + ) + + reg.fit(x_df, y["y0"]) + pred = reg.predict_proba(x_df) + + cdf = pred.cdf(y[["y1"]])["y0"] + return cdf + + +@pytest.mark.skipif( + not run_test_for_class(ResidualDouble), + reason="run test only if softdeps are present and incrementally (if requested)", +) +@pytest.mark.parametrize( + "distr_type,distr_params", + [ + ("t", {"df": 5.1}), + ("t", {"df": 2.5}), + ("Laplace", None), + ("Normal", None), + ], +) +@pytest.mark.parametrize("trafo", ["absolute", "squared"]) +def test_residual_double_constant(distr_type, distr_params, trafo): + """Test validity of ResidualDouble regressor on a constant model.""" + Q_BINS = 4 + TOL_ALPHA = 0.001 + np.random.seed(42) + # Should be uniform(0,1) + held_out_quantiles = held_out_cdf( + model="constant", distr_type=distr_type, distr_params=distr_params, trafo=trafo + ) + # Counts of quantiles in bins + vc = pd.cut(held_out_quantiles, bins=np.linspace(0, 1, Q_BINS + 1)).value_counts() + # Expected counts under uniformity + e_vec = vc * vc.sum() / (Q_BINS * vc) + # Observed counts + o_vec = vc + # Chi-squared test + chsq = stats.chisquare(o_vec, e_vec, ddof=2) + # dist=1, ddf<3, trafo="squared" does very badly, hence the high tolerance + assert chsq.pvalue > TOL_ALPHA From 97893fc1b16de1eeffb1ad2a9239e9c576f75025 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 6 Dec 2024 11:57:24 -0800 Subject: [PATCH 07/13] added sample weights for ResidualDouble predictcv --- skpro/regression/residual.py | 13 ++++++++--- skpro/regression/tests/test_residual.py | 30 ++++++++++++++++++++++++- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/skpro/regression/residual.py 
b/skpro/regression/residual.py index 246b3c63..7be8d469 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -176,7 +176,7 @@ def __init__( else: self.estimator_resid_ = clone(estimator_resid) - def _predict_residuals_cv(self, X, y, cv, est): + def _predict_residuals_cv(self, X, y, cv, est=None, sample_weight=None): """Predict out-of-sample residuals for y from X using cv. Parameters @@ -193,7 +193,8 @@ def _predict_residuals_cv(self, X, y, cv, est): y_pred : pandas DataFrame, same length as `X`, same columns as `y` in `fit` labels predicted for `X` """ - est = self.estimator_resid_ + if est is None: + est = self.estimator_resid_ method = "predict" y_pred = y.copy() @@ -201,7 +202,13 @@ def _predict_residuals_cv(self, X, y, cv, est): X_train = X.iloc[tr_idx] X_test = X.iloc[tt_idx] y_train = y[tr_idx] - fitted_est = clone(est).fit(X_train, y_train) + if sample_weight is None: + fitted_est = clone(est).fit(X_train, y_train) + else: + sample_weight_train = sample_weight[tr_idx] + fitted_est = clone(est).fit( + X_train, y_train, sample_weight=sample_weight_train + ) y_pred[tt_idx] = getattr(fitted_est, method)(X_test) return y_pred diff --git a/skpro/regression/tests/test_residual.py b/skpro/regression/tests/test_residual.py index cf797ad3..f11a3510 100644 --- a/skpro/regression/tests/test_residual.py +++ b/skpro/regression/tests/test_residual.py @@ -1,4 +1,4 @@ -"""Tests Generalized Linear Model regressor.""" +"""Tests ResidualDouble regressor for uniform quantiles when model is correct.""" from typing import Dict, Literal, Optional @@ -96,3 +96,31 @@ def test_residual_double_constant(distr_type, distr_params, trafo): chsq = stats.chisquare(o_vec, e_vec, ddof=2) # dist=1, ddf<3, trafo="squared" does very badly, hence the high tolerance assert chsq.pvalue > TOL_ALPHA + + +@pytest.mark.skipif( + not run_test_for_class(ResidualDouble), + reason="run test only if softdeps are present and incrementally (if requested)", +) +def 
test_residual_double_sample_weight(): + """Test validity of ResidualDouble regressor on a constant model.""" + trafo = "absolute" + distr_type = "Laplace" + distr_params = None + Q_BINS = 4 + TOL_ALPHA = 0.001 + np.random.seed(42) + # Should be uniform(0,1) + held_out_quantiles = held_out_cdf( + model="constant", distr_type=distr_type, distr_params=distr_params, trafo=trafo + ) + # Counts of quantiles in bins + vc = pd.cut(held_out_quantiles, bins=np.linspace(0, 1, Q_BINS + 1)).value_counts() + # Expected counts under uniformity + e_vec = vc * vc.sum() / (Q_BINS * vc) + # Observed counts + o_vec = vc + # Chi-squared test + chsq = stats.chisquare(o_vec, e_vec, ddof=2) + # dist=1, ddf<3, trafo="squared" does very badly, hence the high tolerance + assert chsq.pvalue > TOL_ALPHA From c83719ed7067309ab5382697b2191ebd2659bd70 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 6 Dec 2024 12:05:54 -0800 Subject: [PATCH 08/13] Added Mike Hankin to contributors --- .all-contributorsrc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.all-contributorsrc b/.all-contributorsrc index 8d6f60fa..acbe1c44 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -184,6 +184,18 @@ "contributions": [ "doc" ] + }, + + { + "login": "meh2135", + "name": "Mike Hankin", + "avatar_url": "https://avatars.githubusercontent.com/u/313774?v=4", + "profile": "https://github.com/meh2135", + "contributions": [ + "bug", + "code", + "test" + ] } ] } From 42f4e869aaeb2639c6c82a9832981a2ec3027989 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 6 Dec 2024 12:23:34 -0800 Subject: [PATCH 09/13] fixed sample weight docstring comment. 
--- skpro/regression/base/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skpro/regression/base/_base.py b/skpro/regression/base/_base.py index 695d8be1..73561708 100644 --- a/skpro/regression/base/_base.py +++ b/skpro/regression/base/_base.py @@ -89,7 +89,7 @@ def fit(self, X, y, C=None, sample_weight=None): C : ignored, optional (default=None) censoring information for survival analysis All probabilistic regressors assume data to be uncensored - sample_weight : pandas DataFrame, same length as X, default=None + sample_weight : pandas DataFrame, same shape as y, default=None sample weights to fit regressor to Returns @@ -142,7 +142,7 @@ def _fit(self, X, y, C=None, sample_weight=None): C : ignored, optional (default=None) censoring information for survival analysis All probabilistic regressors assume data to be uncensored - sample_weight : pandas DataFrame, same length as X, default=None + sample_weight : pandas DataFrame, same shape as y, default=None sample weights to fit regressor to Returns @@ -173,7 +173,7 @@ def update(self, X, y, C=None, sample_weight=None): C : ignored, optional (default=None) censoring information for survival analysis All probabilistic regressors assume data to be uncensored - sample_weight : pandas DataFrame, same length as X, default=None + sample_weight : pandas DataFrame, same shape as y, default=None sample weights to fit regressor to Returns @@ -225,7 +225,7 @@ def _update(self, X, y, C=None, sample_weight=None): C : ignored, optional (default=None) censoring information for survival analysis All probabilistic regressors assume data to be uncensored - sample_weight : pandas DataFrame, same length as X, default=None + sample_weight : pandas DataFrame, same shape as y, default=None sample weights to fit regressor to Returns From 260735d4dcd9983d58a1486a07ce7419b5605440 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 6 Dec 2024 12:29:39 -0800 Subject: [PATCH 10/13] reincorporated cauchy and 
arbitrary transforms into ResidualDouble. --- skpro/regression/residual.py | 37 ++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 7be8d469..b8b5156d 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -264,8 +264,8 @@ def _fit(self, X, y, sample_weight=None): resids = residual_trafo.fit_transform(y - y_pred) warnings.warn( ( - "Arbitrary transforms are not compatible with the " - "predict_proba method." + "Arbitrary transforms will result in abberrant behavior in " + "the predict_proba method." ), stacklevel=2, ) @@ -374,8 +374,13 @@ def _predict_proba(self, X): elif residual_trafo == "squared": y_pred_scale = np.sqrt(y_pred_scale) else: - raise NotImplementedError( - f"residual_trafo {residual_trafo} not implemented" + y_pred_scale = residual_trafo.inverse_transform(y_pred_scale) + warnings.warn( + ( + "Arbitrary residual transforms will result in unpredictable" + " behavior." + ), + stacklevel=2, ) y_pred_scale = y_pred_scale.clip(min=min_scale) y_pred_scale = y_pred_scale.reshape(-1, n_cols) @@ -402,8 +407,13 @@ def _predict_proba(self, X): distr_type = TDistribution distr_loc_scale_name = ("mu", "sigma") if "df" not in distr_params or distr_params["df"] <= 2: - raise ValueError( - "Degrees of freedom must be greater than 2 for t-distribution." + warnings.warn( + ( + "t-distribution has no second moment for df <= 2, " + "and no first moment for df <= 1, so predict_proba will " + "result in erratic behavior." 
+ ), + stacklevel=2, ) # Extract degrees of freedom df = distr_params["df"] @@ -419,6 +429,21 @@ def _predict_proba(self, X): stacklevel=2, ) y_pred_scale = y_pred_scale / np.sqrt(df / (df - 2)) + elif distr_type == "Cauchy": + from skpro.distributions.t import TDistribution as CauchyDistribution + + warnings.warn( + ( + "Cauchy distribution has no first or second moments, so " + "predict_proba will result in erratic behavior." + ), + stacklevel=2, + ) + + distr_type = CauchyDistribution + distr_loc_scale_name = ("mu", "sigma") + distr_params = {"df": 1} + else: raise NotImplementedError(f"distr_type {distr_type} not implemented") # collate all parameters for the distribution constructor From 996c40761f9017a47b926ae6af8d0e30789b8400 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 6 Dec 2024 12:33:55 -0800 Subject: [PATCH 11/13] updated contributors --- .all-contributorsrc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.all-contributorsrc b/.all-contributorsrc index acbe1c44..dbe36584 100644 --- a/.all-contributorsrc +++ b/.all-contributorsrc @@ -185,7 +185,16 @@ "doc" ] }, - + { + "login": "sairevanth25", + "name": "Sai Revanth Gowravajhala", + "avatar_url": "https://avatars.githubusercontent.com/u/132150745?v=4", + "profile": "https://github.com/SaiRevanth25", + "contributions": [ + "code", + "doc" + ] + }, { "login": "meh2135", "name": "Mike Hankin", From e5655e312f0d8fc04de6316228db9e70fdb03b47 Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 6 Dec 2024 12:40:29 -0800 Subject: [PATCH 12/13] More relevant warnings around t distribution degrees of freedom in residualdouble --- skpro/regression/residual.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index b8b5156d..7cd271e3 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -406,21 +406,30 @@ def _predict_proba(self, X): distr_type = 
TDistribution distr_loc_scale_name = ("mu", "sigma") - if "df" not in distr_params or distr_params["df"] <= 2: - warnings.warn( - ( - "t-distribution has no second moment for df <= 2, " - "and no first moment for df <= 1, so predict_proba will " - "result in erratic behavior." - ), - stacklevel=2, - ) # Extract degrees of freedom df = distr_params["df"] if residual_trafo == "absolute": + if df <= 1: + warnings.warn( + ( + "Both the t-distribution and the half t-distribution have " + "no first moment for df<=1, so predict_proba will result " + "in erratic behavior." + ), + stacklevel=2, + ) y_pred_scale = y_pred_scale / half_t_correction(df) elif residual_trafo == "squared": - if df <= 3: + if df <= 2: + warnings.warn( + ( + "t-distribution has no second moment for df <= 2, and no " + "first moment for df <= 1, so predict_proba will result " + "in erratic behavior." + ), + stacklevel=2, + ) + elif df <= 3: warnings.warn( ( "Degrees of freedom less than 3 tends to yield poor" From 282bde9c6b45dd26bb652147e8135138d24ea76c Mon Sep 17 00:00:00 2001 From: Mike Hankin Date: Fri, 6 Dec 2024 12:55:17 -0800 Subject: [PATCH 13/13] moved parameter copy to inside predict_proba to fix breaking tests. 
--- skpro/regression/residual.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/skpro/regression/residual.py b/skpro/regression/residual.py index 7cd271e3..1af1167b 100644 --- a/skpro/regression/residual.py +++ b/skpro/regression/residual.py @@ -157,10 +157,7 @@ def __init__( self.residual_trafo = residual_trafo self.distr_type = distr_type self.distr_loc_scale_name = distr_loc_scale_name - if distr_params is not None: - self.distr_params = distr_params.copy() - else: - self.distr_params = None + self.distr_params = distr_params self.use_y_pred = use_y_pred self.cv = cv self.min_scale = min_scale @@ -351,6 +348,8 @@ def _predict_proba(self, X): if distr_params is None: distr_params = {} + else: + distr_params = distr_params.copy() # predict location - this is the same as in _predict y_pred_loc = est.predict(X)