From 5345bdf77b495169e221844a020b76a0d05cbfe5 Mon Sep 17 00:00:00 2001 From: Nilesh Das Date: Fri, 26 Apr 2024 05:16:22 +0530 Subject: [PATCH] [ENH] interfacing Poisson regressor from sklearn (#213) #### Reference Issues/PRs https://github.com/sktime/skpro/issues/7 #### What does this implement/fix? Explain your changes. Added interface for Poisson Regressor --- docs/source/api_reference/regression.rst | 1 + skpro/regression/linear/__init__.py | 2 + skpro/regression/linear/_sklearn.py | 2 +- skpro/regression/linear/_sklearn_poisson.py | 225 ++++++++++++++++++++ 4 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 skpro/regression/linear/_sklearn_poisson.py diff --git a/docs/source/api_reference/regression.rst b/docs/source/api_reference/regression.rst index fd69c8590..afbfa3355 100644 --- a/docs/source/api_reference/regression.rst +++ b/docs/source/api_reference/regression.rst @@ -102,6 +102,7 @@ Linear regression ARDRegression BayesianRidge GLMRegressor + PoissonRegressor Gaussian process and kernel regression -------------------------------------- diff --git a/skpro/regression/linear/__init__.py b/skpro/regression/linear/__init__.py index edb8efd09..14f25e1e9 100644 --- a/skpro/regression/linear/__init__.py +++ b/skpro/regression/linear/__init__.py @@ -3,9 +3,11 @@ from skpro.regression.linear._glm import GLMRegressor from skpro.regression.linear._sklearn import ARDRegression, BayesianRidge +from skpro.regression.linear._sklearn_poisson import PoissonRegressor __all__ = [ "ARDRegression", "BayesianRidge", "GLMRegressor", + "PoissonRegressor", ] diff --git a/skpro/regression/linear/_sklearn.py b/skpro/regression/linear/_sklearn.py index f85a3974d..da4f68ae0 100644 --- a/skpro/regression/linear/_sklearn.py +++ b/skpro/regression/linear/_sklearn.py @@ -1,4 +1,4 @@ -"""Adapters to sklearnn linear regressors with probabilistic components.""" +"""Adapters to sklearn linear regressors with probabilistic components.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) # based on sktime pipelines diff --git a/skpro/regression/linear/_sklearn_poisson.py b/skpro/regression/linear/_sklearn_poisson.py new file mode 100644 index 000000000..dc6cacc16 --- /dev/null +++ b/skpro/regression/linear/_sklearn_poisson.py @@ -0,0 +1,225 @@ +"""Adapters to sklearn linear regressors with probabilistic components.""" +# copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +# based on sktime pipelines + +import pandas as pd + +from skpro.regression.base import BaseProbaRegressor +from skpro.utils.sklearn import prep_skl_df + + +class PoissonRegressor(BaseProbaRegressor): + """Poisson regression, direct adapter to sklearn PoissonRegressor. + + Generalized Linear Model with a Poisson distribution. + This regressor uses the 'log' link function. + + Parameters + ---------- + alpha : float, default=1.0 + Constant that multiplies the penalty term. Defaults to 1.0. + See the notes for the exact mathematical meaning of this + parameter. alpha = 0 is equivalent to unpenalized GLMs. + + fit_intercept : bool, default=True + Whether to fit an intercept term. + + solver : {'lbfgs', 'newton-cholesky'}, default='lbfgs' + Algorithm to use in the optimization problem. + + 'lbfgs' is an optimization algorithm that approximates the BFGS algorithm + 'newton-cholesky' uses a Newton-CG variant of Newton's method. + + max_iter : int, default=100 + The maximal number of iterations for the solver. + + tol : float, default=1e-4 + The convergence tolerance. If it is not None, training will stop + when (loss > best_loss - tol) for n_iter_no_change consecutive + epochs. + + verbose : int, default=0 + For the 'sag' and 'lbfgs' solvers set verbose to any positive + number for verbosity. + + warm_start : bool, default=False + When set to True, reuse the solution of the previous call to fit as + initialization, otherwise, just erase the previous solution. + + Attributes + ---------- + coef_ : array-like of shape (n_features,) + Coefficients of the regression model (mean of distribution) + + intercept_ : float + Independent term in decision function. + + n_iter_ : int + The actual number of iterations before reaching the stopping criterion. + + n_features_in_ : int + Number of features seen during :term:'fit'. + + feature_names_in_ : ndarray of shape (n_features,) + Names of features seen during :term:'fit'. + """ + + _tags = { + "capability:multioutput": False, + "capability:missing": False, + "X_inner_mtype": "pd_DataFrame_Table", + "y_inner_mtype": "pd_DataFrame_Table", + } + + def __init__( + self, + alpha=1.0, + fit_intercept=True, + max_iter=100, + tol=1e-4, + verbose=0, + warm_start=False, + ): + self.alpha = alpha + self.fit_intercept = fit_intercept + self.max_iter = max_iter + self.tol = tol + self.verbose = verbose + self.warm_start = warm_start + + super().__init__() + + from sklearn.linear_model import PoissonRegressor + + skl_estimator = PoissonRegressor( + alpha=alpha, + fit_intercept=fit_intercept, + max_iter=max_iter, + tol=tol, + verbose=verbose, + warm_start=warm_start, + ) + + self.estimator_ = skl_estimator + + FITTED_PARAMS_TO_FORWARD = [ + "coef_", + "intercept_", + "n_iter_", + ] + + def _fit(self, X, y): + """Fit regressor to training data. + + Writes to self: + Sets fitted model attributes ending in "_". + + Parameters + ---------- + X : pandas DataFrame + feature instances to fit regressor to + y : pandas DataFrame, must be same length as X + labels to fit regressor to + + Returns + ------- + self : reference to self + """ + X_inner = prep_skl_df(X).to_numpy() + y_inner = prep_skl_df(y).to_numpy() + + self._y_cols = y.columns + + if len(y_inner.shape) > 1 and y_inner.shape[1] == 1: + y_inner = y_inner[:, 0] + + estimator = self.estimator_ + estimator.fit(X=X_inner, y=y_inner) + + for attr in self.FITTED_PARAMS_TO_FORWARD: + setattr(self, attr, getattr(estimator, attr)) + + return self + + def _predict(self, X): + """Predict labels for data from features. + + State required: + Requires state to be "fitted" = self.is_fitted=True + + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in `fit` + data to predict labels for + + Returns + ------- + y : pandas DataFrame, same length as `X`, same columns as `y` in `fit` + labels predicted for `X` + """ + X_inner = prep_skl_df(X).to_numpy() + y_pred = self.estimator_.predict(X_inner) + y_pred_df = pd.DataFrame(y_pred, index=X.index, columns=self._y_cols) + return y_pred_df + + def _predict_var(self, X): + """Compute/return variance predictions.""" + return self._predict(X) # Poisson variance is equal to mean + + def _predict_proba(self, X): + """Predict distribution over labels for data from features. + + State required: + Requires state to be "fitted". + + Accesses in self: + Fitted model attributes ending in "_" + + Parameters + ---------- + X : pandas DataFrame, must have same columns as X in `fit` + data to predict labels for + + Returns + ------- + y_pred : skpro BaseDistribution, same length as `X` + labels predicted for `X` + """ + from skpro.distributions.poisson import Poisson + + y_cols = self._y_cols + y_pred = self.predict(X).values + y_pred_proba = Poisson(y_pred, index=X.index, columns=y_cols) + return y_pred_proba + + @classmethod + def get_test_params(cls, parameter_set="default"): + """Return testing parameter settings for the estimator. + + Parameters + ---------- + parameter_set : str, default="default" + Name of the set of test parameters to return, for use in tests. If no + special parameters are defined for a value, will return `"default"` set. + + Returns + ------- + params : dict or list of dict, default = {} + Parameters to create testing instances of the class + Each dict are parameters to construct an "interesting" test instance, i.e., + `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. + `create_test_instance` uses the first (or only) dictionary in `params` + """ + param1 = {} + param2 = { + "alpha": 2.0, + "fit_intercept": False, + "max_iter": 200, + "tol": 2e-4, + "verbose": 1, + "warm_start": True, + } + return [param1, param2]