From 945966eacbba3d0b6968e15678f9049d68122a01 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Franz=20Kir=C3=A1ly?=
Date: Wed, 13 Sep 2023 17:31:37 +0200
Subject: [PATCH] [ENH] Bagging ensemble of probabilistic regressors (#32)

Adds a bagging ensemble of probabilistic regressors, as a complete rewrite of the legacy estimator.

Depends on:

* https://github.com/sktime/skpro/pull/26 for the mixture distribution used in bagging.
* https://github.com/sktime/skpro/pull/65 for the default of `predict`, if `_predict_proba` or other proba methods are implemented but not `_predict`.
---
 skpro/regression/ensemble.py                  | 295 ++++++++++++++----
 skpro/regression/tests/test_all_regressors.py |   6 +
 2 files changed, 234 insertions(+), 67 deletions(-)

diff --git a/skpro/regression/ensemble.py b/skpro/regression/ensemble.py
index 3683bec19..a6667b623 100644
--- a/skpro/regression/ensemble.py
+++ b/skpro/regression/ensemble.py
@@ -1,86 +1,247 @@
-# LEGACY MODULE - TODO: remove or refactor
+"""Bagging probabilistic regressors."""
+
+__author__ = ["fkiraly"]
+__all__ = ["BaggingRegressor"]
+
+from math import ceil
 
 import numpy as np
-from sklearn.ensemble import BaggingRegressor as BaseBaggingRegressor
-from sklearn.utils import check_array
-from sklearn.utils.validation import check_is_fitted
+import pandas as pd
+
+from skpro.distributions.mixture import Mixture
+from skpro.regression.base import BaseProbaRegressor
+
+
+class BaggingRegressor(BaseProbaRegressor):
+    """Bagging ensemble of probabilistic regressors.
+
+    Fits ``n_estimators`` clones of an skpro regressor on
+    datasets which are instance sub-samples and/or variable sub-samples.
+
+    On ``predict_proba``, the mixture of the probabilistic predictions is returned.
+
+    The estimator allows choosing sample sizes for instances and variables,
+    and whether sampling is with or without replacement.
+
+    Direct generalization of ``sklearn``'s ``BaggingRegressor``
+    to the probabilistic regression task.
+
+    Parameters
+    ----------
+    estimator : skpro regressor, descendant of BaseProbaRegressor
+        regressor to use in the bagging estimator
+    n_estimators : int, default=10
+        number of estimator clones in the bagging ensemble
+    n_samples : int or float, default=1.0
+        The number of instances drawn from ``X`` in ``fit`` to train each clone.
+        If int, the exact number of instances.
+        If float, interpreted as a fraction of instances, rounded up via ``ceil``.
+    n_features : int or float, default=1.0
+        The number of features/variables drawn from ``X`` in ``fit`` to train each clone.
+        If int, the exact number of features.
+        If float, interpreted as a fraction of features, rounded up via ``ceil``.
+    bootstrap : boolean, default=True
+        whether samples/instances are drawn with replacement (True) or not (False)
+    bootstrap_features : boolean, default=False
+        whether features/variables are drawn with replacement (True) or not (False)
+    random_state : int, RandomState instance or None, optional (default=None)
+        If int, ``random_state`` is the seed used by the random number generator;
+        If ``RandomState`` instance, ``random_state`` is the random number generator;
+        If None, the random number generator is the ``RandomState`` instance used
+        by ``np.random``.
+
+    Attributes
+    ----------
+    estimators_ : list of skpro regressors
+        clones of the regressor in ``estimator``, fitted in the ensemble
+
+    Examples
+    --------
+    >>> from skpro.regression.ensemble import BaggingRegressor
+    >>> from skpro.regression.residual import ResidualDouble
+    >>> from sklearn.linear_model import LinearRegression
+    >>> from sklearn.datasets import load_diabetes
+    >>> from sklearn.model_selection import train_test_split
+    >>>
+    >>> X, y = load_diabetes(return_X_y=True, as_frame=True)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y)
+    >>>
+    >>> reg_mean = LinearRegression()
+    >>> reg_proba = ResidualDouble(reg_mean)
+    >>>
+    >>> ens = BaggingRegressor(reg_proba, n_estimators=10)
+    >>> ens.fit(X_train, y_train)
+    BaggingRegressor(...)
+    >>> y_pred = ens.predict_proba(X_test)
+    """
+
+    _tags = {"capability:missing": True}
+
+    def __init__(
+        self,
+        estimator,
+        n_estimators=10,
+        n_samples=1.0,
+        n_features=1.0,
+        bootstrap=True,
+        bootstrap_features=False,
+        random_state=None,
+    ):
+        self.estimator = estimator
+        self.n_estimators = n_estimators
+        self.n_samples = n_samples
+        self.n_features = n_features
+        self.bootstrap = bootstrap
+        self.bootstrap_features = bootstrap_features
+        self.random_state = random_state
+
+        super().__init__()
+
+        tags_to_clone = ["capability:missing"]
+        self.clone_tags(estimator, tags_to_clone)
+
+    def _fit(self, X, y):
+        """Fit regressor to training data.
+
+        Writes to self:
+            Sets fitted model attributes ending in "_".
+
+        Parameters
+        ----------
+        X : pandas DataFrame
+            feature instances to fit regressor to
+        y : pandas DataFrame, must be same length as X
+            labels to fit regressor to
+
+        Returns
+        -------
+        self : reference to self
+        """
+        estimator = self.estimator
+        n_estimators = self.n_estimators
+        n_samples = self.n_samples
+        n_features = self.n_features
+        bootstrap = self.bootstrap
+        bootstrap_ft = self.bootstrap_features
+        random_state = self.random_state
+        np.random.seed(random_state)
+
+        inst_ix = X.index
+        col_ix = X.columns
+        n = len(inst_ix)
+        m = len(col_ix)
+
+        if isinstance(n_samples, float):
+            n_samples_ = ceil(n_samples * n)
+        else:
+            n_samples_ = n_samples
+
+        if isinstance(n_features, float):
+            n_features_ = ceil(n_features * m)
+        else:
+            n_features_ = n_features
 
-from skpro.base.old_base import ProbabilisticEstimator
+        self.estimators_ = []
+        self.cols_ = []
 
+        for _i in range(n_estimators):
+            esti = estimator.clone()
+            row_iloc = pd.RangeIndex(n)
+            row_ss = _random_ss_ix(row_iloc, size=n_samples_, replace=bootstrap)
+            inst_ix_i = inst_ix[row_ss]
+            col_ix_i = _random_ss_ix(col_ix, size=n_features_, replace=bootstrap_ft)
 
-class BaggingRegressor(BaseBaggingRegressor, ProbabilisticEstimator):
-    class Distribution(ProbabilisticEstimator.Distribution):
-        def __init__(self, estimator, X, distributions, n_estimators):
-            super().__init__(estimator, X)
-            self.distributions = distributions
-            self.n_estimators = n_estimators
+            # store column subset for use in predict
+            self.cols_ += [col_ix_i]
 
-        def point(self):
-            return NotImplemented
+            Xi = _subs_cols(X.loc[inst_ix_i], col_ix_i, reset_cols=bootstrap_ft)
+            Xi = Xi.reset_index(drop=True)
 
-        def std(self):
-            return NotImplemented
+            yi = y.loc[inst_ix_i].reset_index(drop=True)
 
-        def pdf(self, x):
-            # Average the predicted PDFs
-            arr = np.array(
-                [d.pdf(x) for distribution in self.distributions for d in distribution]
-            )
+            self.estimators_ += [esti.fit(Xi, yi)]
 
-            return np.mean(arr, axis=0)
+        return self
 
-    def predict(self, X):
-        """Predict regression target for X.
+    def _predict_proba(self, X):
+        """Predict distribution over labels for data from features.
 
-        The predicted regression target of an input sample is computed as the
-        averaged predicted distributions of the estimators in the ensemble.
+        State required:
+            Requires state to be "fitted".
+
+        Accesses in self:
+            Fitted model attributes ending in "_"
 
         Parameters
         ----------
-        X : {array-like, sparse matrix} of shape = [n_samples, n_features]
-            The training input samples. Sparse matrices are accepted only if
-            they are supported by the base estimator.
+        X : pandas DataFrame, must have same columns as X in `fit`
+            data to predict labels for
 
         Returns
         -------
-        y : skpro.base.Distribution = [n_samples]
-            The predicted bagged distributions.
+        y : skpro BaseDistribution, same length as `X`
+            labels predicted for `X`
         """
+        reset_cols = self.bootstrap_features
+        Xis = [_subs_cols(X, col_ix_i, reset_cols) for col_ix_i in self.cols_]
+
+        y_probas = [est.predict_proba(Xi) for est, Xi in zip(self.estimators_, Xis)]
+
+        y_proba = Mixture(y_probas)
 
-        # Ensure estimator were being fitted
-        check_is_fitted(self, "estimators_features_")
-        # Check data
-        X = check_array(X, accept_sparse=["csr", "csc"])
-
-        # Parallel loop
-        from sklearn.ensemble.base import _partition_estimators
-
-        n_jobs, n_estimators, starts = _partition_estimators(
-            self.n_estimators, self.n_jobs
-        )
-
-        def _parallel_predict_regression(estimators, estimators_features, X):
-            """Private function used to compute predictions within a job."""
-            return [
-                estimator.predict(X[:, features])
-                for estimator, features in zip(estimators, estimators_features)
-            ]
-
-        # Obtain predictions
-        all_y_hat = [
-            _parallel_predict_regression(
-                self.estimators_[starts[i] : starts[i + 1]],
-                self.estimators_features_[starts[i] : starts[i + 1]],
-                X,
-            )
-            for i in range(n_jobs)
-        ]
-
-        # Reduce
-        return self._distribution()(self, X, all_y_hat, n_estimators)
-
-    def __str__(self, describer=str):
-        return "BaggingRegressor(" + describer(self.base_estimator) + ")"
-
-    def __repr__(self):
-        return self.__str__(repr)
+        return y_proba
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return the `"default"` set.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class.
+            Each dict contains parameters to construct an "interesting" test instance,
+            i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
+            `create_test_instance` uses the first (or only) dictionary in `params`
+        """
+        from sklearn.linear_model import LinearRegression
+
+        from skpro.regression.residual import ResidualDouble
+
+        regressor = ResidualDouble(LinearRegression())
+
+        params1 = {"estimator": regressor}
+        params2 = {
+            "estimator": regressor,
+            "n_samples": 0.5,
+            "n_features": 0.5,
+        }
+        params3 = {
+            "estimator": regressor,
+            "n_samples": 7,
+            "n_features": 2,
+            "bootstrap": False,
+            "bootstrap_features": True,
+        }
+
+        return [params1, params2, params3]
+
+
+def _random_ss_ix(ix, size, replace=True):
+    """Sample indices uniformly at random from a list of indices."""
+    a = range(len(ix))
+    ixs = ix[np.random.choice(a, size=size, replace=replace)]
+    return ixs
+
+
+def _subs_cols(df, col_ix, reset_cols=False):
+    """Subset columns of a DataFrame, with optional resetting of the column index."""
+    df_subset = df.loc[:, col_ix]
+    if reset_cols:
+        df_subset.columns = pd.RangeIndex(len(df_subset.columns))
+    return df_subset
diff --git a/skpro/regression/tests/test_all_regressors.py b/skpro/regression/tests/test_all_regressors.py
index 81b78118c..5fc555e43 100644
--- a/skpro/regression/tests/test_all_regressors.py
+++ b/skpro/regression/tests/test_all_regressors.py
@@ -27,7 +27,10 @@ def test_input_output_contract(self, object_instance):
         from sklearn.model_selection import train_test_split
 
         X, y = load_diabetes(return_X_y=True, as_frame=True)
+        X = X.iloc[:50]
+        y = y.iloc[:50]
         y = pd.DataFrame(y)
+
         X_train, X_test, y_train, y_test = train_test_split(X, y)
 
         # fit - just once for all predict output methods
@@ -145,7 +148,10 @@ def test_pred_quantiles_interval(self, object_instance, alpha):
         from sklearn.model_selection import train_test_split
 
         X, y = load_diabetes(return_X_y=True, as_frame=True)
+        X = X.iloc[:50]
+        y = y.iloc[:50]
         y = pd.DataFrame(y)
+
         X_train, X_test, y_train, _ = train_test_split(X, y)
 
         regressor = object_instance
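
Note on the subsampling semantics (an illustrative sketch, not part of the patch): fractional `n_samples` and `n_features` are resolved to counts via `ceil`, rows are drawn with replacement iff `bootstrap=True`, and columns with replacement iff `bootstrap_features=True`. The following standalone snippet replays that logic, mirroring `_random_ss_ix` and `_subs_cols` on a hypothetical toy frame `X`; the sizes and printed shape are illustrative only.

from math import ceil

import numpy as np
import pandas as pd

np.random.seed(42)

# hypothetical toy data: 10 instances, 4 features
X = pd.DataFrame(np.random.randn(10, 4), columns=list("abcd"))
n, m = X.shape

# fractional sizes resolve via ceil, as in _fit
n_samples_ = ceil(0.5 * n)   # ceil(0.5 * 10) == 5 rows per clone
n_features_ = ceil(0.5 * m)  # ceil(0.5 * 4) == 2 columns per clone

# rows with replacement (bootstrap=True), columns without
# (bootstrap_features=False), mirroring _random_ss_ix
row_ss = np.random.choice(range(n), size=n_samples_, replace=True)
col_ix_i = X.columns[np.random.choice(range(m), size=n_features_, replace=False)]

# column subset as in _subs_cols, then the row-index reset done in _fit
Xi = X.loc[:, col_ix_i].iloc[row_ss].reset_index(drop=True)
print(Xi.shape)  # (5, 2)

When `bootstrap_features=True`, the same column can be drawn more than once; this is why `_subs_cols` resets the column index to a `RangeIndex` in that case, both in `_fit` and in `_predict_proba`, so duplicate column labels do not collide and each fitted clone sees the same column layout at predict time.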
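
End-to-end usage, as a hedged sketch extending the docstring example: it assumes the standard `BaseProbaRegressor` interface (`predict`, `predict_proba`, `predict_interval`, `predict_quantiles`), which the amended test_all_regressors.py exercises; method signatures should be checked against the installed skpro version.

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from skpro.regression.ensemble import BaggingRegressor
from skpro.regression.residual import ResidualDouble

X, y = load_diabetes(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# bagging ensemble of 10 residual-based probabilistic linear regressors
ens = BaggingRegressor(ResidualDouble(LinearRegression()), n_estimators=10)
ens.fit(X_train, y_train)

y_pred = ens.predict(X_test)          # point predictions
y_proba = ens.predict_proba(X_test)   # Mixture distribution, one row per test instance
y_int = ens.predict_interval(X_test, coverage=0.9)          # 90% prediction intervals
y_q = ens.predict_quantiles(X_test, alpha=[0.1, 0.5, 0.9])  # quantile predictions

Here `predict_proba` returns the single `Mixture` distribution built in `_predict_proba`; the interval and quantile outputs are derived from the probabilistic prediction by the base class defaults, and `predict` itself defaults through the proba methods per PR #65.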