-
Notifications
You must be signed in to change notification settings - Fork 47
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ENH] Bagging ensemble of probabilistic regressors (#32)
Adds bagging ensemble of probabilistic regressors. Complete rewrite of the legacy estimator. Depends on: * #26 for the mixture distribution used in bagging. * #65 for the default of `predict`, if `_predict_proba` or other proba methods are implemented but not `_predict`.
- Loading branch information
Showing
2 changed files
with
234 additions
and
67 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,86 +1,247 @@ | ||
# LEGACY MODULE - TODO: remove or refactor | ||
"""Bagging probabilistic regressors.""" | ||
|
||
__author__ = ["fkiraly"] | ||
__all__ = ["BaggingRegressor"] | ||
|
||
from math import ceil | ||
|
||
import numpy as np | ||
from sklearn.ensemble import BaggingRegressor as BaseBaggingRegressor | ||
from sklearn.utils import check_array | ||
from sklearn.utils.validation import check_is_fitted | ||
import pandas as pd | ||
|
||
from skpro.distributions.mixture import Mixture | ||
from skpro.regression.base import BaseProbaRegressor | ||
|
||
|
||
class BaggingRegressor(BaseProbaRegressor): | ||
"""Bagging ensemble of probabilistic regressors. | ||
Fits ``n_estimators`` clones of an skpro regressor on | ||
datasets which are instance sub-samples and/or variable sub-samples. | ||
On ``predict_proba``, the mixture of probabilistic predictions is returned. | ||
The estimator allows to choose sample sizes for instances, variables, | ||
and whether sampling is with or without replacement. | ||
Direct generalization of ``sklearn``'s ``BaggingClassifier`` | ||
to the probabilistic regression task. | ||
Parameters | ||
---------- | ||
estimator : skpro regressor, descendant of BaseProbaRegressor | ||
regressor to use in the bagging estimator | ||
n_estimators : int, default=10 | ||
number of estimators in the sample for bagging | ||
n_samples : int or float, default=1.0 | ||
The number of instances drawn from ``X`` in ``fit`` to train each clone | ||
If int, then indicates number of instances precisely | ||
If float, interpreted as a fraction, and rounded by ``ceil`` | ||
n_features : int or float, default=1.0 | ||
The number of features/variables drawn from ``X`` in ``fit`` to train each clone | ||
If int, then indicates number of features precisely | ||
If float, interpreted as a fraction, and rounded by ``ceil`` | ||
bootstrap : boolean, default=True | ||
whether samples/instances are drawn with replacement (True) or not (False) | ||
bootstrap_features : boolean, default=False | ||
whether features/variables are drawn with replacement (True) or not (False) | ||
random_state : int, RandomState instance or None, optional (default=None) | ||
If int, ``random_state`` is the seed used by the random number generator; | ||
If ``RandomState`` instance, ``random_state`` is the random number generator; | ||
If None, the random number generator is the ``RandomState`` instance used | ||
by ``np.random``. | ||
Attributes | ||
---------- | ||
estimators_ : list of skpro regressors | ||
clones of regressor in `estimator` fitted in the ensemble | ||
Examples | ||
-------- | ||
>>> from skpro.regression.ensemble import BaggingRegressor | ||
>>> from skpro.regression.residual import ResidualDouble | ||
>>> from sklearn.linear_model import LinearRegression | ||
>>> from sklearn.datasets import load_diabetes | ||
>>> from sklearn.model_selection import train_test_split | ||
>>> | ||
>>> X, y = load_diabetes(return_X_y=True, as_frame=True) | ||
>>> X_train, X_test, y_train, y_test = train_test_split(X, y) | ||
>>> | ||
>>> reg_mean = LinearRegression() | ||
>>> reg_proba = ResidualDouble(reg_mean) | ||
>>> | ||
>>> ens = BaggingRegressor(reg_proba, n_estimators=10) | ||
>>> ens.fit(X_train, y_train) | ||
BaggingRegressor(...) | ||
>>> y_pred = ens.predict_proba(X_test) | ||
""" | ||
|
||
_tags = {"capability:missing": True} | ||
|
||
def __init__( | ||
self, | ||
estimator, | ||
n_estimators=10, | ||
n_samples=1.0, | ||
n_features=1.0, | ||
bootstrap=True, | ||
bootstrap_features=False, | ||
random_state=None, | ||
): | ||
self.estimator = estimator | ||
self.n_estimators = n_estimators | ||
self.n_samples = n_samples | ||
self.n_features = n_features | ||
self.bootstrap = bootstrap | ||
self.bootstrap_features = bootstrap_features | ||
self.random_state = random_state | ||
|
||
super().__init__() | ||
|
||
tags_to_clone = ["capability:missing"] | ||
self.clone_tags(estimator, tags_to_clone) | ||
|
||
def _fit(self, X, y): | ||
"""Fit regressor to training data. | ||
Writes to self: | ||
Sets fitted model attributes ending in "_". | ||
Parameters | ||
---------- | ||
X : pandas DataFrame | ||
feature instances to fit regressor to | ||
y : pandas DataFrame, must be same length as X | ||
labels to fit regressor to | ||
Returns | ||
------- | ||
self : reference to self | ||
""" | ||
estimator = self.estimator | ||
n_estimators = self.n_estimators | ||
n_samples = self.n_samples | ||
n_features = self.n_features | ||
bootstrap = self.bootstrap | ||
bootstrap_ft = self.bootstrap_features | ||
random_state = self.random_state | ||
np.random.seed(random_state) | ||
|
||
inst_ix = X.index | ||
col_ix = X.columns | ||
n = len(inst_ix) | ||
m = len(col_ix) | ||
|
||
if isinstance(n_samples, float): | ||
n_samples_ = ceil(n_samples * n) | ||
else: | ||
n_samples_ = n_samples | ||
|
||
if isinstance(n_features, float): | ||
n_features_ = ceil(n_features * m) | ||
else: | ||
n_features_ = n_features | ||
|
||
from skpro.base.old_base import ProbabilisticEstimator | ||
self.estimators_ = [] | ||
self.cols_ = [] | ||
|
||
for _i in range(n_estimators): | ||
esti = estimator.clone() | ||
row_iloc = pd.RangeIndex(n) | ||
row_ss = _random_ss_ix(row_iloc, size=n_samples_, replace=bootstrap) | ||
inst_ix_i = inst_ix[row_ss] | ||
col_ix_i = _random_ss_ix(col_ix, size=n_features_, replace=bootstrap_ft) | ||
|
||
class BaggingRegressor(BaseBaggingRegressor, ProbabilisticEstimator): | ||
class Distribution(ProbabilisticEstimator.Distribution): | ||
def __init__(self, estimator, X, distributions, n_estimators): | ||
super().__init__(estimator, X) | ||
self.distributions = distributions | ||
self.n_estimators = n_estimators | ||
# store column subset for use in predict | ||
self.cols_ += [col_ix_i] | ||
|
||
def point(self): | ||
return NotImplemented | ||
Xi = _subs_cols(X.loc[inst_ix_i], col_ix_i, reset_cols=bootstrap_ft) | ||
Xi = Xi.reset_index(drop=True) | ||
|
||
def std(self): | ||
return NotImplemented | ||
yi = y.loc[inst_ix_i].reset_index(drop=True) | ||
|
||
def pdf(self, x): | ||
# Average the predicted PDFs | ||
arr = np.array( | ||
[d.pdf(x) for distribution in self.distributions for d in distribution] | ||
) | ||
self.estimators_ += [esti.fit(Xi, yi)] | ||
|
||
return np.mean(arr, axis=0) | ||
return self | ||
|
||
def predict(self, X): | ||
"""Predict regression target for X. | ||
def _predict_proba(self, X) -> np.ndarray: | ||
"""Predict distribution over labels for data from features. | ||
The predicted regression target of an input sample is computed as the | ||
averaged predicted distributions of the estimators in the ensemble. | ||
State required: | ||
Requires state to be "fitted". | ||
Accesses in self: | ||
Fitted model attributes ending in "_" | ||
Parameters | ||
---------- | ||
X : {array-like, sparse matrix} of shape = [n_samples, n_features] | ||
The training input samples. Sparse matrices are accepted only if | ||
they are supported by the base estimator. | ||
X : pandas DataFrame, must have same columns as X in `fit` | ||
data to predict labels for | ||
Returns | ||
------- | ||
y : skpro.base.Distribution = [n_samples] | ||
The predicted bagged distributions. | ||
y : skpro BaseDistribution, same length as `X` | ||
labels predicted for `X` | ||
""" | ||
reset_cols = self.bootstrap_features | ||
Xis = [_subs_cols(X, col_ix_i, reset_cols) for col_ix_i in self.cols_] | ||
|
||
y_probas = [est.predict_proba(Xi) for est, Xi in zip(self.estimators_, Xis)] | ||
|
||
y_proba = Mixture(y_probas) | ||
|
||
# Ensure the estimators have been fitted | ||
check_is_fitted(self, "estimators_features_") | ||
# Check data | ||
X = check_array(X, accept_sparse=["csr", "csc"]) | ||
|
||
# Parallel loop | ||
from sklearn.ensemble.base import _partition_estimators | ||
|
||
n_jobs, n_estimators, starts = _partition_estimators( | ||
self.n_estimators, self.n_jobs | ||
) | ||
|
||
def _parallel_predict_regression(estimators, estimators_features, X): | ||
"""Private function used to compute predictions within a job.""" | ||
return [ | ||
estimator.predict(X[:, features]) | ||
for estimator, features in zip(estimators, estimators_features) | ||
] | ||
|
||
# Obtain predictions | ||
all_y_hat = [ | ||
_parallel_predict_regression( | ||
self.estimators_[starts[i] : starts[i + 1]], | ||
self.estimators_features_[starts[i] : starts[i + 1]], | ||
X, | ||
) | ||
for i in range(n_jobs) | ||
] | ||
|
||
# Reduce | ||
return self._distribution()(self, X, all_y_hat, n_estimators) | ||
|
||
def __str__(self, describer=str): | ||
return "BaggingRegressor(" + describer(self.base_estimator) + ")" | ||
|
||
def __repr__(self): | ||
return self.__str__(repr) | ||
return y_proba | ||
|
||
@classmethod | ||
def get_test_params(cls, parameter_set="default"): | ||
"""Return testing parameter settings for the estimator. | ||
Parameters | ||
---------- | ||
parameter_set : str, default="default" | ||
Name of the set of test parameters to return, for use in tests. If no | ||
special parameters are defined for a value, will return `"default"` set. | ||
Returns | ||
------- | ||
params : dict or list of dict, default = {} | ||
Parameters to create testing instances of the class | ||
Each dict contains parameters to construct an "interesting" test instance, i.e., | ||
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. | ||
`create_test_instance` uses the first (or only) dictionary in `params` | ||
""" | ||
from sklearn.linear_model import LinearRegression | ||
|
||
from skpro.regression.residual import ResidualDouble | ||
|
||
regressor = ResidualDouble(LinearRegression()) | ||
|
||
params1 = {"estimator": regressor} | ||
params2 = { | ||
"estimator": regressor, | ||
"n_samples": 0.5, | ||
"n_features": 0.5, | ||
} | ||
params3 = { | ||
"estimator": regressor, | ||
"n_samples": 7, | ||
"n_features": 2, | ||
"bootstrap": False, | ||
"bootstrap_features": True, | ||
} | ||
|
||
return [params1, params2, params3] | ||
|
||
|
||
def _random_ss_ix(ix, size, replace=True):
    """Draw a uniform random sub-sample of the entries of an index.

    Positions are sampled with ``np.random.choice`` (the module-level numpy
    random state), then used to index into ``ix``.

    Parameters
    ----------
    ix : indexable with ``len``, e.g., pandas Index or numpy array
        collection of indices to sample from
    size : int
        number of entries to draw
    replace : bool, default=True
        whether entries are drawn with replacement (True) or not (False)

    Returns
    -------
    sub-sample of ``ix``, obtained by indexing ``ix`` with the drawn positions
    """
    # sample integer positions first, so labels of any type can be sub-sampled
    positions = np.random.choice(range(len(ix)), size=size, replace=replace)
    return ix[positions]
|
||
|
||
def _subs_cols(df, col_ix, reset_cols=False):
    """Select columns of a DataFrame, optionally renumbering the column index.

    Parameters
    ----------
    df : pandas DataFrame
        frame to select columns from
    col_ix : column labels accepted by ``df.loc[:, col_ix]``
        columns to select; repeated labels yield repeated columns
    reset_cols : bool, default=False
        if True, the column index of the result is replaced by a
        ``RangeIndex`` of the same length

    Returns
    -------
    pandas DataFrame with the selected columns
    """
    selected = df.loc[:, col_ix]
    if not reset_cols:
        return selected

    # replace column labels by 0, 1, ..., so that repeated labels become unique
    selected.columns = pd.RangeIndex(selected.shape[1])
    return selected
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters