
[ENH] Multiclass classification reduction using Histograms #410

Merged
15 commits merged on Jul 9, 2024
15 changes: 13 additions & 2 deletions docs/source/api_reference/regression.rst
@@ -48,7 +48,9 @@ Reduction - adding ``predict_proba``
------------------------------------

This section lists reduction algorithms that
take one or multiple ``sklearn`` estimators and adda probabilistic prediction mode.
take one or multiple ``sklearn`` regressors and add a probabilistic prediction mode.

Formally, these algorithms are reduction algorithms to tabular regression.

.. currentmodule:: skpro.regression.bootstrap

@@ -107,6 +109,16 @@

CyclicBoosting

Reduction to probabilistic classification
-----------------------------------------

.. currentmodule:: skpro.regression.binned._sklearn_bin_regressor

.. autosummary::
:toctree: auto_generated/
:template: class.rst

HistBinnedProbaRegressor

Naive regressors and baselines
------------------------------
@@ -159,7 +171,6 @@ Adapters to other interfaces

SklearnProbaReg


Base classes
------------

6 changes: 6 additions & 0 deletions skpro/regression/binned/__init__.py
@@ -0,0 +1,6 @@
"""Reduction to probabilistic classification."""
# copyright: skpro developers, BSD-3-Clause License (see LICENSE file)

from skpro.regression.binned._sklearn_bin_regressor import HistBinnedProbaRegressor

__all__ = ["HistBinnedProbaRegressor"]
272 changes: 272 additions & 0 deletions skpro/regression/binned/_sklearn_bin_regressor.py
@@ -0,0 +1,272 @@
"""Reduction to classification using sklearn classifiers fit on binned data."""

__author__ = ["ShreeshaM07"]

import numpy as np
import pandas as pd

from skpro.distributions import Histogram
from skpro.regression.base import BaseProbaRegressor
from skpro.utils.sklearn import prep_skl_df


class HistBinnedProbaRegressor(BaseProbaRegressor):
"""A binned probabilistic regressor fitting a histogram distribution.

It is a probabilistic regressor that fits a Histogram distribution
by presenting binned outcomes to a probabilistic sklearn classifier.
``predict`` returns the bin (class) that each row of ``X`` is assigned to,
and ``predict_proba`` returns the predicted probabilities of each class,
per row of ``X``, in the form of a Histogram distribution.

The ``bins`` are used to bin the ``y`` values passed to ``fit``.
These bins then serve as the classes for the classifier, which predicts
the probabilities for each class.

Note: Ensure the ``y`` values passed to ``fit`` are within the ``bins``
range. Values outside the range are internally moved to the closest bin.

Parameters
----------
clf : instance of a sklearn classifier
Classifier to wrap, must have ``predict`` and ``predict_proba``.
bins : int or 1D array of float, default: 10
1. If ``int``, it is interpreted as the number of bins.
2. If an array, it is used as the bin boundaries.
For ``n`` bins, ``len(bins)`` must be ``n + 1``.

Attributes
----------
classes_ : np.array
Contains the names of the classes that it was fit on.
class_bin_map_ : dict
Maps each class name (the label of the ``i``-th bin) to that bin's
boundaries ``np.array([bins[i], bins[i+1]])``.
classes_proba_ : pd.DataFrame
Contains the class probabilities.
"""

_tags = {
"authors": ["ShreeshaM07"],
"maintainers": ["ShreeshaM07"],
"capability:multioutput": False,
"capability:missing": True,
}

def __init__(self, clf, bins=10):
self.clf = clf
self.bins = bins

super().__init__()

def _bins_int_arr(self, bins, y):
"""Create equal-width bin boundaries from an integer bin count."""
y = np.array(y).flatten()
# scale the endpoints slightly outward so that, for positive-valued y,
# min(y) and max(y) fall strictly inside the outermost bins;
# out-of-range values are clamped later in _y_bins_compatiblity
start = min(y) * 0.999
stop = max(y) * 1.001
bins = np.linspace(start=start, stop=stop, num=bins + 1)
return bins
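# Illustrative example (values are hypothetical): for y spanning [1, 10]
# and bins=3, this returns np.linspace(0.999, 10.01, 4), i.e. bin
# boundaries of approximately [0.999, 4.003, 7.006, 10.01].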

def _y_bins_compatiblity(self, y, bins, _y_cols):
"""Clamp y values to lie strictly within the bins range."""
y = np.array(y).flatten()
# move out-of-range values just inside the outermost bin boundaries
upper_y = bins[-1] - 1e-9
lower_y = bins[0] + 1e-9
y = np.where(y <= bins[0], lower_y, y)
y = np.where(y >= bins[-1], upper_y, y)
y = pd.DataFrame(y, columns=_y_cols)
return y
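# Illustrative example (values are hypothetical): with bins=[0, 1, 2],
# y=[-5, 0.5, 7] is clamped to [1e-9, 0.5, 2 - 1e-9], so every value
# falls inside one of the two bins.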

def _fit(self, X, y):
"""Fit regressor to training data.

Writes to self:
Sets fitted model attributes ending in "_".

Parameters
----------
X : pandas DataFrame
feature instances to fit regressor to
y : pandas DataFrame, must be same length as X
labels to fit regressor to

Returns
-------
self : reference to self
"""
from warnings import warn

from numpy.lib.stride_tricks import sliding_window_view
from sklearn import clone

self.clf_ = clone(self.clf)
bins = self.bins
self._y_cols = y.columns

# create bins array in case of bins being an `int`
if isinstance(bins, (int, np.integer)):
bins = self._bins_int_arr(bins, y)

# check if y values are within bins range
# if not move it to the closest bin.
y = self._y_bins_compatiblity(y, bins, self._y_cols)

# in case of int it will be internally replaced in fit
self._bins = bins

# Generate class names based on bins
class_bins = [f"class{i}" for i in range(len(bins) - 1)]
self._class_bins = class_bins

if len(bins) != len(class_bins) + 1:
warn(
f"`bins` has {len(bins)} elements, but {len(class_bins)} classes"
" require number of bins + 1 boundaries. Ensure `bins` contains"
" all the bin boundaries."
)

bins_hist = sliding_window_view(bins, window_shape=2)
# map each class name to its bin boundaries [bin start, bin end]
class_bin_map_ = {}
for i in range(len(bins_hist)):
class_bin_map_[class_bins[i]] = bins_hist[i]
self.class_bin_map_ = class_bin_map_

# bins the y values into classes.
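# Illustrative example (values are hypothetical): with bins=[0, 1, 2] and
# class_bins=["class0", "class1"], y values 0.5 and 1.5 are labeled
# "class0" and "class1" respectively.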
class_series = pd.cut(y.iloc[:, 0], bins=bins, labels=class_bins, right=True)
y_binned = pd.DataFrame(class_series, columns=self._y_cols)

X = prep_skl_df(X)
y_binned = prep_skl_df(y_binned)

# sklearn classifiers expect a 1D y; flatten single-column inputs
if isinstance(y_binned, pd.DataFrame) and len(y_binned.columns) == 1:
y_binned = y_binned.iloc[:, 0]
elif len(y_binned.shape) > 1 and y_binned.shape[1] == 1:
y_binned = y_binned[:, 0]

self.clf_.fit(X, y_binned)
self.classes_ = self.clf_.classes_

return self
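# Illustrative fit walk-through (values are hypothetical): with bins=4 and
# y ranging over [10, 100], _fit builds boundaries of approximately
# [9.99, 32.52, 55.05, 77.57, 100.1], labels each y value by its bin
# ("class0"..."class3"), and fits clf on X against these labels.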

def _predict(self, X):
"""Predict labels for data from features.

State required:
Requires state to be "fitted" = self.is_fitted=True

Accesses in self:
Fitted model attributes ending in "_"

Parameters
----------
X : pandas DataFrame, must have same columns as X in `fit`
data to predict labels for

Returns
-------
y : pandas DataFrame, same length as `X`, same columns as `y` in `fit`
labels predicted for `X`
"""
X = prep_skl_df(X)
y_pred = self.clf_.predict(X)
y_pred_df = pd.DataFrame(y_pred, index=X.index, columns=self._y_cols)
return y_pred_df

def _predict_proba(self, X):
"""Predict distribution over labels for data from features.

State required:
Requires state to be "fitted".

Accesses in self:
Fitted model attributes ending in "_"

Parameters
----------
X : pandas DataFrame, must have same columns as X in `fit`
data to predict labels for

Returns
-------
y : skpro BaseDistribution, same length as `X`
labels predicted for `X`
"""
from warnings import warn

X = prep_skl_df(X)
bins = self._bins
classes_ = self.classes_
class_bins = self._class_bins

if len(bins) != len(class_bins) + 1:
warn(
f"`bins` has {len(bins)} elements, but {len(class_bins)} classes"
" require number of bins + 1 boundaries. Ensure `bins` contains"
" all the bin boundaries."
)

y_pred_proba = self.clf_.predict_proba(X)
# map classes probabilities/bin_mass to class names
classes_proba_ = pd.DataFrame(y_pred_proba, columns=classes_)

# Identify missing classes
missing_classes = set(class_bins) - set(classes_)
if missing_classes:
# Add missing classes with 0 values
for missing_class in missing_classes:
classes_proba_[missing_class] = 0
# Sort columns based on the numerical part of the class names
# in order to match with the bins while calling Histogram distribution
classes_proba_ = classes_proba_.reindex(
sorted(classes_proba_.columns, key=lambda x: int(x[5:])), axis=1
)
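# Illustrative example (values are hypothetical): if only "class0" and
# "class2" appeared in the binned training labels, "class1" is added here
# with probability 0, and columns reorder to class0, class1, class2.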

self.classes_proba_ = classes_proba_
y_pred_proba = np.array(classes_proba_)

# single-instance case: return a scalar Histogram distribution
if len(X) == 1:
bin_mass = y_pred_proba[0]
pred_proba = Histogram(bins=bins, bin_mass=bin_mass)
return pred_proba

# convert to shape (n_instances, 1, n_bins): one row of bin masses per
# instance, as expected by a 2D array of Histogram distributions
bin_mass = np.array([y_pred_proba])
bin_mass = bin_mass.swapaxes(0, 1).reshape(-1, 1, bin_mass.shape[-1])

# broadcast the shared bin boundaries to every instance
bins = np.array([[bins]] * len(X))

pred_proba = Histogram(
bins=bins, bin_mass=bin_mass, index=X.index, columns=self._y_cols
)
return pred_proba

@classmethod
def get_test_params(cls, parameter_set="default"):
"""Return testing parameter settings for the estimator.

Parameters
----------
parameter_set : str, default="default"
Name of the set of test parameters to return, for use in tests. If no
special parameters are defined for a value, will return `"default"` set.

Returns
-------
params : dict or list of dict, default = {}
Parameters to create testing instances of the class
Each dict contains parameters to construct an "interesting" test instance, i.e.,
`MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
`create_test_instance` uses the first (or only) dictionary in `params`
"""
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelSpreading
from sklearn.tree import DecisionTreeClassifier

param1 = {"clf": DecisionTreeClassifier(), "bins": 4}
param2 = {"clf": GaussianNB()}
param3 = {"clf": LabelSpreading(), "bins": [20, 80, 160, 250, 300, 380, 420]}

return [param1, param2, param3]
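
For reviewers, a minimal usage sketch of the new estimator. This is illustrative only and not part of the diff; the diabetes dataset and `RandomForestClassifier` are arbitrary choices, and any sklearn classifier exposing `predict_proba` should work:

```python
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from skpro.regression.binned import HistBinnedProbaRegressor

X, y = load_diabetes(return_X_y=True, as_frame=True)
y = y.to_frame()  # skpro regressors expect y as a pandas DataFrame
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# wrap a probabilistic sklearn classifier; 10 equal-width bins over y's range
reg = HistBinnedProbaRegressor(clf=RandomForestClassifier(), bins=10)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)        # predicted bin labels, e.g. "class3"
y_dist = reg.predict_proba(X_test)  # Histogram distribution, one row per instance
```

Note that `predict` returns the classifier's bin labels rather than numeric values; numeric point predictions would come from summarizing the returned Histogram distribution, e.g., via its mean.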