[ENH] Adds class_weight to those classifiers that support it. Required for imbalanced datasets. #1776

Merged: 3 commits, Jul 9, 2024
21 changes: 19 additions & 2 deletions aeon/classification/convolution_based/_arsenal.py
@@ -76,6 +76,17 @@ class Arsenal(BaseClassifier):
The collections of estimators trained in fit.
weights_ : list of shape (n_estimators) of float
Weight of each estimator in the ensemble.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
n_estimators_ : int
The number of estimators in the ensemble.

@@ -125,6 +136,7 @@ def __init__(
n_features_per_kernel=4,
time_limit_in_minutes=0.0,
contract_max_n_estimators=100,
class_weight=None,
n_jobs=1,
random_state=None,
):
@@ -135,6 +147,7 @@ def __init__(
self.n_features_per_kernel = n_features_per_kernel
self.time_limit_in_minutes = time_limit_in_minutes
self.contract_max_n_estimators = contract_max_n_estimators
self.class_weight = class_weight

self.random_state = random_state
self.n_jobs = n_jobs
@@ -355,7 +368,9 @@ def _fit_ensemble_estimator(self, rocket, X, y, keep_transformed_data):
transformed_x = rocket.fit_transform(X)
scaler = StandardScaler(with_mean=False)
scaler.fit(transformed_x, y)
ridge = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
ridge = RidgeClassifierCV(
alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
)
ridge.fit(scaler.transform(transformed_x), y)
return [
make_pipeline(rocket, scaler, ridge),
@@ -380,7 +395,9 @@ def _train_probas_for_estimator(self, Xt, y, idx, rng):

clf = make_pipeline(
StandardScaler(with_mean=False),
RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
RidgeClassifierCV(
alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
),
)
clf.fit(Xt[idx][subsample], y[subsample])
preds = clf.predict(Xt[idx][oob])
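For context on the "balanced" preset referenced in the docstrings above, here is a minimal sketch of the weights it produces, the same quantity RidgeClassifierCV derives internally when class_weight="balanced" is forwarded to each ensemble member. The labels are synthetic and purely illustrative. Note that RidgeClassifierCV itself accepts a dict or "balanced"; the "balanced_subsample" preset is specific to tree ensembles such as the ExtraTreesClassifier used by QUANTClassifier below.

```python
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

# Synthetic, imbalanced label vector: 90 cases of class 0, 10 of class 1.
y = np.array([0] * 90 + [1] * 10)
classes = np.unique(y)

# "balanced" weights follow n_samples / (n_classes * np.bincount(y)):
# class 0 -> 100 / (2 * 90) ~ 0.56, class 1 -> 100 / (2 * 10) = 5.0
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
print(dict(zip(classes, weights)))
```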
20 changes: 18 additions & 2 deletions aeon/classification/convolution_based/_hydra.py
@@ -25,6 +25,17 @@ class HydraClassifier(BaseClassifier):
Number of kernels per group.
n_groups : int, default=64
Number of groups per dilation.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
n_jobs : int, default=1
The number of jobs to run in parallel for both `fit` and `predict`.
``-1`` means using all processors.
@@ -76,9 +87,12 @@ class HydraClassifier(BaseClassifier):
"python_dependencies": "torch",
}

def __init__(self, n_kernels=8, n_groups=64, n_jobs=1, random_state=None):
def __init__(
self, n_kernels=8, n_groups=64, n_jobs=1, class_weight=None, random_state=None
):
self.n_kernels = n_kernels
self.n_groups = n_groups
self.class_weight = class_weight
self.n_jobs = n_jobs
self.random_state = random_state

@@ -95,7 +109,9 @@ def _fit(self, X, y):
self._clf = make_pipeline(
transform,
_SparseScaler(),
RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
RidgeClassifierCV(
alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
),
)
self._clf.fit(X, y)

20 changes: 18 additions & 2 deletions aeon/classification/convolution_based/_mr_hydra.py
@@ -23,6 +23,17 @@ class MultiRocketHydraClassifier(BaseClassifier):
Number of kernels per group for the Hydra transform.
n_groups : int, default=64
Number of groups per dilation for the Hydra transform.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
n_jobs : int, default=1
The number of jobs to run in parallel for both `fit` and `predict`.
``-1`` means using all processors.
@@ -70,9 +81,12 @@ class MultiRocketHydraClassifier(BaseClassifier):
"python_dependencies": "torch",
}

def __init__(self, n_kernels=8, n_groups=64, n_jobs=1, random_state=None):
def __init__(
self, n_kernels=8, n_groups=64, n_jobs=1, class_weight=None, random_state=None
):
self.n_kernels = n_kernels
self.n_groups = n_groups
self.class_weight = class_weight
self.n_jobs = n_jobs
self.random_state = random_state

@@ -101,7 +115,9 @@ def _fit(self, X, y):

Xt = np.concatenate((Xt_hydra, Xt_multirocket), axis=1)

self.classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
self.classifier = RidgeClassifierCV(
alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
)
self.classifier.fit(Xt, y)

return self
18 changes: 17 additions & 1 deletion aeon/classification/convolution_based/_rocket_classifier.py
@@ -45,6 +45,18 @@ class RocketClassifier(BaseClassifier):
estimator : sklearn compatible classifier or None, default=None
The estimator used. If None, a RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
is used.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
Only applies if estimator is None and the default is used.
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
random_state : int, RandomState instance or None, default=None
If `int`, random_state is the seed used by the random number generator;
If `RandomState` instance, random_state is the random number generator;
@@ -104,6 +116,7 @@ def __init__(
rocket_transform="rocket",
max_dilations_per_kernel=32,
n_features_per_kernel=4,
class_weight=None,
estimator=None,
random_state=None,
n_jobs=1,
@@ -113,6 +126,7 @@
self.max_dilations_per_kernel = max_dilations_per_kernel
self.n_features_per_kernel = n_features_per_kernel
self.random_state = random_state
self.class_weight = class_weight
self.estimator = estimator
self.n_jobs = n_jobs

@@ -168,7 +182,9 @@ def _fit(self, X, y):
self._scaler = StandardScaler(with_mean=False)
self._estimator = _clone_estimator(
(
RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
RidgeClassifierCV(
alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
)
if self.estimator is None
else self.estimator
),
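As noted in the RocketClassifier docstring, class_weight only reaches the default RidgeClassifierCV; with a custom estimator, the weighting must be set on that estimator directly. A minimal usage sketch under that assumption (synthetic data with arbitrary shape and imbalance; only parameters visible in this diff are assumed):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from aeon.classification.convolution_based import RocketClassifier

# Toy imbalanced collection: 50 cases, 1 channel, 100 time points.
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 1, 100))
y = np.array([0] * 45 + [1] * 5)

# class_weight is honoured here because estimator is None (default RidgeClassifierCV).
clf = RocketClassifier(class_weight="balanced", random_state=0)
clf.fit(X, y)

# With a custom estimator, class_weight on RocketClassifier is ignored;
# pass the weighting to the estimator itself instead.
clf_custom = RocketClassifier(
    estimator=LogisticRegression(class_weight="balanced", max_iter=1000),
    random_state=0,
)
clf_custom.fit(X, y)
```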
19 changes: 17 additions & 2 deletions aeon/classification/dictionary_based/_muse.py
@@ -73,6 +73,17 @@ class MUSE(BaseClassifier):
If set to True, a LogisticRegression will be trained, which does support
predict_proba(), yet is slower and typically less accurate. predict_proba() is
needed for example in Early-Classification like TEASER.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
n_jobs : int, default=1
The number of jobs to run in parallel for both `fit` and `predict`.
``-1`` means using all processors.
@@ -136,6 +147,7 @@ def __init__(
feature_selection="chi2",
p_threshold=0.05,
support_probabilities=False,
class_weight=None,
n_jobs=1,
random_state=None,
):
@@ -160,6 +172,7 @@ def __init__(
self.n_jobs = n_jobs
self.support_probabilities = support_probabilities
self.total_features_count = 0
self.class_weight = class_weight
self.feature_selection = feature_selection

super().__init__()
@@ -242,13 +255,15 @@ def _fit(self, X, y):

# Ridge Classifier does not give probabilities
if not self.support_probabilities:
self.clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
self.clf = RidgeClassifierCV(
alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
)
else:
self.clf = LogisticRegression(
max_iter=5000,
solver="liblinear",
dual=True,
# class_weight="balanced",
class_weight=self.class_weight,
penalty="l2",
random_state=self.random_state,
n_jobs=self.n_jobs,
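MUSE (and WEASEL below) forwards class_weight to whichever final estimator support_probabilities selects. A self-contained sketch of that branch, mirroring the estimator settings in the diff above:

```python
import numpy as np
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV

def final_estimator(support_probabilities, class_weight, random_state=None, n_jobs=1):
    """Sketch of the estimator choice in MUSE/WEASEL _fit."""
    if not support_probabilities:
        # Ridge classifier does not give probabilities, but is fast and accurate.
        return RidgeClassifierCV(
            alphas=np.logspace(-3, 3, 10), class_weight=class_weight
        )
    return LogisticRegression(
        max_iter=5000,
        solver="liblinear",
        dual=True,
        class_weight=class_weight,
        penalty="l2",
        random_state=random_state,
        n_jobs=n_jobs,
    )

print(final_estimator(False, "balanced"))  # RidgeClassifierCV with class_weight
print(final_estimator(True, "balanced"))   # LogisticRegression with class_weight
```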
19 changes: 17 additions & 2 deletions aeon/classification/dictionary_based/_weasel.py
@@ -84,6 +84,17 @@ class WEASEL(BaseClassifier):
If set to True, a LogisticRegression will be trained, which does support
predict_proba(), yet is slower and typically less accurate. predict_proba() is
needed for example in Early-Classification like TEASER.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
random_state : int, RandomState instance or None, default=None
If `int`, random_state is the seed used by the random number generator;
If `RandomState` instance, random_state is the random number generator;
@@ -136,6 +147,7 @@ def __init__(
n_jobs=1,
feature_selection="chi2",
support_probabilities=False,
class_weight=None,
random_state=None,
):
self.alphabet_size = alphabet_size
@@ -159,6 +171,7 @@ def __init__(
self.clf = None
self.n_jobs = n_jobs
self.support_probabilities = support_probabilities
self.class_weight = class_weight
set_num_threads(n_jobs)
super().__init__()

@@ -223,13 +236,15 @@ def _fit(self, X, y):

# Ridge Classifier does not give probabilities
if not self.support_probabilities:
self.clf = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10))
self.clf = RidgeClassifierCV(
alphas=np.logspace(-3, 3, 10), class_weight=self.class_weight
)
else:
self.clf = LogisticRegression(
max_iter=5000,
solver="liblinear",
dual=True,
# class_weight="balanced",
class_weight=self.class_weight,
penalty="l2",
random_state=self.random_state,
n_jobs=self.n_jobs,
17 changes: 16 additions & 1 deletion aeon/classification/dictionary_based/_weasel_v2.py
@@ -80,6 +80,17 @@ class WEASEL_V2(BaseClassifier):
max_feature_count : int, default=30_000
size of the dictionary - number of words to use - if feature_selection set to
"chi2" or "random". Else ignored.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
random_state : int or None, default=None
If `int`, random_state is the seed used by the random number generator;
If `None`, the random number generator is the `RandomState` instance used
@@ -128,6 +139,7 @@ def __init__(
feature_selection="chi2_top_k",
max_feature_count=30_000,
random_state=None,
class_weight=None,
n_jobs=4,
):
self.norm_options = norm_options
@@ -140,6 +152,7 @@ def __init__(
self.max_feature_count = max_feature_count
self.use_first_differences = use_first_differences
self.feature_selection = feature_selection
self.class_weight = class_weight

self.clf = None
self.n_jobs = n_jobs
@@ -178,7 +191,9 @@ def _fit(self, X, y):
words = self.transform.fit_transform(X, y)

# use RidgeClassifierCV for classification
self.clf = RidgeClassifierCV(alphas=np.logspace(-1, 5, 10))
self.clf = RidgeClassifierCV(
alphas=np.logspace(-1, 5, 10), class_weight=self.class_weight
)
self.clf.fit(words, y)

if hasattr(self.clf, "best_score_"):
27 changes: 24 additions & 3 deletions aeon/classification/interval_based/_quant.py
@@ -36,6 +36,18 @@ class QUANTClassifier(BaseClassifier):
estimator : sklearn estimator, default=None
The estimator to use for classification. If None, an ExtraTreesClassifier
with 200 estimators is used.
class_weight : {“balanced”, “balanced_subsample”}, dict or list of dicts, default=None
Only applies if estimator is None, and the default ExtraTreesClassifier is used.
From sklearn documentation:
If not given, all classes are supposed to have weight one.
The “balanced” mode uses the values of y to automatically adjust weights
inversely proportional to class frequencies in the input data as
n_samples / (n_classes * np.bincount(y))
The “balanced_subsample” mode is the same as “balanced” except that weights
are computed based on the bootstrap sample for every tree grown.
For multi-output, the weights of each column of y will be multiplied.
Note that these weights will be multiplied with sample_weight (passed through
the fit method) if sample_weight is specified.
random_state : int, RandomState instance or None, default=None
If `int`, random_state is the seed used by the random number generator;
If `RandomState` instance, random_state is the random number generator;
@@ -75,13 +87,18 @@
}

def __init__(
self, interval_depth=6, quantile_divisor=4, estimator=None, random_state=None
self,
interval_depth=6,
quantile_divisor=4,
estimator=None,
random_state=None,
class_weight=None,
):
self.interval_depth = interval_depth
self.quantile_divisor = quantile_divisor
self.estimator = estimator
self.random_state = random_state

self.class_weight = class_weight
super().__init__()

def _fit(self, X, y):
@@ -107,7 +124,11 @@ def _fit(self, X, y):
self._estimator = _clone_estimator(
(
ExtraTreesClassifier(
n_estimators=200, max_features=0.1, criterion="entropy"
n_estimators=200,
max_features=0.1,
criterion="entropy",
class_weight=self.class_weight,
random_state=self.random_state,
)
if self.estimator is None
else self.estimator
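Unlike the Ridge-based classifiers above, QUANTClassifier's default estimator is a tree ensemble, so the "balanced_subsample" preset is meaningful here. A standalone sketch of the ExtraTreesClassifier this diff builds when estimator is None, fitted on synthetic tabular features for illustration. Setting bootstrap=True is an assumption added here, since ExtraTreesClassifier defaults to bootstrap=False, in which case the preset falls back to plain "balanced" behaviour:

```python
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 20))       # stand-in for QUANT interval features
y = np.array([0] * 180 + [1] * 20)   # heavily imbalanced labels

# Settings mirror the default estimator in the diff; bootstrap=True is added
# (hypothetically) so "balanced_subsample" reweights per bootstrap sample.
forest = ExtraTreesClassifier(
    n_estimators=200,
    max_features=0.1,
    criterion="entropy",
    class_weight="balanced_subsample",
    bootstrap=True,
    random_state=0,
)
forest.fit(X, y)
print(forest.predict_proba(X[:5]))
```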