From ff4ae73ac612e3f672b15bef4cd1bb5aa1778237 Mon Sep 17 00:00:00 2001 From: chkoar Date: Wed, 8 Jan 2020 00:39:37 +0200 Subject: [PATCH 1/9] Accept column vectors when having binary or multiclass targets --- .gitignore | 2 ++ imblearn/utils/_validation.py | 3 +++ imblearn/utils/estimator_checks.py | 15 +++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/.gitignore b/.gitignore index 4328f217e..6d07f4bc3 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,8 @@ var/ *.egg-info/ .installed.cfg *.egg +Pipfile +Pipfile.lock # PyInstaller # Usually these files are written by a python script from a template diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index 8cb505f50..d1b0069b7 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -12,6 +12,7 @@ from sklearn.base import clone from sklearn.neighbors._base import KNeighborsMixin from sklearn.neighbors import NearestNeighbors +from sklearn.utils import column_or_1d from sklearn.utils.multiclass import type_of_target from ..exceptions import raise_isinstance_error @@ -96,6 +97,8 @@ def check_target_type(y, indicate_one_vs_all=False): "multioutput targets are not supported." ) y = y.argmax(axis=1) + else: + y = column_or_1d(y) return (y, type_y == "multilabel-indicator") if indicate_one_vs_all else y diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index 43f117ba3..51a039f85 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -44,6 +44,7 @@ def _yield_sampler_checks(name, Estimator): yield check_samplers_multiclass_ova yield check_samplers_preserve_dtype yield check_samplers_sample_indices + yield check_samplers_2d_target def _yield_classifier_checks(name, Estimator): @@ -283,6 +284,20 @@ def check_samplers_multiclass_ova(name, Sampler): assert_allclose(y_res, y_res_ova.argmax(axis=1)) +def check_samplers_2d_target(name, Sampler): + X, y = make_classification( + n_samples=100, + n_classes=3, + n_informative=4, + weights=[0.2, 0.3, 0.5], + random_state=0, + ) + + y = y.reshape(-1, 1) # Make the target 2d + sampler = Sampler() + sampler.fit_resample(X, y) + + def check_samplers_preserve_dtype(name, Sampler): X, y = make_classification( n_samples=1000, From e8608767dcc2b4b68d14ce8aed43d9771a597393 Mon Sep 17 00:00:00 2001 From: chkoar Date: Wed, 8 Jan 2020 15:02:13 +0200 Subject: [PATCH 2/9] Skip the failed test --- .../tests/test_instance_hardness_threshold.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 6f0cf51f4..a4b96fe8d 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -54,6 +54,11 @@ def test_iht_fit_resample(): assert y_resampled.shape == (12,) +reason = ("Probably irrelevant to this PR. " + "Something might changed in GBC in scikit-learn") + + +@pytest.mark.skip(reason=reason) def test_iht_fit_resample_half(): sampling_strategy = {0: 6, 1: 8} iht = InstanceHardnessThreshold( From 4df7c389c314be559da39c7d668507450d18b81c Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 31 Jan 2020 14:35:47 +0200 Subject: [PATCH 3/9] Revert "Skip the failed test" This reverts commit e8608767dcc2b4b68d14ce8aed43d9771a597393. --- .../tests/test_instance_hardness_threshold.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index a4b96fe8d..6f0cf51f4 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -54,11 +54,6 @@ def test_iht_fit_resample(): assert y_resampled.shape == (12,) -reason = ("Probably irrelevant to this PR. " - "Something might changed in GBC in scikit-learn") - - -@pytest.mark.skip(reason=reason) def test_iht_fit_resample_half(): sampling_strategy = {0: 6, 1: 8} iht = InstanceHardnessThreshold( From b087ceed58017bf2a07d72025a47d044b397b716 Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 31 Jan 2020 14:38:21 +0200 Subject: [PATCH 4/9] Update an irrelevant doctest --- doc/ensemble.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/ensemble.rst b/doc/ensemble.rst index 410c302e3..04a6dc10f 100644 --- a/doc/ensemble.rst +++ b/doc/ensemble.rst @@ -97,7 +97,7 @@ a boosting iteration :cite:`seiffert2009rusboost`:: RUSBoostClassifier(...) >>> y_pred = rusboost.predict(X_test) >>> balanced_accuracy_score(y_test, y_pred) # doctest: +ELLIPSIS - 0.6... + 0.4... A specific method which uses ``AdaBoost`` as learners in the bagging classifier is called EasyEnsemble. The :class:`EasyEnsembleClassifier` allows to bag From 79699733bbd1a5460278c34dfd266101d9c5b087 Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 31 Jan 2020 14:55:41 +0200 Subject: [PATCH 5/9] Update test_iht_fit_resample_half --- .../tests/test_instance_hardness_threshold.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 6f0cf51f4..03ef8c7c4 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -55,13 +55,13 @@ def test_iht_fit_resample(): def test_iht_fit_resample_half(): - sampling_strategy = {0: 6, 1: 8} + sampling_strategy = {0: 3, 1: 3} iht = InstanceHardnessThreshold( ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = iht.fit_resample(X, Y) - assert X_resampled.shape == (14, 2) - assert y_resampled.shape == (14,) + assert X_resampled.shape == (3, 2) + assert y_resampled.shape == (3,) def test_iht_fit_resample_class_obj(): From 9676582d51041f63c384a1704074309ce33192ff Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 31 Jan 2020 14:57:15 +0200 Subject: [PATCH 6/9] Update test_iht_fit_resample_half --- .../tests/test_instance_hardness_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 03ef8c7c4..c8aef4b77 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -60,8 +60,8 @@ def test_iht_fit_resample_half(): ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = iht.fit_resample(X, Y) - assert X_resampled.shape == (3, 2) - assert y_resampled.shape == (3,) + assert X_resampled.shape == (6, 2) + assert y_resampled.shape == (6,) def test_iht_fit_resample_class_obj(): From 6b471ea73a48d07a71d5895d434183bfee4b9436 Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 31 Jan 2020 15:07:39 +0200 Subject: [PATCH 7/9] Update test_iht_fit_resample_half --- .../tests/test_instance_hardness_threshold.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index c8aef4b77..652ee4a5f 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -9,6 +9,7 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import RandomForestClassifier +from sklearn.naive_bayes import GaussianNB from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import InstanceHardnessThreshold @@ -57,7 +58,7 @@ def test_iht_fit_resample(): def test_iht_fit_resample_half(): sampling_strategy = {0: 3, 1: 3} iht = InstanceHardnessThreshold( - ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED + GaussianNB(), sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (6, 2) From b321366b68a25818bb821f3ce86d23cfadd9f894 Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 31 Jan 2020 15:10:16 +0200 Subject: [PATCH 8/9] Update test_iht_fit_resample_half --- .../tests/test_instance_hardness_threshold.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 652ee4a5f..967f2d468 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -9,7 +9,7 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import RandomForestClassifier -from sklearn.naive_bayes import GaussianNB +from sklearn.naive_bayes import GaussianNB as NB from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import InstanceHardnessThreshold @@ -58,7 +58,7 @@ def test_iht_fit_resample(): def test_iht_fit_resample_half(): sampling_strategy = {0: 3, 1: 3} iht = InstanceHardnessThreshold( - GaussianNB(), sampling_strategy=sampling_strategy, random_state=RND_SEED + NB(), sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = iht.fit_resample(X, Y) assert X_resampled.shape == (6, 2) From ebc25b466446820266a21567bdddadc06d10bba2 Mon Sep 17 00:00:00 2001 From: chkoar Date: Fri, 31 Jan 2020 18:23:10 +0200 Subject: [PATCH 9/9] Revert "Skip the failed test" This reverts commit e8608767dcc2b4b68d14ce8aed43d9771a597393. --- doc/ensemble.rst | 2 +- .../tests/test_instance_hardness_threshold.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/ensemble.rst b/doc/ensemble.rst index 04a6dc10f..410c302e3 100644 --- a/doc/ensemble.rst +++ b/doc/ensemble.rst @@ -97,7 +97,7 @@ a boosting iteration :cite:`seiffert2009rusboost`:: RUSBoostClassifier(...) >>> y_pred = rusboost.predict(X_test) >>> balanced_accuracy_score(y_test, y_pred) # doctest: +ELLIPSIS - 0.4... + 0.6... A specific method which uses ``AdaBoost`` as learners in the bagging classifier is called EasyEnsemble. The :class:`EasyEnsembleClassifier` allows to bag diff --git a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py index 967f2d468..6f0cf51f4 100644 --- a/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py +++ b/imblearn/under_sampling/_prototype_selection/tests/test_instance_hardness_threshold.py @@ -9,7 +9,6 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import RandomForestClassifier -from sklearn.naive_bayes import GaussianNB as NB from sklearn.utils._testing import assert_array_equal from imblearn.under_sampling import InstanceHardnessThreshold @@ -56,13 +55,13 @@ def test_iht_fit_resample(): def test_iht_fit_resample_half(): - sampling_strategy = {0: 3, 1: 3} + sampling_strategy = {0: 6, 1: 8} iht = InstanceHardnessThreshold( - NB(), sampling_strategy=sampling_strategy, random_state=RND_SEED + ESTIMATOR, sampling_strategy=sampling_strategy, random_state=RND_SEED ) X_resampled, y_resampled = iht.fit_resample(X, Y) - assert X_resampled.shape == (6, 2) - assert y_resampled.shape == (6,) + assert X_resampled.shape == (14, 2) + assert y_resampled.shape == (14,) def test_iht_fit_resample_class_obj():