From 7e5e19032ad877049de0ce770b20657da7b798e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Dock=C3=A8s?= Date: Tue, 10 Dec 2024 10:48:06 +0100 Subject: [PATCH] Revert "MAINT adapt for scikit-learn 1.6 (#1135)" This reverts commit 18af508c7f279908cb0188140bfc4f336446c9ba. --- CHANGES.rst | 4 - benchmarks/bench_minhash_batch_number.py | 26 +--- skrub/_dataframe/_common.py | 5 +- skrub/_datetime_encoder.py | 21 ---- skrub/_fixes.py | 19 --- skrub/_interpolation_joiner.py | 12 +- skrub/_similarity_encoder.py | 46 +++---- skrub/_table_vectorizer.py | 39 ++---- skrub/_tabular_learner.py | 15 +-- skrub/_to_datetime.py | 2 - skrub/tests/test_sklearn.py | 152 +++++------------------ 11 files changed, 69 insertions(+), 272 deletions(-) delete mode 100644 skrub/_fixes.py diff --git a/CHANGES.rst b/CHANGES.rst index 87c83d84e..03f16dd7e 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -38,10 +38,6 @@ Bug fixes :user:`Jérôme Dockès ` and the matplotlib issue can be tracked [here](https://github.com/matplotlib/matplotlib/issues/25041). -Maintenance ------------ -* Make `skrub` compatible with scikit-learn 1.6. - :pr:`1135` by :user:`Guillaume Lemaitre `. Release 0.4.0 ============= diff --git a/benchmarks/bench_minhash_batch_number.py b/benchmarks/bench_minhash_batch_number.py index 6fda60f5b..4a26d36a3 100644 --- a/benchmarks/bench_minhash_batch_number.py +++ b/benchmarks/bench_minhash_batch_number.py @@ -15,11 +15,9 @@ import numpy as np import pandas as pd import seaborn as sns -import sklearn from joblib import Parallel, delayed, effective_n_jobs from sklearn.base import BaseEstimator, TransformerMixin from sklearn.utils import gen_even_slices, murmurhash3_32 -from sklearn.utils.fixes import parse_version from utils import default_parser, find_result, monitor from skrub._fast_hash import ngram_min_hash @@ -34,11 +32,6 @@ # flake8: noqa: E501 -sklearn_below_1_6 = parse_version( - parse_version(sklearn.__version__).base_version -) < parse_version("1.6") - - class MinHashEncoder(BaseEstimator, TransformerMixin): """ Encode string categorical features as a numeric array, minhash method @@ -133,20 +126,11 @@ def __init__( self.batch_per_job = batch_per_job self.n_jobs = n_jobs - if sklearn_below_1_6: - - def _more_tags(self): - """ - Used internally by sklearn to ease the estimator checks. - """ - return {"X_types": ["categorical"]} - - else: - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.categorical = True - return tags + def _more_tags(self): + """ + Used internally by sklearn to ease the estimator checks. + """ + return {"X_types": ["categorical"]} def _get_murmur_hash(self, string): """ diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py index 8f423128d..ee0b850e4 100644 --- a/skrub/_dataframe/_common.py +++ b/skrub/_dataframe/_common.py @@ -106,8 +106,6 @@ "total_seconds", ] -pandas_version = parse_version(parse_version(pd.__version__).base_version) - # # Inspecting containers' type and module # ====================================== @@ -332,8 +330,7 @@ def _concat_horizontal_pandas(*dataframes): init_index = dataframes[0].index dataframes = [df.reset_index(drop=True) for df in dataframes] dataframes = _join_utils.make_column_names_unique(*dataframes) - kwargs = {"copy": False} if pandas_version < parse_version("3.0") else {} - result = pd.concat(dataframes, axis=1, **kwargs) + result = pd.concat(dataframes, axis=1, copy=False) result.index = init_index return result diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index e62033932..6aee8a09b 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -1,8 +1,6 @@ from datetime import datetime, timezone import pandas as pd -import sklearn -from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted try: @@ -28,11 +26,6 @@ ] -sklearn_below_1_6 = parse_version( - parse_version(sklearn.__version__).base_version -) < parse_version("1.6") - - @dispatch def _is_date(col): raise NotImplementedError() @@ -330,17 +323,3 @@ def _check_params(self): raise ValueError( f"'resolution' options are {allowed}, got {self.resolution!r}." ) - - if sklearn_below_1_6: - - def _more_tags(self): - return {"preserves_dtype": []} - - else: - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - from sklearn.utils import TransformerTags - - tags.transformer_tags = TransformerTags() - return tags diff --git a/skrub/_fixes.py b/skrub/_fixes.py deleted file mode 100644 index 49ee8a219..000000000 --- a/skrub/_fixes.py +++ /dev/null @@ -1,19 +0,0 @@ -import sklearn -from sklearn.utils.fixes import parse_version - -sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) - - -if sklearn_version < parse_version("1.6"): - from sklearn.utils._tags import _safe_tags as get_tags # noqa -else: - from sklearn.utils import get_tags # noqa - - -def _check_n_features(estimator, X, *, reset): - if hasattr(estimator, "_check_n_features"): - estimator._check_n_features(X, reset=reset) - else: - from sklearn.utils.validation import _check_n_features - - _check_n_features(estimator, X, reset=reset) diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py index 14cbef4a1..2337aa8b8 100644 --- a/skrub/_interpolation_joiner.py +++ b/skrub/_interpolation_joiner.py @@ -1,5 +1,4 @@ import warnings -from dataclasses import is_dataclass import joblib import numpy as np @@ -8,11 +7,11 @@ HistGradientBoostingClassifier, HistGradientBoostingRegressor, ) +from sklearn.utils._tags import _safe_tags from . import _dataframe as sbd from . import _join_utils, _utils from . import _selectors as s -from ._fixes import get_tags from ._minhash_encoder import MinHashEncoder from ._table_vectorizer import TableVectorizer @@ -404,14 +403,7 @@ def _get_assignments_for_estimator(table, estimator): def _handles_multioutput(estimator): - tags = get_tags(estimator) - if isinstance(tags, dict): - # scikit-learn < 1.6 - return tags.get("multioutput", False) - elif is_dataclass(tags): - # scikit-learn >= 1.6 - return tags.target_tags.multi_output - return False + return _safe_tags(estimator).get("multioutput", False) def _fit(key_values, target_table, estimator, propagate_exceptions): diff --git a/skrub/_similarity_encoder.py b/skrub/_similarity_encoder.py index c29b53637..be16080c5 100644 --- a/skrub/_similarity_encoder.py +++ b/skrub/_similarity_encoder.py @@ -3,6 +3,7 @@ which encodes similarity instead of equality of values. """ + import numpy as np import pandas as pd import sklearn @@ -13,18 +14,12 @@ from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted -from ._fixes import _check_n_features from ._string_distances import get_ngram_count, preprocess # Ignore lines too long, first docstring lines can't be cut # flake8: noqa: E501 -sklearn_below_1_6 = parse_version( - parse_version(sklearn.__version__).base_version -) < parse_version("1.6") - - def _ngram_similarity_one_sample_inplace( x_count_vector, vocabulary_count_matrix, @@ -339,7 +334,7 @@ def fit(self, X, y=None): X[mask] = self.handle_missing Xlist, n_samples, n_features = self._check_X(X) - _check_n_features(self, X, reset=True) + self._check_n_features(X, reset=True) if self.handle_unknown not in ["error", "ignore"]: raise ValueError( @@ -458,7 +453,7 @@ def transform(self, X, fast=True): X[mask] = self.handle_missing Xlist, n_samples, n_features = self._check_X(X) - _check_n_features(self, X, reset=False) + self._check_n_features(X, reset=False) for i in range(n_features): Xi = Xlist[i] @@ -555,26 +550,15 @@ def _ngram_similarity_fast( return np.nan_to_num(out, copy=False) - if sklearn_below_1_6: - - def _more_tags(self): - return { - "X_types": ["2darray", "categorical", "string"], - "preserves_dtype": [], - "allow_nan": True, - "_xfail_checks": { - "check_estimator_sparse_data": ( - "Cannot create sparse matrix with strings." - ), - "check_estimators_dtypes": "We only support string dtypes.", - }, - } - - else: - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.categorical = True - tags.input_tags.string = True - tags.transformer_tags.preserves_dtype = [] - return tags + def _more_tags(self): + return { + "X_types": ["2darray", "categorical", "string"], + "preserves_dtype": [], + "allow_nan": True, + "_xfail_checks": { + "check_estimator_sparse_data": ( + "Cannot create sparse matrix with strings." + ), + "check_estimators_dtypes": "We only support string dtypes.", + }, + } diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 78463f997..aa8f050fb 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -3,12 +3,10 @@ from typing import Iterable import numpy as np -import sklearn from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.pipeline import make_pipeline from sklearn.preprocessing import OneHotEncoder from sklearn.utils._estimator_html_repr import _VisualBlock -from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted from . import _dataframe as sbd @@ -30,11 +28,6 @@ __all__ = ["TableVectorizer"] -sklearn_below_1_6 = parse_version( - parse_version(sklearn.__version__).base_version -) < parse_version("1.6") - - class PassThrough(SingleColumnTransformer): def fit_transform(self, column, y=None): return column @@ -665,27 +658,17 @@ def _sk_visual_block_(self): # scikit-learn compatibility - if sklearn_below_1_6: - - def _more_tags(self): - """ - Used internally by sklearn to ease the estimator checks. - """ - return { - "X_types": ["2darray", "string"], - "allow_nan": [True], - "_xfail_checks": { - "check_complex_data": "Passthrough complex columns as-is.", - }, - } - - else: - - def __sklearn_tags__(self): - tags = super().__sklearn_tags__() - tags.input_tags.string = True - tags.input_tags.allow_nan = True - return tags + def _more_tags(self): + """ + Used internally by sklearn to ease the estimator checks. + """ + return { + "X_types": ["2darray", "string"], + "allow_nan": [True], + "_xfail_checks": { + "check_complex_data": "Passthrough complex columns as-is.", + }, + } def get_feature_names_out(self): """Return the column names of the output of ``transform`` as a list of strings. diff --git a/skrub/_tabular_learner.py b/skrub/_tabular_learner.py index 607406f15..fbec30c28 100644 --- a/skrub/_tabular_learner.py +++ b/skrub/_tabular_learner.py @@ -1,5 +1,3 @@ -from dataclasses import is_dataclass - import sklearn from sklearn import ensemble from sklearn.base import BaseEstimator @@ -8,7 +6,6 @@ from sklearn.preprocessing import OrdinalEncoder, StandardScaler from sklearn.utils.fixes import parse_version -from ._fixes import get_tags from ._minhash_encoder import MinHashEncoder from ._table_vectorizer import TableVectorizer from ._to_categorical import ToCategorical @@ -273,15 +270,9 @@ def tabular_learner(estimator, *, n_jobs=None): high_cardinality=MinHashEncoder(), ) steps = [vectorizer] - try: - tags = get_tags(estimator) - if is_dataclass(tags): - allow_nan = tags.input_tags.allow_nan - else: - allow_nan = tags.get("allow_nan", False) - except TypeError: - allow_nan = False - if not allow_nan: + if not hasattr(estimator, "_get_tags") or not estimator._get_tags().get( + "allow_nan", False + ): steps.append(SimpleImputer(add_indicator=True)) if not isinstance(estimator, _TREE_ENSEMBLE_CLASSES): steps.append(StandardScaler()) diff --git a/skrub/_to_datetime.py b/skrub/_to_datetime.py index 943bf4906..2485fa84f 100644 --- a/skrub/_to_datetime.py +++ b/skrub/_to_datetime.py @@ -28,8 +28,6 @@ def _get_time_zone_pandas(col): return None if hasattr(tz, "zone"): return tz.zone - if hasattr(tz, "key"): - return tz.key return tz.tzname(None) diff --git a/skrub/tests/test_sklearn.py b/skrub/tests/test_sklearn.py index f50e3f89a..315c61f3a 100644 --- a/skrub/tests/test_sklearn.py +++ b/skrub/tests/test_sklearn.py @@ -2,6 +2,7 @@ import pytest import sklearn from sklearn.metrics.pairwise import linear_kernel, pairwise_distances +from sklearn.utils._tags import _safe_tags from sklearn.utils.estimator_checks import _is_pairwise_metric, parametrize_with_checks from skrub import ( # isort:skip @@ -12,133 +13,44 @@ SimilarityEncoder, TableVectorizer, ) -from skrub._fixes import get_tags -def _enforce_estimator_tags_X_monkey_patch( - estimator, X, X_test=None, kernel=linear_kernel -): +def _enforce_estimator_tags_X_monkey_patch(estimator, X, kernel=linear_kernel): """Monkey patch scikit-learn function to create a specific case where to enforce having only strings with some encoders. """ - tags = get_tags(estimator) - if isinstance(tags, dict): - # Estimators with `1darray` in `X_types` tag only accept - # X of shape (`n_samples`,) - if "1darray" in tags["X_types"]: - X = X[:, 0] - if X_test is not None: - X_test = X_test[:, 0] # pragma: no cover - # Estimators with a `requires_positive_X` tag only accept - # strictly positive data - if tags["requires_positive_X"]: - X = X - X.min() - if X_test is not None: - X_test = X_test - X_test.min() # pragma: no cover - if "categorical" in tags["X_types"]: - X = np.round((X - X.min())) - if X_test is not None: - X_test = np.round((X_test - X_test.min())) # pragma: no cover - if "string" in tags["X_types"]: - # Note: this part is the monkey patch - X = X.astype(object) - for i in range(X.shape[0]): - for j in range(X.shape[1]): - X[i, j] = str(X[i, j]) - if X_test is not None: - X_test = X_test.astype(object) - for i in range(X_test.shape[0]): - for j in range(X_test.shape[1]): - X_test[i, j] = str(X_test[i, j]) - elif tags["allow_nan"]: - X = X.astype(np.float64) - if X_test is not None: - X_test = X_test.astype(np.float64) # pragma: no cover - else: - X = X.astype(np.int32) - if X_test is not None: - X_test = X_test.astype(np.int32) # pragma: no cover + # Estimators with `1darray` in `X_types` tag only accept + # X of shape (`n_samples`,) + if "1darray" in _safe_tags(estimator, key="X_types"): + X = X[:, 0] + # Estimators with a `requires_positive_X` tag only accept + # strictly positive data + if _safe_tags(estimator, key="requires_positive_X"): + X = X - X.min() + if "categorical" in _safe_tags(estimator, key="X_types"): + X = np.round((X - X.min())) + if "string" in _safe_tags(estimator, key="X_types"): + # Note: this part is the monkey patch + X = X.astype(object) + for i in range(X.shape[0]): + for j in range(X.shape[1]): + X[i, j] = str(X[i, j]) + elif _safe_tags(estimator, key="allow_nan"): + X = X.astype(np.float64) + else: + X = X.astype(np.int32) - if estimator.__class__.__name__ == "SkewedChi2Sampler": - # SkewedChi2Sampler requires X > -skewdness in transform - X = X - X.min() - if X_test is not None: - X_test = X_test - X_test.min() # pragma: no cover + if estimator.__class__.__name__ == "SkewedChi2Sampler": + # SkewedChi2Sampler requires X > -skewdness in transform + X = X - X.min() - X_res = X - - # Pairwise estimators only accept - # X of shape (`n_samples`, `n_samples`) - if _is_pairwise_metric(estimator): - X_res = pairwise_distances(X, metric="euclidean") - if X_test is not None: - X_test = pairwise_distances( - X_test, X, metric="euclidean" - ) # pragma: no cover - elif tags["pairwise"]: - X_res = kernel(X, X) - if X_test is not None: - X_test = kernel(X_test, X) # pragma: no cover - else: - # scikit-learn >= 1.6 - # Estimators with `1darray` in `X_types` tag only accept - # X of shape (`n_samples`,) - if tags.input_tags.one_d_array: - X = X[:, 0] - if X_test is not None: - X_test = X_test[:, 0] # pragma: no cover - # Estimators with a `requires_positive_X` tag only accept - # strictly positive data - if tags.input_tags.positive_only: - X = X - X.min() - if X_test is not None: - X_test = X_test - X_test.min() # pragma: no cover - if tags.input_tags.categorical: - X = np.round((X - X.min())) - if X_test is not None: - X_test = np.round((X_test - X_test.min())) # pragma: no cover - if tags.input_tags.string: - X = X.astype(object) - for i in range(X.shape[0]): - for j in range(X.shape[1]): - X[i, j] = str(X[i, j]) - if X_test is not None: - X_test = X_test.astype(object) - for i in range(X_test.shape[0]): - for j in range(X_test.shape[1]): - X_test[i, j] = str(X_test[i, j]) - elif tags.input_tags.allow_nan: - X = X.astype(np.float64) - if X_test is not None: - X_test = X_test.astype(np.float64) # pragma: no cover - else: - X = X.astype(np.int32) - if X_test is not None: - X_test = X_test.astype(np.int32) # pragma: no cover - - if estimator.__class__.__name__ == "SkewedChi2Sampler": - # SkewedChi2Sampler requires X > -skewdness in transform - X = X - X.min() - if X_test is not None: - X_test = X_test - X_test.min() # pragma: no cover - - X_res = X - - # Pairwise estimators only accept - # X of shape (`n_samples`, `n_samples`) - if _is_pairwise_metric(estimator): - X_res = pairwise_distances(X, metric="euclidean") - if X_test is not None: - X_test = pairwise_distances( - X_test, X, metric="euclidean" - ) # pragma: no cover - elif tags.input_tags.pairwise: - X_res = kernel(X, X) - if X_test is not None: - X_test = kernel(X_test, X) # pragma: no cover - if X_test is not None: - return X_res, X_test - return X_res + # Pairwise estimators only accept + # X of shape (`n_samples`, `n_samples`) + if _is_pairwise_metric(estimator): + X = pairwise_distances(X, metric="euclidean") + elif _safe_tags(estimator, key="pairwise"): + X = kernel(X, X) + return X sklearn.utils.estimator_checks._enforce_estimator_tags_X = (