Revert "MAINT adapt for scikit-learn 1.6 (skrub-data#1135)" (skrub-data#1194)

This reverts commit 18af508.
jeromedockes · Dec 10, 2024 · ad825d4
1 parent 18af508
commit ad825d4
Show file tree

Hide file tree

Showing 11 changed files with 69 additions and 272 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -38,10 +38,6 @@ Bug fixes
   :user:`Jérôme Dockès <jeromedockes>` and the matplotlib issue can be tracked
   [here](https://github.com/matplotlib/matplotlib/issues/25041).
 
-Maintenance
------------
-* Make `skrub` compatible with scikit-learn 1.6.
-  :pr:`1135` by :user:`Guillaume Lemaitre <glemaitre>`.
 
 Release 0.4.0
 =============

diff --git a/benchmarks/bench_minhash_batch_number.py b/benchmarks/bench_minhash_batch_number.py
@@ -15,11 +15,9 @@
 import numpy as np
 import pandas as pd
 import seaborn as sns
-import sklearn
 from joblib import Parallel, delayed, effective_n_jobs
 from sklearn.base import BaseEstimator, TransformerMixin
 from sklearn.utils import gen_even_slices, murmurhash3_32
-from sklearn.utils.fixes import parse_version
 from utils import default_parser, find_result, monitor
 
 from skrub._fast_hash import ngram_min_hash
@@ -34,11 +32,6 @@
 # flake8: noqa: E501
 
 
-sklearn_below_1_6 = parse_version(
-    parse_version(sklearn.__version__).base_version
-) < parse_version("1.6")
-
-
 class MinHashEncoder(BaseEstimator, TransformerMixin):
     """
     Encode string categorical features as a numeric array, minhash method
@@ -133,20 +126,11 @@ def __init__(
         self.batch_per_job = batch_per_job
         self.n_jobs = n_jobs
 
-    if sklearn_below_1_6:
-
-        def _more_tags(self):
-            """
-            Used internally by sklearn to ease the estimator checks.
-            """
-            return {"X_types": ["categorical"]}
-
-    else:
-
-        def __sklearn_tags__(self):
-            tags = super().__sklearn_tags__()
-            tags.input_tags.categorical = True
-            return tags
+    def _more_tags(self):
+        """
+        Used internally by sklearn to ease the estimator checks.
+        """
+        return {"X_types": ["categorical"]}
 
     def _get_murmur_hash(self, string):
         """

diff --git a/skrub/_dataframe/_common.py b/skrub/_dataframe/_common.py
@@ -106,8 +106,6 @@
     "total_seconds",
 ]
 
-pandas_version = parse_version(parse_version(pd.__version__).base_version)
-
 #
 # Inspecting containers' type and module
 # ======================================
@@ -332,8 +330,7 @@ def _concat_horizontal_pandas(*dataframes):
     init_index = dataframes[0].index
     dataframes = [df.reset_index(drop=True) for df in dataframes]
     dataframes = _join_utils.make_column_names_unique(*dataframes)
-    kwargs = {"copy": False} if pandas_version < parse_version("3.0") else {}
-    result = pd.concat(dataframes, axis=1, **kwargs)
+    result = pd.concat(dataframes, axis=1, copy=False)
     result.index = init_index
     return result
 

diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py
@@ -1,8 +1,6 @@
 from datetime import datetime, timezone
 
 import pandas as pd
-import sklearn
-from sklearn.utils.fixes import parse_version
 from sklearn.utils.validation import check_is_fitted
 
 try:
@@ -28,11 +26,6 @@
 ]
 
 
-sklearn_below_1_6 = parse_version(
-    parse_version(sklearn.__version__).base_version
-) < parse_version("1.6")
-
-
 @dispatch
 def _is_date(col):
     raise NotImplementedError()
@@ -330,17 +323,3 @@ def _check_params(self):
             raise ValueError(
                 f"'resolution' options are {allowed}, got {self.resolution!r}."
             )
-
-    if sklearn_below_1_6:
-
-        def _more_tags(self):
-            return {"preserves_dtype": []}
-
-    else:
-
-        def __sklearn_tags__(self):
-            tags = super().__sklearn_tags__()
-            from sklearn.utils import TransformerTags
-
-            tags.transformer_tags = TransformerTags()
-            return tags
diff --git a/skrub/_fixes.py b/skrub/_fixes.py
diff --git a/skrub/_interpolation_joiner.py b/skrub/_interpolation_joiner.py
@@ -1,5 +1,4 @@
 import warnings
-from dataclasses import is_dataclass
 
 import joblib
 import numpy as np
@@ -8,11 +7,11 @@
     HistGradientBoostingClassifier,
     HistGradientBoostingRegressor,
 )
+from sklearn.utils._tags import _safe_tags
 
 from . import _dataframe as sbd
 from . import _join_utils, _utils
 from . import _selectors as s
-from ._fixes import get_tags
 from ._minhash_encoder import MinHashEncoder
 from ._table_vectorizer import TableVectorizer
 
@@ -404,14 +403,7 @@ def _get_assignments_for_estimator(table, estimator):
 
 
 def _handles_multioutput(estimator):
-    tags = get_tags(estimator)
-    if isinstance(tags, dict):
-        # scikit-learn < 1.6
-        return tags.get("multioutput", False)
-    elif is_dataclass(tags):
-        # scikit-learn >= 1.6
-        return tags.target_tags.multi_output
-    return False
+    return _safe_tags(estimator).get("multioutput", False)
 
 
 def _fit(key_values, target_table, estimator, propagate_exceptions):

diff --git a/skrub/_similarity_encoder.py b/skrub/_similarity_encoder.py
@@ -3,6 +3,7 @@
 which encodes similarity instead of equality of values.
 """
 
+
 import numpy as np
 import pandas as pd
 import sklearn
@@ -13,18 +14,12 @@
 from sklearn.utils.fixes import parse_version
 from sklearn.utils.validation import check_is_fitted
 
-from ._fixes import _check_n_features
 from ._string_distances import get_ngram_count, preprocess
 
 # Ignore lines too long, first docstring lines can't be cut
 # flake8: noqa: E501
 
 
-sklearn_below_1_6 = parse_version(
-    parse_version(sklearn.__version__).base_version
-) < parse_version("1.6")
-
-
 def _ngram_similarity_one_sample_inplace(
     x_count_vector,
     vocabulary_count_matrix,
@@ -339,7 +334,7 @@ def fit(self, X, y=None):
                     X[mask] = self.handle_missing
 
         Xlist, n_samples, n_features = self._check_X(X)
-        _check_n_features(self, X, reset=True)
+        self._check_n_features(X, reset=True)
 
         if self.handle_unknown not in ["error", "ignore"]:
             raise ValueError(
@@ -458,7 +453,7 @@ def transform(self, X, fast=True):
                     X[mask] = self.handle_missing
 
         Xlist, n_samples, n_features = self._check_X(X)
-        _check_n_features(self, X, reset=False)
+        self._check_n_features(X, reset=False)
 
         for i in range(n_features):
             Xi = Xlist[i]
@@ -555,26 +550,15 @@ def _ngram_similarity_fast(
 
         return np.nan_to_num(out, copy=False)
 
-    if sklearn_below_1_6:
-
-        def _more_tags(self):
-            return {
-                "X_types": ["2darray", "categorical", "string"],
-                "preserves_dtype": [],
-                "allow_nan": True,
-                "_xfail_checks": {
-                    "check_estimator_sparse_data": (
-                        "Cannot create sparse matrix with strings."
-                    ),
-                    "check_estimators_dtypes": "We only support string dtypes.",
-                },
-            }
-
-    else:
-
-        def __sklearn_tags__(self):
-            tags = super().__sklearn_tags__()
-            tags.input_tags.categorical = True
-            tags.input_tags.string = True
-            tags.transformer_tags.preserves_dtype = []
-            return tags
+    def _more_tags(self):
+        return {
+            "X_types": ["2darray", "categorical", "string"],
+            "preserves_dtype": [],
+            "allow_nan": True,
+            "_xfail_checks": {
+                "check_estimator_sparse_data": (
+                    "Cannot create sparse matrix with strings."
+                ),
+                "check_estimators_dtypes": "We only support string dtypes.",
+            },
+        }
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
@@ -3,12 +3,10 @@
 from typing import Iterable
 
 import numpy as np
-import sklearn
 from sklearn.base import BaseEstimator, TransformerMixin, clone
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils._estimator_html_repr import _VisualBlock
-from sklearn.utils.fixes import parse_version
 from sklearn.utils.validation import check_is_fitted
 
 from . import _dataframe as sbd
@@ -30,11 +28,6 @@
 __all__ = ["TableVectorizer"]
 
 
-sklearn_below_1_6 = parse_version(
-    parse_version(sklearn.__version__).base_version
-) < parse_version("1.6")
-
-
 class PassThrough(SingleColumnTransformer):
     def fit_transform(self, column, y=None):
         return column
@@ -665,27 +658,17 @@ def _sk_visual_block_(self):
 
     # scikit-learn compatibility
 
-    if sklearn_below_1_6:
-
-        def _more_tags(self):
-            """
-            Used internally by sklearn to ease the estimator checks.
-            """
-            return {
-                "X_types": ["2darray", "string"],
-                "allow_nan": [True],
-                "_xfail_checks": {
-                    "check_complex_data": "Passthrough complex columns as-is.",
-                },
-            }
-
-    else:
-
-        def __sklearn_tags__(self):
-            tags = super().__sklearn_tags__()
-            tags.input_tags.string = True
-            tags.input_tags.allow_nan = True
-            return tags
+    def _more_tags(self):
+        """
+        Used internally by sklearn to ease the estimator checks.
+        """
+        return {
+            "X_types": ["2darray", "string"],
+            "allow_nan": [True],
+            "_xfail_checks": {
+                "check_complex_data": "Passthrough complex columns as-is.",
+            },
+        }
 
     def get_feature_names_out(self):
         """Return the column names of the output of ``transform`` as a list of strings.

diff --git a/skrub/_tabular_learner.py b/skrub/_tabular_learner.py
@@ -1,5 +1,3 @@
-from dataclasses import is_dataclass
-
 import sklearn
 from sklearn import ensemble
 from sklearn.base import BaseEstimator
@@ -8,7 +6,6 @@
 from sklearn.preprocessing import OrdinalEncoder, StandardScaler
 from sklearn.utils.fixes import parse_version
 
-from ._fixes import get_tags
 from ._minhash_encoder import MinHashEncoder
 from ._table_vectorizer import TableVectorizer
 from ._to_categorical import ToCategorical
@@ -273,15 +270,9 @@ def tabular_learner(estimator, *, n_jobs=None):
             high_cardinality=MinHashEncoder(),
         )
     steps = [vectorizer]
-    try:
-        tags = get_tags(estimator)
-        if is_dataclass(tags):
-            allow_nan = tags.input_tags.allow_nan
-        else:
-            allow_nan = tags.get("allow_nan", False)
-    except TypeError:
-        allow_nan = False
-    if not allow_nan:
+    if not hasattr(estimator, "_get_tags") or not estimator._get_tags().get(
+        "allow_nan", False
+    ):
         steps.append(SimpleImputer(add_indicator=True))
     if not isinstance(estimator, _TREE_ENSEMBLE_CLASSES):
         steps.append(StandardScaler())

diff --git a/skrub/_to_datetime.py b/skrub/_to_datetime.py
@@ -28,8 +28,6 @@ def _get_time_zone_pandas(col):
         return None
     if hasattr(tz, "zone"):
         return tz.zone
-    if hasattr(tz, "key"):
-        return tz.key
     return tz.tzname(None)