From 4b11e6260b01fdbcaf9059601fe3b318712bc9e3 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 9 Nov 2023 16:51:56 +0100
Subject: [PATCH] [ENH] Enable Grid-Search for `TableVectorizer` (#814)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* apply global sub-estimator default parameter

* fix docstring

* fix docstring

* remove pytest exception

* add changes

* Update skrub/_table_vectorizer.py

Co-authored-by: Jérôme Dockès

* remove the None conversion to passthrough

* add passthrough as default for numerical_transformer

* fix precommit

---------

Co-authored-by: Jérôme Dockès
---
 CHANGES.rst                          |   4 +
 skrub/__init__.py                    |   3 +-
 skrub/_table_vectorizer.py           | 355 ++++++++++++---------------
 skrub/tests/test_table_vectorizer.py |  87 ++++---
 4 files changed, 208 insertions(+), 241 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 4d6a29eb8..c1cd568d4 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -15,6 +15,10 @@ development and backward compatibility is not ensured.
 Major changes
 -------------
 
+* Pipelines including :class:`TableVectorizer` can now be grid-searched, since
+  we can now call `set_params` on the default transformers of :class:`TableVectorizer`.
+  :pr:`814` by :user:`Vincent Maladiere `
+
 * :func:`to_datetime` is now available to support pandas.to_datetime
   over dataframes and 2d arrays.
   :pr:`784` by :user:`Vincent Maladiere `
 
diff --git a/skrub/__init__.py b/skrub/__init__.py
index a55cc134a..e2fccecff 100644
--- a/skrub/__init__.py
+++ b/skrub/__init__.py
@@ -13,7 +13,7 @@ from ._minhash_encoder import MinHashEncoder
 from ._select_cols import DropCols, SelectCols
 from ._similarity_encoder import SimilarityEncoder
-from ._table_vectorizer import SuperVectorizer, TableVectorizer
+from ._table_vectorizer import TableVectorizer
 from ._target_encoder import TargetEncoder
 
 check_dependencies()
@@ -29,7 +29,6 @@
     "GapEncoder",
     "MinHashEncoder",
     "SimilarityEncoder",
-    "SuperVectorizer",
     "TableVectorizer",
     "TargetEncoder",
     "deduplicate",
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 535819a18..7832ca0e1 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -6,27 +6,31 @@
 import warnings
 from collections import Counter
-from typing import Literal
 
 import numpy as np
 import pandas as pd
 import sklearn
-from numpy.typing import ArrayLike
 from pandas._libs.tslibs.parsing import guess_datetime_format
 from pandas.core.dtypes.base import ExtensionDtype
 from scipy import sparse
-from sklearn.base import TransformerMixin, clone
+from sklearn.base import BaseEstimator, TransformerMixin, clone
 from sklearn.compose import ColumnTransformer
 from sklearn.compose._column_transformer import _get_transformer_list
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils import Bunch
-from sklearn.utils.deprecation import deprecated
-from sklearn.utils.metaestimators import _BaseComposition
 from sklearn.utils.validation import check_is_fitted
 
 from skrub import DatetimeEncoder, GapEncoder
 from skrub._utils import parse_astype_error_message
 
+HIGH_CARDINALITY_TRANSFORMER = GapEncoder(n_components=30)
+LOW_CARDINALITY_TRANSFORMER = OneHotEncoder(
+    sparse_output=False,
+    handle_unknown="ignore",
+    drop="if_binary",
+)
+DATETIME_TRANSFORMER = DatetimeEncoder()
+
 
 def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> str | None:
     """Infer the date format of a date column,
@@ -146,17 +150,76 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se
     return ser
 
 
-Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"]
+def _clone_if_default(transformer, default_transformer):
+    return clone(transformer) if transformer is default_transformer else transformer
+
+
+def _clone_during_fit(transformer, remainder, n_jobs):
+    if isinstance(transformer, sklearn.base.TransformerMixin):
+        return _propagate_n_jobs(clone(transformer), n_jobs)
+    elif transformer == "remainder":
+        return remainder if isinstance(remainder, str) else clone(remainder)
+    elif transformer == "passthrough":
+        return transformer
+    else:
+        raise ValueError(
+            "'transformer' must be an instance of sklearn.base.TransformerMixin, "
+            f"'remainder' or 'passthrough'. Got {transformer=!r}."
+        )
+
+
+def _check_specific_transformers(specific_transformers, n_jobs):
+    if (specific_transformers is None) or len(specific_transformers) == 0:
+        return []
+    else:
+        first_item_length = len(specific_transformers[0])
+        # Check that all tuples have the same length
+        for idx, tuple_ in enumerate(specific_transformers):
+            if len(tuple_) != first_item_length:
+                raise TypeError(
+                    "Expected `specific_transformers` to be a list of "
+                    "tuples with all the same length, got length "
+                    f"{len(tuple_)} at index {idx} (elements at previous "
+                    f"indices have length {first_item_length})."
+                )
+        if first_item_length == 2:
+            # Unnamed assignments, transform to named
+            specific_transformers = _get_transformer_list(specific_transformers)
+        elif first_item_length == 3:
+            # Named assignments, no-op
+            pass
+        else:
+            raise TypeError(
+                "Expected `specific_transformers` to be a list of tuples "
+                "of length 2 or 3, got a list of tuples of length "
+                f"{first_item_length}."
+            )
+
+        return [
+            (
+                (name, _propagate_n_jobs(clone(transformer), n_jobs), cols)
+                if isinstance(transformer, sklearn.base.TransformerMixin)
+                else (name, transformer, cols)
+            )
+            for name, transformer, cols in specific_transformers
+        ]
+
+
+def _propagate_n_jobs(transformer, n_jobs):
+    if n_jobs is not None and (
+        hasattr(transformer, "n_jobs") and transformer.n_jobs is None
+    ):
+        transformer.set_params(n_jobs=n_jobs)
+    return transformer
 
 
-class TableVectorizer(TransformerMixin, _BaseComposition):
+class TableVectorizer(TransformerMixin, BaseEstimator):
     """Automatically transform a heterogeneous dataframe to a numerical array.
 
     Easily transforms a heterogeneous data table
-    (such as a :obj:`~pandas.DataFrame`) to a numerical array for machine
-    learning. For this it transforms each column depending on its data type.
-    It provides a simplified interface for the ColumnTransformer ;
-    more documentation of attributes and functions are available in its doc.
+    (such as a :obj:`pandas.DataFrame`) to a numerical array for machine
+    learning. To do so, the TableVectorizer transforms each column depending
+    on its data type.
 
     Parameters
     ----------
@@ -165,44 +228,46 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
         under this value, the low cardinality categorical features, and above or
         equal, the high cardinality categorical features.
         Different transformers will be applied to these two groups,
-        defined by the parameters `low_card_cat_transformer` and
-        `high_card_cat_transformer` respectively.
+        defined by the parameters `low_cardinality_transformer` and
+        `high_cardinality_transformer` respectively.
         Note: currently, missing values are counted as a single unique value
         (so they count in the cardinality).
-    low_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
+or Transformer, optional
         Transformer used on categorical/string features with low cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance (e.g. OneHotEncoder),
         a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer
-        (OneHotEncoder(handle_unknown="ignore", drop="if_binary")).
+        'passthrough' to return the unencoded columns.
+        The default transformer is \
+        OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="if_binary").
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
-    high_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
+        or Transformer, optional
         Transformer used on categorical/string features with high cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance
         (e.g. GapEncoder), a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer (GapEncoder(n_components=30)).
+        or 'passthrough' to return the unencoded columns.
+        The default transformer is GapEncoder(n_components=30).
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
-    numerical_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    numerical_transformer : {'drop', 'remainder', 'passthrough'} \
+        or Transformer, optional
         Transformer used on numerical features.
         Can either be a transformer object instance (e.g. StandardScaler),
         a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer (here nothing, so 'passthrough').
+        or 'passthrough' to return the unencoded columns (default).
         Features classified under this category are not imputed at all
         (regardless of `impute_missing`).
 
@@ -217,7 +282,9 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
         Features classified under this category are not imputed at all
         (regardless of `impute_missing`).
 
-    specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} or Transformer, list of str or int), optional
+    specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or \
+        Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} \
+        or Transformer, list of str or int), optional
         On top of the default column type classification (see parameters above),
         this parameter allows you to manually specify transformers for
         specific columns.
@@ -270,8 +337,8 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     n_jobs : int, default=None
         Number of jobs to run in parallel. This number of jobs will be dispatched to
         the underlying transformers, if those support parallelization and they do not
-        set specifically `n_jobs`.
-        ``None`` (the default) means 1 unless in a :fund:`joblib.parallel_config`
+        set specifically ``n_jobs``.
+        ``None`` (the default) means 1 unless in a :func:`joblib.parallel_config`
         context. ``-1`` means using all processors.
 
     transformer_weights : dict, default=None
@@ -316,7 +383,8 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     See Also
     --------
     GapEncoder :
-        Encodes dirty categories (strings) by constructing latent topics with continuous encoding.
+        Encodes dirty categories (strings) by constructing latent topics with \
+        continuous encoding.
     MinHashEncoder :
         Encode string columns as a numeric array with the minhash method.
     SimilarityEncoder :
@@ -326,12 +394,12 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     -----
     The column order of the input data is not guaranteed to be the same as the
     output data (returned by TableVectorizer.transform).
-    This is a due to the way the ColumnTransformer works.
+    This is due to the way the underlying ColumnTransformer works.
     However, the output column order will always be the same for different
-    calls to TableVectorizer.transform on a same fitted TableVectorizer instance.
+    calls to ``TableVectorizer.transform`` on the same fitted TableVectorizer instance.
     For example, if input data has columns ['name', 'job', 'year'], then output
     columns might be shuffled, e.g. ['job', 'year', 'name'], but every call
-    to TableVectorizer.transform on this instance will return this order.
+    to ``TableVectorizer.transform`` on this instance will return this order.
 
     Examples
     --------
@@ -355,53 +423,42 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     >>> tv.transformers_
     [('numeric', 'passthrough', ['year_first_hired']), \
('datetime', DatetimeEncoder(), ['date_first_hired']), \
-('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist'), \
+('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore', \
+sparse_output=False), \
['gender', 'department', 'department_name', 'assignment_category']), \
('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])]
-    """  # noqa: E501
-
-    transformers_: list[tuple[str, Transformer, list[str]]]
-    types_: dict[str, type]
-    imputed_columns_: list[str]
-    low_card_cat_transformer_: Transformer
-    high_card_cat_transformer_: Transformer
-    numerical_transformer_: Transformer
-    datetime_transformer_: Transformer
-    specific_transformers_: list[tuple[str, Transformer, list[str, int]]]
-
-    _transformer_to_input_indices: dict[str, list[int]]
-
-    # Override required parameters
-    _required_parameters = []
+    """
 
     def __init__(
         self,
         *,
-        cardinality_threshold: int = 40,
-        low_card_cat_transformer: Transformer | None = None,
-        high_card_cat_transformer: Transformer | None = None,
-        numerical_transformer: Transformer | None = None,
-        datetime_transformer: Transformer | None = None,
-        specific_transformers: list[
-            tuple[Transformer, list[str | int]]
-            | tuple[str, Transformer, list[str, int]]
-        ]
-        | None = None,
-        auto_cast: bool = True,
-        impute_missing: Literal["auto", "force", "skip"] = "auto",
+        cardinality_threshold=40,
+        low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER,
+        high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER,
+        numerical_transformer="passthrough",
+        datetime_transformer=DATETIME_TRANSFORMER,
+        specific_transformers=None,
+        auto_cast=True,
+        impute_missing="auto",
         # The next parameters are inherited from ColumnTransformer
-        remainder: Literal["drop", "passthrough"] | TransformerMixin = "passthrough",
-        sparse_threshold: float = 0.0,
-        n_jobs: int = None,
+        remainder="passthrough",
+        sparse_threshold=0.0,
+        n_jobs=None,
         transformer_weights=None,
-        verbose: bool = False,
-        verbose_feature_names_out: bool = False,
+        verbose=False,
+        verbose_feature_names_out=False,
     ):
         self.cardinality_threshold = cardinality_threshold
-        self.low_card_cat_transformer = low_card_cat_transformer
-        self.high_card_cat_transformer = high_card_cat_transformer
+        self.low_cardinality_transformer = _clone_if_default(
+            low_cardinality_transformer, LOW_CARDINALITY_TRANSFORMER
+        )
+        self.high_cardinality_transformer = _clone_if_default(
+            high_cardinality_transformer, HIGH_CARDINALITY_TRANSFORMER
+        )
+        self.datetime_transformer = _clone_if_default(
+            datetime_transformer, DATETIME_TRANSFORMER
+        )
         self.numerical_transformer = numerical_transformer
-        self.datetime_transformer = datetime_transformer
         self.specific_transformers = specific_transformers
         self.auto_cast = auto_cast
         self.impute_missing = impute_missing
@@ -414,26 +471,7 @@ def __init__(
         self.verbose = verbose
         self.verbose_feature_names_out = verbose_feature_names_out
 
-    def _more_tags(self) -> dict:
-        """
-        Used internally by sklearn to ease the estimator checks.
-        """
-        return {
-            "X_types": ["2darray", "string"],
-            "allow_nan": [True],
-            "_xfail_checks": {
-                "check_complex_data": "Passthrough complex columns as-is.",
-            },
-        }
-
-    def _propagate_n_jobs(self, transformer):
-        if self.n_jobs is not None and (
-            hasattr(transformer, "n_jobs") and transformer.n_jobs is None
-        ):
-            transformer.set_params(n_jobs=self.n_jobs)
-        return transformer
-
-    def _clone_transformers(self) -> None:
+    def _clone_transformers(self):
         """
         For each of the different transformers that can be passed,
         create the corresponding variable name with a trailing underscore,
@@ -443,105 +481,23 @@ def _clone_transformers(self) -> None:
         Note: typos are not detected here, they are left in and are detected
         down the line in ColumnTransformer.fit_transform.
""" - if isinstance(self.low_card_cat_transformer, sklearn.base.TransformerMixin): - self.low_card_cat_transformer_ = clone(self.low_card_cat_transformer) - elif self.low_card_cat_transformer is None: - # sklearn is lenient and lets us use both - # `handle_unknown="infrequent_if_exist"` and `drop="if_binary"` - # at the same time - self.low_card_cat_transformer_ = OneHotEncoder( - drop="if_binary", handle_unknown="infrequent_if_exist" - ) - elif self.low_card_cat_transformer == "remainder": - self.low_card_cat_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.low_card_cat_transformer_ = self.low_card_cat_transformer - self._propagate_n_jobs(self.low_card_cat_transformer_) - - if isinstance(self.high_card_cat_transformer, sklearn.base.TransformerMixin): - self.high_card_cat_transformer_ = clone(self.high_card_cat_transformer) - elif self.high_card_cat_transformer is None: - self.high_card_cat_transformer_ = GapEncoder(n_components=30) - elif self.high_card_cat_transformer == "remainder": - self.high_card_cat_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) + for transformer_name in [ + "high_cardinality_transformer", + "low_cardinality_transformer", + "datetime_transformer", + "numerical_transformer", + ]: + transformer = _clone_during_fit( + getattr(self, transformer_name), + remainder=self.remainder, + n_jobs=self.n_jobs, ) - else: - self.high_card_cat_transformer_ = self.high_card_cat_transformer - self._propagate_n_jobs(self.high_card_cat_transformer_) - - if isinstance(self.numerical_transformer, sklearn.base.TransformerMixin): - self.numerical_transformer_ = clone(self.numerical_transformer) - elif self.numerical_transformer is None: - self.numerical_transformer_ = "passthrough" - elif self.numerical_transformer == "remainder": - self.numerical_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.numerical_transformer_ = self.numerical_transformer - self._propagate_n_jobs(self.numerical_transformer_) - - if isinstance(self.datetime_transformer, sklearn.base.TransformerMixin): - self.datetime_transformer_ = clone(self.datetime_transformer) - elif self.datetime_transformer is None: - self.datetime_transformer_ = DatetimeEncoder() - elif self.datetime_transformer == "remainder": - self.datetime_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.datetime_transformer_ = self.datetime_transformer - self._propagate_n_jobs(self.datetime_transformer_) - - if (self.specific_transformers is None) or len(self.specific_transformers) == 0: - self.specific_transformers_ = [] - else: - first_item_length = len(self.specific_transformers[0]) - # Check all tuples are the same length - for i, tup in enumerate(self.specific_transformers): - if len(tup) != first_item_length: - raise TypeError( - "Expected `specific_transformers` to be a list of " - "tuples with all the same length, got length " - f"{len(tup)} at index {i} (elements at previous " - f"indices have {first_item_length} in length). 
" - ) - if first_item_length == 2: - # Unnamed assignments, transform to named - named_specific_transformers = _get_transformer_list( - self.specific_transformers - ) - elif first_item_length == 3: - # Named assignments - named_specific_transformers = self.specific_transformers - else: - raise TypeError( - "Expected `specific_transformers` to be a list of tuples " - "of length 2 or 3, got a list of tuples of length " - f"{first_item_length}. " - ) - - self.specific_transformers_ = [ - ( - (name, self._propagate_n_jobs(clone(transformer)), cols) - if isinstance(transformer, sklearn.base.TransformerMixin) - else (name, transformer, cols) - ) - for name, transformer, cols in named_specific_transformers - ] + setattr(self, f"{transformer_name}_", transformer) - # TODO: check that the provided transformers are valid + self.specific_transformers_ = _check_specific_transformers( + self.specific_transformers, + self.n_jobs, + ) def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame: """Takes a dataframe and tries to convert its columns to their best possible @@ -741,7 +697,7 @@ def _check_X(self, X): ) return X - def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer": + def fit(self, X, y=None): """Fit all transformers using X. Parameters @@ -763,7 +719,7 @@ def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer": self.fit_transform(X, y=y) return self - def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: + def fit_transform(self, X, y=None): """Fit all transformers, transform the data, and concatenate the results. In practice, it (1) converts features to their best possible types @@ -849,11 +805,15 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: # Next part: construct the transformers # Create the list of all the transformers. - all_transformers: list[tuple[str, Transformer, list[str]]] = [ + all_transformers = [ ("numeric", self.numerical_transformer_, numeric_columns), ("datetime", self.datetime_transformer_, datetime_columns), - ("low_card_cat", self.low_card_cat_transformer_, low_card_cat_columns), - ("high_card_cat", self.high_card_cat_transformer_, high_card_cat_columns), + ("low_card_cat", self.low_cardinality_transformer_, low_card_cat_columns), + ( + "high_card_cat", + self.high_cardinality_transformer_, + high_card_cat_columns, + ), *self.specific_transformers_, ] # We will now filter this list, by keeping only the ones with: @@ -901,7 +861,7 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: return X_enc - def transform(self, X: ArrayLike) -> ArrayLike: + def transform(self, X): """Transform `X` by applying the fitted transformers on the columns. Parameters @@ -992,9 +952,14 @@ def output_indices_(self) -> dict[str, slice]: """ return self._column_transformer.output_indices_ - -@deprecated("Use TableVectorizer instead.") -class SuperVectorizer(TableVectorizer): - """Deprecated name of TableVectorizer.""" - - pass + def _more_tags(self) -> dict: + """ + Used internally by sklearn to ease the estimator checks. 
+ """ + return { + "X_types": ["2darray", "string"], + "allow_nan": [True], + "_xfail_checks": { + "check_complex_data": "Passthrough complex columns as-is.", + }, + } diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 333b4d7f9..a23700259 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -7,7 +7,7 @@ from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel from sklearn.utils.validation import check_is_fitted -from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer +from skrub import GapEncoder, MinHashEncoder, TableVectorizer from skrub._datetime_encoder import _is_pandas_format_mixed_available from skrub._table_vectorizer import _infer_date_format from skrub.tests.utils import transformers_list_equal @@ -192,7 +192,7 @@ def _test_possibilities(X) -> None: vectorizer_base = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) # Warning: order-dependant @@ -237,7 +237,7 @@ def _test_possibilities(X) -> None: vectorizer_cast = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) X_str = X.astype("object") @@ -360,7 +360,7 @@ def test_with_arrays() -> None: vectorizer = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) @@ -380,23 +380,28 @@ def test_get_feature_names_out() -> None: vec_w_pass.fit(X) # In this test, order matters. If it doesn't, convert to set. 
- expected_feature_names_pass = [ - "int", - "float", - "str1_public", - "str2_chef", - "str2_lawyer", - "str2_manager", - "str2_officer", - "str2_teacher", - "cat1_yes", - "cat2_20K+", - "cat2_30K+", - "cat2_40K+", - "cat2_50K+", - "cat2_60K+", - ] - assert vec_w_pass.get_feature_names_out().tolist() == expected_feature_names_pass + expected_feature_names_pass = np.array( + [ + "int", + "float", + "str1_public", + "str2_chef", + "str2_lawyer", + "str2_manager", + "str2_officer", + "str2_teacher", + "cat1_yes", + "cat2_20K+", + "cat2_30K+", + "cat2_40K+", + "cat2_50K+", + "cat2_60K+", + ] + ) + assert_array_equal( + vec_w_pass.get_feature_names_out(), + expected_feature_names_pass, + ) vec_w_drop = TableVectorizer(remainder="drop") vec_w_drop.fit(X) @@ -480,8 +485,8 @@ def test_passthrough() -> None: X_clean = _get_clean_dataframe() tv = TableVectorizer( - low_card_cat_transformer="passthrough", - high_card_cat_transformer="passthrough", + low_cardinality_transformer="passthrough", + high_cardinality_transformer="passthrough", datetime_transformer="passthrough", numerical_transformer="passthrough", impute_missing="skip", @@ -517,12 +522,6 @@ def test_check_fitted_table_vectorizer() -> None: tv.transform(X) -def test_check_name_change() -> None: - """Test that using SuperVectorizer raises a deprecation warning""" - with pytest.warns(FutureWarning): - SuperVectorizer() - - def test_handle_unknown() -> None: """ Test that new categories encountered in the test set @@ -732,7 +731,7 @@ def test_specific_transformers_unexpected_behavior(): ], ), TableVectorizer( - low_card_cat_transformer=MinHashEncoder(), + low_cardinality_transformer=MinHashEncoder(), ), ], ) @@ -876,13 +875,13 @@ def test_column_by_column() -> None: # when applied column by column X = _get_clean_dataframe() table_vec_all_cols = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + high_cardinality_transformer=GapEncoder(n_components=2, random_state=0), cardinality_threshold=4, ) table_vec_all_cols.fit(X) for col in X.columns: table_vec_one_col = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + high_cardinality_transformer=GapEncoder(n_components=2, random_state=0), cardinality_threshold=4, ) table_vec_one_col.fit(X[[col]]) @@ -905,7 +904,7 @@ def test_column_by_column() -> None: @skip_if_no_parallel @pytest.mark.parametrize( - "high_card_cat_transformer", + "high_cardinality_transformer", # the gap encoder and the minhashencoder # should be parallelized on all columns # the one hot encoder should not be parallelized @@ -915,11 +914,11 @@ def test_column_by_column() -> None: MinHashEncoder(n_components=2), ], ) -def test_parallelism(high_card_cat_transformer) -> None: +def test_parallelism(high_cardinality_transformer) -> None: # Test that parallelism works X = _get_clean_dataframe() table_vec_no_parallel = TableVectorizer( - high_card_cat_transformer=high_card_cat_transformer, + high_cardinality_transformer=high_cardinality_transformer, cardinality_threshold=4, ) X_trans = table_vec_no_parallel.fit_transform(X) @@ -927,7 +926,7 @@ def test_parallelism(high_card_cat_transformer) -> None: for n_jobs in [None, 2, -1]: table_vec = TableVectorizer( n_jobs=n_jobs, - high_card_cat_transformer=high_card_cat_transformer, + high_cardinality_transformer=high_cardinality_transformer, cardinality_threshold=4, ) X_trans_parallel = table_vec.fit_transform(X) @@ -980,7 +979,7 @@ def __init__(self, n_jobs=None): table_vectorizer = TableVectorizer( 
numerical_transformer=DummyTransformerWithJobs(n_jobs=None), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=None, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs is None @@ -988,7 +987,7 @@ def __init__(self, n_jobs=None): table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=2), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=None, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2 @@ -998,7 +997,7 @@ def __init__(self, n_jobs=None): # when the underlying transformer `n_jobs` is not set explicitly. table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=None), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=2, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2 @@ -1008,7 +1007,7 @@ def __init__(self, n_jobs=None): # when the underlying transformer `n_jobs` is set explicitly. table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=4), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=2, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 4 @@ -1022,14 +1021,14 @@ def test_table_vectorizer_remainder_cloning(): df = pd.concat([df1, df2], axis=1) remainder = FunctionTransformer() table_vectorizer = TableVectorizer( - low_card_cat_transformer="remainder", - high_card_cat_transformer="remainder", + low_cardinality_transformer="remainder", + high_cardinality_transformer="remainder", numerical_transformer="remainder", datetime_transformer="remainder", remainder=remainder, ).fit(df) - assert table_vectorizer.low_card_cat_transformer_ is not remainder - assert table_vectorizer.high_card_cat_transformer_ is not remainder + assert table_vectorizer.low_cardinality_transformer_ is not remainder + assert table_vectorizer.high_cardinality_transformer_ is not remainder assert table_vectorizer.numerical_transformer_ is not remainder assert table_vectorizer.datetime_transformer_ is not remainder
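
What this patch enables, in practice: the default transformers are now estimator
instances on which scikit-learn can call ``get_params``/``set_params``, so nested
parameters of a :class:`TableVectorizer` inside a pipeline resolve during a grid
search. Below is a minimal sketch (not part of the patch itself); the toy
dataframe, the target, and the ``Ridge`` regressor are illustrative assumptions::

    import pandas as pd
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import make_pipeline

    from skrub import TableVectorizer

    # Toy data: a low-cardinality string column, a higher-cardinality string
    # column, and a numeric column (values are illustrative).
    X = pd.DataFrame(
        {
            "city": ["Paris", "London", "Madrid"] * 10,
            "job": [f"Job title {i % 15}" for i in range(30)],
            "year": list(range(1990, 2020)),
        }
    )
    y = X["year"].astype(float)

    # make_pipeline names the first step "tablevectorizer", which prefixes
    # the parameter names below.
    pipeline = make_pipeline(TableVectorizer(), Ridge())

    # Nested parameters such as the default GapEncoder's n_components now
    # resolve, because high_cardinality_transformer is a real estimator
    # instance rather than None.
    param_grid = {
        "tablevectorizer__cardinality_threshold": [5, 20],
        "tablevectorizer__high_cardinality_transformer__n_components": [5, 10],
    }
    search = GridSearchCV(pipeline, param_grid, cv=2).fit(X, y)
    print(search.best_params_)

Before this change, constructing the grid above raised an error, since the
default transformers were materialized only at fit time and ``set_params`` had
nothing to recurse into.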