From 4a4d686b000153993df053bba2e0e2a741b9292d Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Thu, 2 Nov 2023 17:41:00 +0100
Subject: [PATCH 1/9] apply global sub-estimator default parameter

---
 skrub/__init__.py                    |   3 +-
 skrub/_table_vectorizer.py           | 310 ++++++++++++---------------
 skrub/tests/test_table_vectorizer.py |  87 ++++----
 3 files changed, 180 insertions(+), 220 deletions(-)

diff --git a/skrub/__init__.py b/skrub/__init__.py
index 868632080..ccb01dc72 100644
--- a/skrub/__init__.py
+++ b/skrub/__init__.py
@@ -13,7 +13,7 @@
 from ._minhash_encoder import MinHashEncoder
 from ._select_cols import DropCols, SelectCols
 from ._similarity_encoder import SimilarityEncoder
-from ._table_vectorizer import SuperVectorizer, TableVectorizer
+from ._table_vectorizer import TableVectorizer
 from ._target_encoder import TargetEncoder
 
 check_dependencies()
@@ -29,7 +29,6 @@
     "GapEncoder",
     "MinHashEncoder",
     "SimilarityEncoder",
-    "SuperVectorizer",
     "TableVectorizer",
     "TargetEncoder",
     "deduplicate",
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index f7478d03c..c804b0571 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -6,27 +6,31 @@
 
 import warnings
 from collections import Counter
-from typing import Literal
 
 import numpy as np
 import pandas as pd
 import sklearn
-from numpy.typing import ArrayLike
 from pandas._libs.tslibs.parsing import guess_datetime_format
 from pandas.core.dtypes.base import ExtensionDtype
 from scipy import sparse
-from sklearn.base import TransformerMixin, clone
+from sklearn.base import BaseEstimator, TransformerMixin, clone
 from sklearn.compose import ColumnTransformer
 from sklearn.compose._column_transformer import _get_transformer_list
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils import Bunch
-from sklearn.utils.deprecation import deprecated
-from sklearn.utils.metaestimators import _BaseComposition
 from sklearn.utils.validation import check_is_fitted
 
 from skrub import DatetimeEncoder, GapEncoder
 from skrub._utils import parse_astype_error_message
 
+HIGH_CARDINALITY_TRANSFORMER = GapEncoder(n_components=30)
+LOW_CARDINALITY_TRANSFORMER = OneHotEncoder(
+    sparse_output=False,
+    handle_unknown="ignore",
+    drop="if_binary",
+)
+DATETIME_TRANSFORMER = DatetimeEncoder()
+
 
 def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> str | None:
     """Infer the date format of a date column,
@@ -146,10 +150,72 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se
     return ser
 
 
-Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"]
+def _clone_if_default(transformer, DEFAULT_TRANSFORMER):
+    return clone(transformer) if transformer is DEFAULT_TRANSFORMER else transformer
+
+
+def _clone_during_fit(transformer, remainder, n_jobs):
+    if isinstance(transformer, sklearn.base.TransformerMixin):
+        return _propagate_n_jobs(clone(transformer), n_jobs)
+    elif transformer is None:
+        return "passthrough"
+    elif transformer == "remainder":
+        return remainder if isinstance(remainder, str) else clone(remainder)
+    elif transformer == "passthrough":
+        return transformer
+    else:
+        raise ValueError(
+            "'transformer' must be an instance of sklearn.base.TransformerMixin, "
+            f"None, 'remainder' or 'passthrough'. Got {transformer=!r}."
+        )
+
+
+def _check_specific_transformers(specific_transformers, n_jobs):
+    if (specific_transformers is None) or len(specific_transformers) == 0:
+        return []
+    else:
+        first_item_length = len(specific_transformers[0])
+        # Check that all tuples have the same length
+        for idx, tuple_ in enumerate(specific_transformers):
+            if len(tuple_) != first_item_length:
+                raise TypeError(
+                    "Expected `specific_transformers` to be a list of "
+                    "tuples with all the same length, got length "
+                    f"{len(tuple_)} at index {idx} (elements at previous "
+                    f"indices have {first_item_length} in length). "
+                )
+        if first_item_length == 2:
+            # Unnamed assignments, transform to named
+            specific_transformers = _get_transformer_list(specific_transformers)
+        elif first_item_length == 3:
+            # Named assignments, no-op
+            pass
+        else:
+            raise TypeError(
+                "Expected `specific_transformers` to be a list of tuples "
+                "of length 2 or 3, got a list of tuples of length "
+                f"{first_item_length}. "
+            )
+
+        return [
+            (
+                (name, _propagate_n_jobs(clone(transformer), n_jobs), cols)
+                if isinstance(transformer, sklearn.base.TransformerMixin)
+                else (name, transformer, cols)
+            )
+            for name, transformer, cols in specific_transformers
+        ]
+
+
+def _propagate_n_jobs(transformer, n_jobs):
+    if n_jobs is not None and (
+        hasattr(transformer, "n_jobs") and transformer.n_jobs is None
+    ):
+        transformer.set_params(n_jobs=n_jobs)
+    return transformer
 
 
-class TableVectorizer(TransformerMixin, _BaseComposition):
+class TableVectorizer(TransformerMixin, BaseEstimator):
     """Automatically transform a heterogeneous dataframe to a numerical array.
 
     Easily transforms a heterogeneous data table
@@ -165,12 +231,12 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
         under this value, the low cardinality categorical features, and above or
         equal, the high cardinality categorical features.
         Different transformers will be applied to these two groups,
-        defined by the parameters `low_card_cat_transformer` and
-        `high_card_cat_transformer` respectively.
+        defined by the parameters `low_cardinality_transformer` and
+        `high_cardinality_transformer` respectively.
         Note: currently, missing values are counted as a single unique value
         (so they count in the cardinality).
 
-    low_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
         Transformer used on categorical/string features with low cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance (e.g. OneHotEncoder),
@@ -183,7 +249,7 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
-    high_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
         Transformer used on categorical/string features with high cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance
@@ -361,48 +427,36 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
 ('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])]
     """  # noqa: E501
 
-    transformers_: list[tuple[str, Transformer, list[str]]]
-    types_: dict[str, type]
-    imputed_columns_: list[str]
-    low_card_cat_transformer_: Transformer
-    high_card_cat_transformer_: Transformer
-    numerical_transformer_: Transformer
-    datetime_transformer_: Transformer
-    specific_transformers_: list[tuple[str, Transformer, list[str, int]]]
-
-    _transformer_to_input_indices: dict[str, list[int]]
-
-    # Override required parameters
-    _required_parameters = []
-
     def __init__(
         self,
         *,
-        cardinality_threshold: int = 40,
-        low_card_cat_transformer: Transformer | None = None,
-        high_card_cat_transformer: Transformer | None = None,
-        numerical_transformer: Transformer | None = None,
-        datetime_transformer: Transformer | None = None,
-        specific_transformers: list[
-            tuple[Transformer, list[str | int]]
-            | tuple[str, Transformer, list[str, int]]
-        ]
-        | None = None,
-        auto_cast: bool = True,
-        impute_missing: Literal["auto", "force", "skip"] = "auto",
+        cardinality_threshold=40,
+        low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER,
+        high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER,
+        datetime_transformer=DATETIME_TRANSFORMER,
+        numerical_transformer=None,
+        specific_transformers=None,
+        auto_cast=True,
+        impute_missing="auto",
         # The next parameters are inherited from ColumnTransformer
-        remainder: Literal["drop", "passthrough"] | TransformerMixin = "passthrough",
-        sparse_threshold: float = 0.0,
-        n_jobs: int = None,
+        remainder="passthrough",
+        sparse_threshold=0.0,
+        n_jobs=None,
         transformer_weights=None,
-        verbose: bool = False,
-        verbose_feature_names_out: bool = False,
+        verbose=False,
+        verbose_feature_names_out=False,
     ):
         self.cardinality_threshold = cardinality_threshold
-        self.low_card_cat_transformer = low_card_cat_transformer
-        self.high_card_cat_transformer = high_card_cat_transformer
+        self.low_cardinality_transformer = _clone_if_default(
+            low_cardinality_transformer, LOW_CARDINALITY_TRANSFORMER
+        )
+        self.high_cardinality_transformer = _clone_if_default(
+            high_cardinality_transformer, HIGH_CARDINALITY_TRANSFORMER
+        )
+        self.datetime_transformer = _clone_if_default(
+            datetime_transformer, DATETIME_TRANSFORMER
+        )
         self.numerical_transformer = numerical_transformer
-        self.datetime_transformer = datetime_transformer
         self.specific_transformers = specific_transformers
         self.auto_cast = auto_cast
         self.impute_missing = impute_missing
@@ -415,26 +469,7 @@ def __init__(
         self.verbose = verbose
         self.verbose_feature_names_out = verbose_feature_names_out
 
-    def _more_tags(self) -> dict:
-        """
-        Used internally by sklearn to ease the estimator checks.
-        """
-        return {
-            "X_types": ["2darray", "string"],
-            "allow_nan": [True],
-            "_xfail_checks": {
-                "check_complex_data": "Passthrough complex columns as-is.",
-            },
-        }
-
-    def _propagate_n_jobs(self, transformer):
-        if self.n_jobs is not None and (
-            hasattr(transformer, "n_jobs") and transformer.n_jobs is None
-        ):
-            transformer.set_params(n_jobs=self.n_jobs)
-        return transformer
-
-    def _clone_transformers(self) -> None:
+    def _clone_transformers(self):
         """
         For each of the different transformers that can be passed,
         create the corresponding variable name with a trailing underscore,
@@ -444,105 +479,23 @@ def _clone_transformers(self) -> None:
         Note: typos are not detected here, they are left in and are detected
         down the line in ColumnTransformer.fit_transform.
         """
-        if isinstance(self.low_card_cat_transformer, sklearn.base.TransformerMixin):
-            self.low_card_cat_transformer_ = clone(self.low_card_cat_transformer)
-        elif self.low_card_cat_transformer is None:
-            # sklearn is lenient and lets us use both
-            # `handle_unknown="infrequent_if_exist"` and `drop="if_binary"`
-            # at the same time
-            self.low_card_cat_transformer_ = OneHotEncoder(
-                drop="if_binary", handle_unknown="infrequent_if_exist"
-            )
-        elif self.low_card_cat_transformer == "remainder":
-            self.low_card_cat_transformer_ = (
-                self.remainder
-                if isinstance(self.remainder, str)
-                else clone(self.remainder)
-            )
-        else:
-            self.low_card_cat_transformer_ = self.low_card_cat_transformer
-        self._propagate_n_jobs(self.low_card_cat_transformer_)
-
-        if isinstance(self.high_card_cat_transformer, sklearn.base.TransformerMixin):
-            self.high_card_cat_transformer_ = clone(self.high_card_cat_transformer)
-        elif self.high_card_cat_transformer is None:
-            self.high_card_cat_transformer_ = GapEncoder(n_components=30)
-        elif self.high_card_cat_transformer == "remainder":
-            self.high_card_cat_transformer_ = (
-                self.remainder
-                if isinstance(self.remainder, str)
-                else clone(self.remainder)
-            )
-        else:
-            self.high_card_cat_transformer_ = self.high_card_cat_transformer
-        self._propagate_n_jobs(self.high_card_cat_transformer_)
-
-        if isinstance(self.numerical_transformer, sklearn.base.TransformerMixin):
-            self.numerical_transformer_ = clone(self.numerical_transformer)
-        elif self.numerical_transformer is None:
-            self.numerical_transformer_ = "passthrough"
-        elif self.numerical_transformer == "remainder":
-            self.numerical_transformer_ = (
-                self.remainder
-                if isinstance(self.remainder, str)
-                else clone(self.remainder)
-            )
-        else:
-            self.numerical_transformer_ = self.numerical_transformer
-        self._propagate_n_jobs(self.numerical_transformer_)
-
-        if isinstance(self.datetime_transformer, sklearn.base.TransformerMixin):
-            self.datetime_transformer_ = clone(self.datetime_transformer)
-        elif self.datetime_transformer is None:
-            self.datetime_transformer_ = DatetimeEncoder()
-        elif self.datetime_transformer == "remainder":
-            self.datetime_transformer_ = (
-                self.remainder
-                if isinstance(self.remainder, str)
-                else clone(self.remainder)
+        for transformer_name in [
+            "high_cardinality_transformer",
+            "low_cardinality_transformer",
+            "datetime_transformer",
+            "numerical_transformer",
+        ]:
+            transformer = _clone_during_fit(
+                getattr(self, transformer_name),
+                remainder=self.remainder,
+                n_jobs=self.n_jobs,
             )
-        else:
-            self.datetime_transformer_ = self.datetime_transformer
-        self._propagate_n_jobs(self.datetime_transformer_)
-
-        if (self.specific_transformers is None) or len(self.specific_transformers) == 0:
-            self.specific_transformers_ = []
-        else:
-            first_item_length = len(self.specific_transformers[0])
-            # Check all tuples are the same length
-            for i, tup in enumerate(self.specific_transformers):
-                if len(tup) != first_item_length:
-                    raise TypeError(
-                        "Expected `specific_transformers` to be a list of "
-                        "tuples with all the same length, got length "
-                        f"{len(tup)} at index {i} (elements at previous "
-                        f"indices have {first_item_length} in length). "
-                    )
-            if first_item_length == 2:
-                # Unnamed assignments, transform to named
-                named_specific_transformers = _get_transformer_list(
-                    self.specific_transformers
-                )
-            elif first_item_length == 3:
-                # Named assignments
-                named_specific_transformers = self.specific_transformers
-            else:
-                raise TypeError(
-                    "Expected `specific_transformers` to be a list of tuples "
-                    "of length 2 or 3, got a list of tuples of length "
-                    f"{first_item_length}. "
-                )
+            setattr(self, f"{transformer_name}_", transformer)
 
-            self.specific_transformers_ = [
-                (
-                    (name, self._propagate_n_jobs(clone(transformer)), cols)
-                    if isinstance(transformer, sklearn.base.TransformerMixin)
-                    else (name, transformer, cols)
-                )
-                for name, transformer, cols in named_specific_transformers
-            ]
-
-        # TODO: check that the provided transformers are valid
+        self.specific_transformers_ = _check_specific_transformers(
+            self.specific_transformers,
+            self.n_jobs,
+        )
 
     def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame:
         """Takes a dataframe and tries to convert its columns to their best possible
@@ -742,7 +695,7 @@ def _check_X(self, X):
             )
         return X
 
-    def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer":
+    def fit(self, X, y=None):
         """Fit all transformers using X.
 
         Parameters
@@ -764,7 +717,7 @@ def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer":
         self.fit_transform(X, y=y)
         return self
 
-    def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike:
+    def fit_transform(self, X, y=None):
         """Fit all transformers, transform the data, and concatenate the results.
 
         In practice, it (1) converts features to their best possible types
@@ -850,11 +803,15 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike:
 
         # Next part: construct the transformers
         # Create the list of all the transformers.
-        all_transformers: list[tuple[str, Transformer, list[str]]] = [
+        all_transformers = [
             ("numeric", self.numerical_transformer_, numeric_columns),
             ("datetime", self.datetime_transformer_, datetime_columns),
-            ("low_card_cat", self.low_card_cat_transformer_, low_card_cat_columns),
-            ("high_card_cat", self.high_card_cat_transformer_, high_card_cat_columns),
+            ("low_card_cat", self.low_cardinality_transformer_, low_card_cat_columns),
+            (
+                "high_card_cat",
+                self.high_cardinality_transformer_,
+                high_card_cat_columns,
+            ),
             *self.specific_transformers_,
         ]
         # We will now filter this list, by keeping only the ones with:
@@ -902,7 +859,7 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike:
 
         return X_enc
 
-    def transform(self, X: ArrayLike) -> ArrayLike:
+    def transform(self, X):
         """Transform `X` by applying the fitted transformers on the columns.
 
         Parameters
@@ -993,9 +950,14 @@ def output_indices_(self) -> dict[str, slice]:
         """
         return self._column_transformer.output_indices_
 
-
-@deprecated("Use TableVectorizer instead.")
-class SuperVectorizer(TableVectorizer):
-    """Deprecated name of TableVectorizer."""
-
-    pass
+    def _more_tags(self) -> dict:
+        """
+        Used internally by sklearn to ease the estimator checks.
+        """
+        return {
+            "X_types": ["2darray", "string"],
+            "allow_nan": [True],
+            "_xfail_checks": {
+                "check_complex_data": "Passthrough complex columns as-is.",
+            },
+        }
diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py
index ffe41155b..9830387a1 100644
--- a/skrub/tests/test_table_vectorizer.py
+++ b/skrub/tests/test_table_vectorizer.py
@@ -7,7 +7,7 @@
 from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel
 from sklearn.utils.validation import check_is_fitted
 
-from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer
+from skrub import GapEncoder, MinHashEncoder, TableVectorizer
 from skrub._table_vectorizer import _infer_date_format
 from skrub.tests.utils import transformers_list_equal
 
@@ -189,7 +189,7 @@ def _test_possibilities(X) -> None:
     vectorizer_base = TableVectorizer(
         cardinality_threshold=4,
         # we must have n_samples = 5 >= n_components
-        high_card_cat_transformer=GapEncoder(n_components=2),
+        high_cardinality_transformer=GapEncoder(n_components=2),
         numerical_transformer=StandardScaler(),
     )
     # Warning: order-dependant
@@ -234,7 +234,7 @@ def _test_possibilities(X) -> None:
     vectorizer_cast = TableVectorizer(
         cardinality_threshold=4,
         # we must have n_samples = 5 >= n_components
-        high_card_cat_transformer=GapEncoder(n_components=2),
+        high_cardinality_transformer=GapEncoder(n_components=2),
         numerical_transformer=StandardScaler(),
     )
     X_str = X.astype("object")
@@ -357,7 +357,7 @@ def test_with_arrays() -> None:
     vectorizer = TableVectorizer(
         cardinality_threshold=4,
         # we must have n_samples = 5 >= n_components
-        high_card_cat_transformer=GapEncoder(n_components=2),
+        high_cardinality_transformer=GapEncoder(n_components=2),
         numerical_transformer=StandardScaler(),
     )
 
@@ -377,23 +377,28 @@ def test_get_feature_names_out() -> None:
     vec_w_pass.fit(X)
 
     # In this test, order matters. If it doesn't, convert to set.
-    expected_feature_names_pass = [
-        "int",
-        "float",
-        "str1_public",
-        "str2_chef",
-        "str2_lawyer",
-        "str2_manager",
-        "str2_officer",
-        "str2_teacher",
-        "cat1_yes",
-        "cat2_20K+",
-        "cat2_30K+",
-        "cat2_40K+",
-        "cat2_50K+",
-        "cat2_60K+",
-    ]
-    assert vec_w_pass.get_feature_names_out().tolist() == expected_feature_names_pass
+    expected_feature_names_pass = np.array(
+        [
+            "int",
+            "float",
+            "str1_public",
+            "str2_chef",
+            "str2_lawyer",
+            "str2_manager",
+            "str2_officer",
+            "str2_teacher",
+            "cat1_yes",
+            "cat2_20K+",
+            "cat2_30K+",
+            "cat2_40K+",
+            "cat2_50K+",
+            "cat2_60K+",
+        ]
+    )
+    assert_array_equal(
+        vec_w_pass.get_feature_names_out(),
+        expected_feature_names_pass,
+    )
 
     vec_w_drop = TableVectorizer(remainder="drop")
     vec_w_drop.fit(X)
@@ -477,8 +482,8 @@ def test_passthrough() -> None:
     X_clean = _get_clean_dataframe()
 
     tv = TableVectorizer(
-        low_card_cat_transformer="passthrough",
-        high_card_cat_transformer="passthrough",
+        low_cardinality_transformer="passthrough",
+        high_cardinality_transformer="passthrough",
         datetime_transformer="passthrough",
         numerical_transformer="passthrough",
         impute_missing="skip",
@@ -514,12 +519,6 @@ def test_check_fitted_table_vectorizer() -> None:
     tv.transform(X)
 
 
-def test_check_name_change() -> None:
-    """Test that using SuperVectorizer raises a deprecation warning"""
-    with pytest.warns(FutureWarning):
-        SuperVectorizer()
-
-
 def test_handle_unknown() -> None:
     """
     Test that new categories encountered in the test set
@@ -729,7 +728,7 @@ def test_specific_transformers_unexpected_behavior():
             ],
         ),
         TableVectorizer(
-            low_card_cat_transformer=MinHashEncoder(),
+            low_cardinality_transformer=MinHashEncoder(),
         ),
     ],
 )
@@ -869,13 +868,13 @@ def test_column_by_column() -> None:
     # when applied column by column
     X = _get_clean_dataframe()
     table_vec_all_cols = TableVectorizer(
-        high_card_cat_transformer=GapEncoder(n_components=2, random_state=0),
+        high_cardinality_transformer=GapEncoder(n_components=2, random_state=0),
         cardinality_threshold=4,
     )
     table_vec_all_cols.fit(X)
     for col in X.columns:
         table_vec_one_col = TableVectorizer(
-            high_card_cat_transformer=GapEncoder(n_components=2, random_state=0),
+            high_cardinality_transformer=GapEncoder(n_components=2, random_state=0),
             cardinality_threshold=4,
         )
         table_vec_one_col.fit(X[[col]])
@@ -898,7 +897,7 @@ def test_column_by_column() -> None:
 
 @skip_if_no_parallel
 @pytest.mark.parametrize(
-    "high_card_cat_transformer",
+    "high_cardinality_transformer",
     # the gap encoder and the minhashencoder
     # should be parallelized on all columns
     # the one hot encoder should not be parallelized
@@ -908,11 +907,11 @@ def test_column_by_column() -> None:
         MinHashEncoder(n_components=2),
     ],
 )
-def test_parallelism(high_card_cat_transformer) -> None:
+def test_parallelism(high_cardinality_transformer) -> None:
     # Test that parallelism works
     X = _get_clean_dataframe()
     table_vec_no_parallel = TableVectorizer(
-        high_card_cat_transformer=high_card_cat_transformer,
+        high_cardinality_transformer=high_cardinality_transformer,
         cardinality_threshold=4,
     )
     X_trans = table_vec_no_parallel.fit_transform(X)
@@ -920,7 +919,7 @@ def test_parallelism(high_card_cat_transformer) -> None:
         for n_jobs in [None, 2, -1]:
             table_vec = TableVectorizer(
                 n_jobs=n_jobs,
-                high_card_cat_transformer=high_card_cat_transformer,
+                high_cardinality_transformer=high_cardinality_transformer,
                 cardinality_threshold=4,
             )
             X_trans_parallel = table_vec.fit_transform(X)
@@ -973,7 +972,7 @@ def __init__(self, n_jobs=None):
 
     table_vectorizer = TableVectorizer(
         numerical_transformer=DummyTransformerWithJobs(n_jobs=None),
-        low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None),
+        low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None),
         n_jobs=None,
     ).fit(X)
     assert table_vectorizer.named_transformers_["numeric"].n_jobs is None
@@ -981,7 +980,7 @@ def __init__(self, n_jobs=None):
 
     table_vectorizer = TableVectorizer(
         numerical_transformer=DummyTransformerWithJobs(n_jobs=2),
-        low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None),
+        low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None),
         n_jobs=None,
     ).fit(X)
     assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2
@@ -991,7 +990,7 @@ def __init__(self, n_jobs=None):
     # when the underlying transformer `n_jobs` is not set explicitly.
     table_vectorizer = TableVectorizer(
         numerical_transformer=DummyTransformerWithJobs(n_jobs=None),
-        low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None),
+        low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None),
         n_jobs=2,
     ).fit(X)
     assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2
@@ -1001,7 +1000,7 @@ def __init__(self, n_jobs=None):
     # when the underlying transformer `n_jobs` is set explicitly.
     table_vectorizer = TableVectorizer(
         numerical_transformer=DummyTransformerWithJobs(n_jobs=4),
-        low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None),
+        low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None),
         n_jobs=2,
     ).fit(X)
     assert table_vectorizer.named_transformers_["numeric"].n_jobs == 4
@@ -1015,14 +1014,14 @@ def test_table_vectorizer_remainder_cloning():
     df = pd.concat([df1, df2], axis=1)
     remainder = FunctionTransformer()
     table_vectorizer = TableVectorizer(
-        low_card_cat_transformer="remainder",
-        high_card_cat_transformer="remainder",
+        low_cardinality_transformer="remainder",
+        high_cardinality_transformer="remainder",
         numerical_transformer="remainder",
         datetime_transformer="remainder",
         remainder=remainder,
     ).fit(df)
-    assert table_vectorizer.low_card_cat_transformer_ is not remainder
-    assert table_vectorizer.high_card_cat_transformer_ is not remainder
+    assert table_vectorizer.low_cardinality_transformer_ is not remainder
+    assert table_vectorizer.high_cardinality_transformer_ is not remainder
     assert table_vectorizer.numerical_transformer_ is not remainder
     assert table_vectorizer.datetime_transformer_ is not remainder
 

From db14b9b5af66c2d28668d037b7bb478add42e518 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Thu, 2 Nov 2023 18:23:31 +0100
Subject: [PATCH 2/9] fix docstring

---
 skrub/_table_vectorizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index c804b0571..72b5637bd 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -433,8 +433,8 @@ def __init__(
         cardinality_threshold=40,
         low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER,
         high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER,
-        datetime_transformer=DATETIME_TRANSFORMER,
         numerical_transformer=None,
+        datetime_transformer=DATETIME_TRANSFORMER,
         specific_transformers=None,
         auto_cast=True,
         impute_missing="auto",

From 22f681decb629ea6a56996c60e1362e6eb3ca902 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Sat, 4 Nov 2023 12:17:04 +0100
Subject: [PATCH 3/9] fix docstring

---
 skrub/_table_vectorizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 7e2b6f6f1..6bc158b22 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -421,7 +421,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     >>> tv.transformers_
     [('numeric', 'passthrough', ['year_first_hired']), \
 ('datetime', DatetimeEncoder(), ['date_first_hired']), \
-('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist'), \
+('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore', \
+sparse_output=False), \
 ['gender', 'department', 'department_name', 'assignment_category']), \
 ('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])]
     """  # noqa: E501

From a62785153de3270d0b6097612f7539ce685f62f9 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Sat, 4 Nov 2023 12:22:02 +0100
Subject: [PATCH 4/9] remove pytest exception

---
 skrub/_table_vectorizer.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 6bc158b22..273047568 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -236,7 +236,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         Note: currently, missing values are counted as a single unique value
         (so they count in the cardinality).
 
-    low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
+        or Transformer, optional
         Transformer used on categorical/string features with low cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance (e.g. OneHotEncoder),
@@ -249,7 +250,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
-    high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
+        or Transformer, optional
         Transformer used on categorical/string features with high cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance
@@ -261,7 +263,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
-    numerical_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    numerical_transformer : {'drop', 'remainder', 'passthrough'} \
+        or Transformer, optional
         Transformer used on numerical features.
         Can either be a transformer object instance (e.g. StandardScaler),
         a Pipeline containing the preprocessing steps,
@@ -283,7 +286,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         Features classified under this category are not imputed at all
         (regardless of `impute_missing`).
 
-    specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} or Transformer, list of str or int), optional
+    specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or \
+        Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} \
+            or Transformer, list of str or int), optional
         On top of the default column type classification (see parameters above),
         this parameter allows you to manually specify transformers for
         specific columns.
@@ -382,7 +387,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     See Also
     --------
     GapEncoder :
-        Encodes dirty categories (strings) by constructing latent topics with continuous encoding.
+        Encodes dirty categories (strings) by constructing latent topics with \
+            continuous encoding.
     MinHashEncoder :
         Encode string columns as a numeric array with the minhash method.
     SimilarityEncoder :
@@ -425,7 +431,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
 sparse_output=False), \
 ['gender', 'department', 'department_name', 'assignment_category']), \
 ('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])]
-    """  # noqa: E501
+    """
 
     def __init__(
         self,

From c3f56364c7fe7bf821eee9d5798ce99b34db8224 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Sat, 4 Nov 2023 12:57:17 +0100
Subject: [PATCH 5/9] add changes

---
 CHANGES.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CHANGES.rst b/CHANGES.rst
index 5d547cfd6..28dac1370 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -15,6 +15,10 @@ development and backward compatibility is not ensured.
 Major changes
 -------------
 
+* Pipelines including :class:`TableVectorizer` can now be grid-searched, since
+  we can now call `set_params` on the default transformers of :class:`TableVectorizer`.
+  :pr:`814` by :user:`Vincent Maladiere <Vincent-Maladiere>`
+
 * Some parameters of :class:`Joiner` have changed. The goal is to harmonize
   parameters across all estimator that perform join(-like) operations, as
   discussed in `#751 <https://github.com/skrub-data/skrub/discussions/751>`_.

From 28e4c0d3df3390c4b3769c796137d463071261bd Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Mon, 6 Nov 2023 20:10:04 +0100
Subject: [PATCH 6/9] Update skrub/_table_vectorizer.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Jérôme Dockès <jerome@dockes.org>
---
 skrub/_table_vectorizer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 273047568..566d26346 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -150,8 +150,8 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se
     return ser
 
 
-def _clone_if_default(transformer, DEFAULT_TRANSFORMER):
-    return clone(transformer) if transformer is DEFAULT_TRANSFORMER else transformer
+def _clone_if_default(transformer, default_transformer):
+    return clone(transformer) if transformer is default_transformer else transformer
 
 
 def _clone_during_fit(transformer, remainder, n_jobs):

From 8d5c639deeac624123402b459bdc1f08b275be1e Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 8 Nov 2023 11:10:50 +0100
Subject: [PATCH 7/9] remove the None conversion to passthrough

---
 skrub/_table_vectorizer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 566d26346..dc844eb8c 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -157,8 +157,6 @@ def _clone_if_default(transformer, default_transformer):
 def _clone_during_fit(transformer, remainder, n_jobs):
     if isinstance(transformer, sklearn.base.TransformerMixin):
         return _propagate_n_jobs(clone(transformer), n_jobs)
-    elif transformer is None:
-        return "passthrough"
     elif transformer == "remainder":
         return remainder if isinstance(remainder, str) else clone(remainder)
     elif transformer == "passthrough":
@@ -166,7 +164,7 @@ def _clone_during_fit(transformer, remainder, n_jobs):
     else:
         raise ValueError(
             "'transformer' must be an instance of sklearn.base.TransformerMixin, "
-            f"None, 'remainder' or 'passthrough'. Got {transformer=!r}."
+            f"'remainder' or 'passthrough'. Got {transformer=!r}."
         )
 
 

From cebd9cbc1a64df05f03ff02e121a003c0092d746 Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Wed, 8 Nov 2023 11:20:57 +0100
Subject: [PATCH 8/9] add passthrough as default for numerical_transformer

---
 skrub/_table_vectorizer.py | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index dc844eb8c..7832ca0e1 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -217,10 +217,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     """Automatically transform a heterogeneous dataframe to a numerical array.
 
     Easily transforms a heterogeneous data table
-    (such as a :obj:`~pandas.DataFrame`) to a numerical array for machine
-    learning. For this it transforms each column depending on its data type.
-    It provides a simplified interface for the ColumnTransformer ;
-    more documentation of attributes and functions are available in its doc.
+    (such as a :obj:`pandas.DataFrame`) to a numerical array for machine
+    learning. To do so, the TableVectorizer transforms each column depending
+    on its data type.
 
     Parameters
     ----------
@@ -242,9 +241,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer
-        (OneHotEncoder(handle_unknown="ignore", drop="if_binary")).
+        or 'passthrough' to return the unencoded columns.
+        The default transformer is \
+            (OneHotEncoder(handle_unknown="ignore", drop="if_binary")).
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
@@ -256,8 +255,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         (e.g. GapEncoder), a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer (GapEncoder(n_components=30)).
+        or 'passthrough' to return the unencoded columns.
+        The default transformer is (GapEncoder(n_components=30)).
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
@@ -268,8 +267,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
         a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer (here nothing, so 'passthrough').
+        or 'passthrough' to return the unencoded columns (default).
         Features classified under this category are not imputed at all
         (regardless of `impute_missing`).
 
@@ -339,8 +337,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     n_jobs : int, default=None
         Number of jobs to run in parallel. This number of jobs will be dispatched to
         the underlying transformers, if those support parallelization and they do not
-        set specifically `n_jobs`.
-        ``None`` (the default) means 1 unless in a :fund:`joblib.parallel_config`
+        set specifically ``n_jobs``.
+        ``None`` (the default) means 1 unless in a :func:`joblib.parallel_config`
         context. ``-1`` means using all processors.
 
     transformer_weights : dict, default=None
@@ -396,12 +394,12 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
     -----
     The column order of the input data is not guaranteed to be the same
     as the output data (returned by TableVectorizer.transform).
-    This is a due to the way the ColumnTransformer works.
+    This is due to the way the underlying ColumnTransformer works.
     However, the output column order will always be the same for different
-    calls to TableVectorizer.transform on a same fitted TableVectorizer instance.
+    calls to ``TableVectorizer.transform`` on the same fitted TableVectorizer instance.
     For example, if input data has columns ['name', 'job', 'year'], then output
     columns might be shuffled, e.g. ['job', 'year', 'name'], but every call
-    to TableVectorizer.transform on this instance will return this order.
+    to ``TableVectorizer.transform`` on this instance will return this order.
 
     Examples
     --------
@@ -437,7 +435,7 @@ def __init__(
         cardinality_threshold=40,
         low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER,
         high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER,
-        numerical_transformer=None,
+        numerical_transformer="passthrough",
         datetime_transformer=DATETIME_TRANSFORMER,
         specific_transformers=None,
         auto_cast=True,

From a0e99780c444c7fc1941c3ca86b3626747b2081d Mon Sep 17 00:00:00 2001
From: Vincent M <maladiere.vincent@yahoo.fr>
Date: Thu, 9 Nov 2023 11:53:00 +0100
Subject: [PATCH 9/9] fix precommit

---
 CHANGES.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 116a67b26..c1cd568d4 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -18,7 +18,7 @@ Major changes
 * Pipelines including :class:`TableVectorizer` can now be grid-searched, since
   we can now call `set_params` on the default transformers of :class:`TableVectorizer`.
   :pr:`814` by :user:`Vincent Maladiere <Vincent-Maladiere>`
-  
+
 * :func:`to_datetime` is now available to support pandas.to_datetime
   over dataframes and 2d arrays.
   :pr:`784` by :user:`Vincent Maladiere <Vincent-Maladiere>`