From 4b11e6260b01fdbcaf9059601fe3b318712bc9e3 Mon Sep 17 00:00:00 2001
From: Vincent M
Date: Thu, 9 Nov 2023 16:51:56 +0100
Subject: [PATCH] [ENH] Enable Grid-Search for `TableVectorizer` (#814)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* apply global sub-estimator default parameter

* fix docstring

* fix docstring

* remove pytest exception

* add changes

* Update skrub/_table_vectorizer.py

Co-authored-by: Jérôme Dockès

* remove the None conversion to passthrough

* add passthrough as default for numerical_transformer

* fix precommit

---------

Co-authored-by: Jérôme Dockès
---
 CHANGES.rst                          |   4 +
 skrub/__init__.py                    |   3 +-
 skrub/_table_vectorizer.py           | 355 ++++++++++++---------------
 skrub/tests/test_table_vectorizer.py |  87 ++++---
 4 files changed, 208 insertions(+), 241 deletions(-)

diff --git a/CHANGES.rst b/CHANGES.rst
index 4d6a29eb8..c1cd568d4 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -15,6 +15,10 @@ development and backward compatibility is not ensured.
 Major changes
 -------------
 
+* Pipelines including :class:`TableVectorizer` can now be grid-searched, since
+  we can now call `set_params` on the default transformers of :class:`TableVectorizer`.
+  :pr:`814` by :user:`Vincent Maladiere `
+
 * :func:`to_datetime` is now available to support pandas.to_datetime
   over dataframes and 2d arrays.
   :pr:`784` by :user:`Vincent Maladiere `
 
diff --git a/skrub/__init__.py b/skrub/__init__.py
index a55cc134a..e2fccecff 100644
--- a/skrub/__init__.py
+++ b/skrub/__init__.py
@@ -13,7 +13,7 @@ from ._minhash_encoder import MinHashEncoder
 from ._select_cols import DropCols, SelectCols
 from ._similarity_encoder import SimilarityEncoder
-from ._table_vectorizer import SuperVectorizer, TableVectorizer
+from ._table_vectorizer import TableVectorizer
 from ._target_encoder import TargetEncoder
 
 check_dependencies()
@@ -29,7 +29,6 @@
     "GapEncoder",
     "MinHashEncoder",
     "SimilarityEncoder",
-    "SuperVectorizer",
     "TableVectorizer",
     "TargetEncoder",
     "deduplicate",
diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py
index 535819a18..7832ca0e1 100644
--- a/skrub/_table_vectorizer.py
+++ b/skrub/_table_vectorizer.py
@@ -6,27 +6,31 @@
 import warnings
 from collections import Counter
-from typing import Literal
 
 import numpy as np
 import pandas as pd
 import sklearn
-from numpy.typing import ArrayLike
 from pandas._libs.tslibs.parsing import guess_datetime_format
 from pandas.core.dtypes.base import ExtensionDtype
 from scipy import sparse
-from sklearn.base import TransformerMixin, clone
+from sklearn.base import BaseEstimator, TransformerMixin, clone
 from sklearn.compose import ColumnTransformer
 from sklearn.compose._column_transformer import _get_transformer_list
 from sklearn.preprocessing import OneHotEncoder
 from sklearn.utils import Bunch
-from sklearn.utils.deprecation import deprecated
-from sklearn.utils.metaestimators import _BaseComposition
 from sklearn.utils.validation import check_is_fitted
 
 from skrub import DatetimeEncoder, GapEncoder
 from skrub._utils import parse_astype_error_message
 
+HIGH_CARDINALITY_TRANSFORMER = GapEncoder(n_components=30)
+LOW_CARDINALITY_TRANSFORMER = OneHotEncoder(
+    sparse_output=False,
+    handle_unknown="ignore",
+    drop="if_binary",
+)
+DATETIME_TRANSFORMER = DatetimeEncoder()
+
 
 def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> str | None:
     """Infer the date format of a date column,
@@ -146,17 +150,76 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se
     return ser
 
 
-Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"]
+def _clone_if_default(transformer, default_transformer):
+    return clone(transformer) if transformer is default_transformer else transformer
+
+
+def _clone_during_fit(transformer, remainder, n_jobs):
+    if isinstance(transformer, sklearn.base.TransformerMixin):
+        return _propagate_n_jobs(clone(transformer), n_jobs)
+    elif transformer == "remainder":
+        return remainder if isinstance(remainder, str) else clone(remainder)
+    elif transformer == "passthrough":
+        return transformer
+    else:
+        raise ValueError(
+            "'transformer' must be an instance of sklearn.base.TransformerMixin, "
+            f"'remainder' or 'passthrough'. Got {transformer=!r}."
+        )
+
+
+def _check_specific_transformers(specific_transformers, n_jobs):
+    if (specific_transformers is None) or len(specific_transformers) == 0:
+        return []
+    else:
+        first_item_length = len(specific_transformers[0])
+        # Check that all tuples have the same length
+        for idx, tuple_ in enumerate(specific_transformers):
+            if len(tuple_) != first_item_length:
+                raise TypeError(
+                    "Expected `specific_transformers` to be a list of "
+                    "tuples with all the same length, got length "
+                    f"{len(tuple_)} at index {idx} (elements at previous "
+                    f"indices have length {first_item_length})."
+                )
+        if first_item_length == 2:
+            # Unnamed assignments, transform to named
+            specific_transformers = _get_transformer_list(specific_transformers)
+        elif first_item_length == 3:
+            # Named assignments, no-op
+            pass
+        else:
+            raise TypeError(
+                "Expected `specific_transformers` to be a list of tuples "
+                "of length 2 or 3, got a list of tuples of length "
+                f"{first_item_length}."
+            )
+
+        return [
+            (
+                (name, _propagate_n_jobs(clone(transformer), n_jobs), cols)
+                if isinstance(transformer, sklearn.base.TransformerMixin)
+                else (name, transformer, cols)
+            )
+            for name, transformer, cols in specific_transformers
+        ]
+
+
+def _propagate_n_jobs(transformer, n_jobs):
+    if n_jobs is not None and (
+        hasattr(transformer, "n_jobs") and transformer.n_jobs is None
+    ):
+        transformer.set_params(n_jobs=n_jobs)
+    return transformer
 
 
-class TableVectorizer(TransformerMixin, _BaseComposition):
+class TableVectorizer(TransformerMixin, BaseEstimator):
     """Automatically transform a heterogeneous dataframe to a numerical array.
 
     Easily transforms a heterogeneous data table
-    (such as a :obj:`~pandas.DataFrame`) to a numerical array for machine
-    learning. For this it transforms each column depending on its data type.
-    It provides a simplified interface for the ColumnTransformer ;
-    more documentation of attributes and functions are available in its doc.
+    (such as a :obj:`pandas.DataFrame`) to a numerical array for machine
+    learning. To do so, the TableVectorizer transforms each column depending
+    on its data type.
 
     Parameters
     ----------
@@ -165,44 +228,46 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
         under this value, the low cardinality categorical features, and above or
         equal, the high cardinality categorical features.
         Different transformers will be applied to these two groups,
-        defined by the parameters `low_card_cat_transformer` and
-        `high_card_cat_transformer` respectively.
+        defined by the parameters `low_cardinality_transformer` and
+        `high_cardinality_transformer` respectively.
         Note: currently, missing values are counted as a single unique value
         (so they count in the cardinality).
-    low_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
+or Transformer, optional
         Transformer used on categorical/string features with low cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance (e.g. OneHotEncoder),
         a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer
-        (OneHotEncoder(handle_unknown="ignore", drop="if_binary")).
+        'passthrough' to return the unencoded columns.
+        The default transformer is \
+        OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="if_binary").
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
-    high_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \
+        or Transformer, optional
         Transformer used on categorical/string features with high cardinality
         (threshold is defined by `cardinality_threshold`).
         Can either be a transformer object instance
         (e.g. GapEncoder), a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer (GapEncoder(n_components=30)).
+        or 'passthrough' to return the unencoded columns.
+        The default transformer is GapEncoder(n_components=30).
         Features classified under this category are imputed based on the
         strategy defined with `impute_missing`.
 
-    numerical_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional
+    numerical_transformer : {'drop', 'remainder', 'passthrough'} \
+        or Transformer, optional
         Transformer used on numerical features.
         Can either be a transformer object instance (e.g. StandardScaler),
         a Pipeline containing the preprocessing steps,
         'drop' for dropping the columns,
         'remainder' for applying `remainder`,
-        'passthrough' to return the unencoded columns,
-        or `None` to use the default transformer (here nothing, so 'passthrough').
+        or 'passthrough' to return the unencoded columns (default).
         Features classified under this category are not imputed at all
         (regardless of `impute_missing`).
 
@@ -217,7 +282,9 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
         Features classified under this category are not imputed at all
         (regardless of `impute_missing`).
 
-    specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} or Transformer, list of str or int), optional
+    specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or \
+        Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} \
+        or Transformer, list of str or int), optional
         On top of the default column type classification (see parameters above),
         this parameter allows you to manually specify transformers for
         specific columns.
@@ -270,8 +337,8 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     n_jobs : int, default=None
         Number of jobs to run in parallel. This number of jobs will be dispatched to
         the underlying transformers, if those support parallelization and they do not
-        set specifically `n_jobs`.
-        ``None`` (the default) means 1 unless in a :fund:`joblib.parallel_config`
+        set specifically ``n_jobs``.
+        ``None`` (the default) means 1 unless in a :func:`joblib.parallel_config`
         context. ``-1`` means using all processors.
 
     transformer_weights : dict, default=None
@@ -316,7 +383,8 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     See Also
     --------
     GapEncoder :
-        Encodes dirty categories (strings) by constructing latent topics with continuous encoding.
+        Encodes dirty categories (strings) by constructing latent topics with \
+        continuous encoding.
     MinHashEncoder :
         Encode string columns as a numeric array with the minhash method.
     SimilarityEncoder :
@@ -326,12 +394,12 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     -----
     The column order of the input data is not guaranteed to be the same as the
     output data (returned by TableVectorizer.transform).
-    This is a due to the way the ColumnTransformer works.
+    This is due to the way the underlying ColumnTransformer works.
     However, the output column order will always be the same for different
-    calls to TableVectorizer.transform on a same fitted TableVectorizer instance.
+    calls to ``TableVectorizer.transform`` on the same fitted TableVectorizer instance.
     For example, if input data has columns ['name', 'job', 'year'], then output
     columns might be shuffled, e.g. ['job', 'year', 'name'], but every call
-    to TableVectorizer.transform on this instance will return this order.
+    to ``TableVectorizer.transform`` on this instance will return this order.
 
     Examples
     --------
@@ -355,53 +423,42 @@ class TableVectorizer(TransformerMixin, _BaseComposition):
     >>> tv.transformers_
     [('numeric', 'passthrough', ['year_first_hired']), \
('datetime', DatetimeEncoder(), ['date_first_hired']), \
-('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist'), \
+('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore', \
+sparse_output=False), \
['gender', 'department', 'department_name', 'assignment_category']), \
('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])]
-    """  # noqa: E501
-
-    transformers_: list[tuple[str, Transformer, list[str]]]
-    types_: dict[str, type]
-    imputed_columns_: list[str]
-    low_card_cat_transformer_: Transformer
-    high_card_cat_transformer_: Transformer
-    numerical_transformer_: Transformer
-    datetime_transformer_: Transformer
-    specific_transformers_: list[tuple[str, Transformer, list[str, int]]]
-
-    _transformer_to_input_indices: dict[str, list[int]]
-
-    # Override required parameters
-    _required_parameters = []
+    """
 
     def __init__(
         self,
         *,
-        cardinality_threshold: int = 40,
-        low_card_cat_transformer: Transformer | None = None,
-        high_card_cat_transformer: Transformer | None = None,
-        numerical_transformer: Transformer | None = None,
-        datetime_transformer: Transformer | None = None,
-        specific_transformers: list[
-            tuple[Transformer, list[str | int]]
-            | tuple[str, Transformer, list[str, int]]
-        ]
-        | None = None,
-        auto_cast: bool = True,
-        impute_missing: Literal["auto", "force", "skip"] = "auto",
+        cardinality_threshold=40,
+        low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER,
+        high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER,
+        numerical_transformer="passthrough",
+        datetime_transformer=DATETIME_TRANSFORMER,
+        specific_transformers=None,
+        auto_cast=True,
+        impute_missing="auto",
         # The next parameters are inherited from ColumnTransformer
-        remainder: Literal["drop", "passthrough"] | TransformerMixin = "passthrough",
-        sparse_threshold: float = 0.0,
-        n_jobs: int = None,
+        remainder="passthrough",
+        sparse_threshold=0.0,
+        n_jobs=None,
         transformer_weights=None,
-        verbose: bool = False,
-        verbose_feature_names_out: bool = False,
+        verbose=False,
+        verbose_feature_names_out=False,
     ):
         self.cardinality_threshold = cardinality_threshold
-        self.low_card_cat_transformer = low_card_cat_transformer
-        self.high_card_cat_transformer = high_card_cat_transformer
+        self.low_cardinality_transformer = _clone_if_default(
+            low_cardinality_transformer, LOW_CARDINALITY_TRANSFORMER
+        )
+        self.high_cardinality_transformer = _clone_if_default(
+            high_cardinality_transformer, HIGH_CARDINALITY_TRANSFORMER
+        )
+        self.datetime_transformer = _clone_if_default(
+            datetime_transformer, DATETIME_TRANSFORMER
+        )
         self.numerical_transformer = numerical_transformer
-        self.datetime_transformer = datetime_transformer
         self.specific_transformers = specific_transformers
         self.auto_cast = auto_cast
         self.impute_missing = impute_missing
@@ -414,26 +471,7 @@ def __init__(
         self.verbose = verbose
         self.verbose_feature_names_out = verbose_feature_names_out
 
-    def _more_tags(self) -> dict:
-        """
-        Used internally by sklearn to ease the estimator checks.
-        """
-        return {
-            "X_types": ["2darray", "string"],
-            "allow_nan": [True],
-            "_xfail_checks": {
-                "check_complex_data": "Passthrough complex columns as-is.",
-            },
-        }
-
-    def _propagate_n_jobs(self, transformer):
-        if self.n_jobs is not None and (
-            hasattr(transformer, "n_jobs") and transformer.n_jobs is None
-        ):
-            transformer.set_params(n_jobs=self.n_jobs)
-        return transformer
-
-    def _clone_transformers(self) -> None:
+    def _clone_transformers(self):
         """
         For each of the different transformers that can be passed,
         create the corresponding variable name with a trailing underscore,
@@ -443,105 +481,23 @@ def _clone_transformers(self) -> None:
         Note: typos are not detected here, they are left in and are detected
         down the line in ColumnTransformer.fit_transform.
""" - if isinstance(self.low_card_cat_transformer, sklearn.base.TransformerMixin): - self.low_card_cat_transformer_ = clone(self.low_card_cat_transformer) - elif self.low_card_cat_transformer is None: - # sklearn is lenient and lets us use both - # `handle_unknown="infrequent_if_exist"` and `drop="if_binary"` - # at the same time - self.low_card_cat_transformer_ = OneHotEncoder( - drop="if_binary", handle_unknown="infrequent_if_exist" - ) - elif self.low_card_cat_transformer == "remainder": - self.low_card_cat_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.low_card_cat_transformer_ = self.low_card_cat_transformer - self._propagate_n_jobs(self.low_card_cat_transformer_) - - if isinstance(self.high_card_cat_transformer, sklearn.base.TransformerMixin): - self.high_card_cat_transformer_ = clone(self.high_card_cat_transformer) - elif self.high_card_cat_transformer is None: - self.high_card_cat_transformer_ = GapEncoder(n_components=30) - elif self.high_card_cat_transformer == "remainder": - self.high_card_cat_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) + for transformer_name in [ + "high_cardinality_transformer", + "low_cardinality_transformer", + "datetime_transformer", + "numerical_transformer", + ]: + transformer = _clone_during_fit( + getattr(self, transformer_name), + remainder=self.remainder, + n_jobs=self.n_jobs, ) - else: - self.high_card_cat_transformer_ = self.high_card_cat_transformer - self._propagate_n_jobs(self.high_card_cat_transformer_) - - if isinstance(self.numerical_transformer, sklearn.base.TransformerMixin): - self.numerical_transformer_ = clone(self.numerical_transformer) - elif self.numerical_transformer is None: - self.numerical_transformer_ = "passthrough" - elif self.numerical_transformer == "remainder": - self.numerical_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.numerical_transformer_ = self.numerical_transformer - self._propagate_n_jobs(self.numerical_transformer_) - - if isinstance(self.datetime_transformer, sklearn.base.TransformerMixin): - self.datetime_transformer_ = clone(self.datetime_transformer) - elif self.datetime_transformer is None: - self.datetime_transformer_ = DatetimeEncoder() - elif self.datetime_transformer == "remainder": - self.datetime_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.datetime_transformer_ = self.datetime_transformer - self._propagate_n_jobs(self.datetime_transformer_) - - if (self.specific_transformers is None) or len(self.specific_transformers) == 0: - self.specific_transformers_ = [] - else: - first_item_length = len(self.specific_transformers[0]) - # Check all tuples are the same length - for i, tup in enumerate(self.specific_transformers): - if len(tup) != first_item_length: - raise TypeError( - "Expected `specific_transformers` to be a list of " - "tuples with all the same length, got length " - f"{len(tup)} at index {i} (elements at previous " - f"indices have {first_item_length} in length). 
" - ) - if first_item_length == 2: - # Unnamed assignments, transform to named - named_specific_transformers = _get_transformer_list( - self.specific_transformers - ) - elif first_item_length == 3: - # Named assignments - named_specific_transformers = self.specific_transformers - else: - raise TypeError( - "Expected `specific_transformers` to be a list of tuples " - "of length 2 or 3, got a list of tuples of length " - f"{first_item_length}. " - ) - - self.specific_transformers_ = [ - ( - (name, self._propagate_n_jobs(clone(transformer)), cols) - if isinstance(transformer, sklearn.base.TransformerMixin) - else (name, transformer, cols) - ) - for name, transformer, cols in named_specific_transformers - ] + setattr(self, f"{transformer_name}_", transformer) - # TODO: check that the provided transformers are valid + self.specific_transformers_ = _check_specific_transformers( + self.specific_transformers, + self.n_jobs, + ) def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame: """Takes a dataframe and tries to convert its columns to their best possible @@ -741,7 +697,7 @@ def _check_X(self, X): ) return X - def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer": + def fit(self, X, y=None): """Fit all transformers using X. Parameters @@ -763,7 +719,7 @@ def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer": self.fit_transform(X, y=y) return self - def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: + def fit_transform(self, X, y=None): """Fit all transformers, transform the data, and concatenate the results. In practice, it (1) converts features to their best possible types @@ -849,11 +805,15 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: # Next part: construct the transformers # Create the list of all the transformers. - all_transformers: list[tuple[str, Transformer, list[str]]] = [ + all_transformers = [ ("numeric", self.numerical_transformer_, numeric_columns), ("datetime", self.datetime_transformer_, datetime_columns), - ("low_card_cat", self.low_card_cat_transformer_, low_card_cat_columns), - ("high_card_cat", self.high_card_cat_transformer_, high_card_cat_columns), + ("low_card_cat", self.low_cardinality_transformer_, low_card_cat_columns), + ( + "high_card_cat", + self.high_cardinality_transformer_, + high_card_cat_columns, + ), *self.specific_transformers_, ] # We will now filter this list, by keeping only the ones with: @@ -901,7 +861,7 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: return X_enc - def transform(self, X: ArrayLike) -> ArrayLike: + def transform(self, X): """Transform `X` by applying the fitted transformers on the columns. Parameters @@ -992,9 +952,14 @@ def output_indices_(self) -> dict[str, slice]: """ return self._column_transformer.output_indices_ - -@deprecated("Use TableVectorizer instead.") -class SuperVectorizer(TableVectorizer): - """Deprecated name of TableVectorizer.""" - - pass + def _more_tags(self) -> dict: + """ + Used internally by sklearn to ease the estimator checks. 
+ """ + return { + "X_types": ["2darray", "string"], + "allow_nan": [True], + "_xfail_checks": { + "check_complex_data": "Passthrough complex columns as-is.", + }, + } diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index 333b4d7f9..a23700259 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -7,7 +7,7 @@ from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel from sklearn.utils.validation import check_is_fitted -from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer +from skrub import GapEncoder, MinHashEncoder, TableVectorizer from skrub._datetime_encoder import _is_pandas_format_mixed_available from skrub._table_vectorizer import _infer_date_format from skrub.tests.utils import transformers_list_equal @@ -192,7 +192,7 @@ def _test_possibilities(X) -> None: vectorizer_base = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) # Warning: order-dependant @@ -237,7 +237,7 @@ def _test_possibilities(X) -> None: vectorizer_cast = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) X_str = X.astype("object") @@ -360,7 +360,7 @@ def test_with_arrays() -> None: vectorizer = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) @@ -380,23 +380,28 @@ def test_get_feature_names_out() -> None: vec_w_pass.fit(X) # In this test, order matters. If it doesn't, convert to set. 
- expected_feature_names_pass = [ - "int", - "float", - "str1_public", - "str2_chef", - "str2_lawyer", - "str2_manager", - "str2_officer", - "str2_teacher", - "cat1_yes", - "cat2_20K+", - "cat2_30K+", - "cat2_40K+", - "cat2_50K+", - "cat2_60K+", - ] - assert vec_w_pass.get_feature_names_out().tolist() == expected_feature_names_pass + expected_feature_names_pass = np.array( + [ + "int", + "float", + "str1_public", + "str2_chef", + "str2_lawyer", + "str2_manager", + "str2_officer", + "str2_teacher", + "cat1_yes", + "cat2_20K+", + "cat2_30K+", + "cat2_40K+", + "cat2_50K+", + "cat2_60K+", + ] + ) + assert_array_equal( + vec_w_pass.get_feature_names_out(), + expected_feature_names_pass, + ) vec_w_drop = TableVectorizer(remainder="drop") vec_w_drop.fit(X) @@ -480,8 +485,8 @@ def test_passthrough() -> None: X_clean = _get_clean_dataframe() tv = TableVectorizer( - low_card_cat_transformer="passthrough", - high_card_cat_transformer="passthrough", + low_cardinality_transformer="passthrough", + high_cardinality_transformer="passthrough", datetime_transformer="passthrough", numerical_transformer="passthrough", impute_missing="skip", @@ -517,12 +522,6 @@ def test_check_fitted_table_vectorizer() -> None: tv.transform(X) -def test_check_name_change() -> None: - """Test that using SuperVectorizer raises a deprecation warning""" - with pytest.warns(FutureWarning): - SuperVectorizer() - - def test_handle_unknown() -> None: """ Test that new categories encountered in the test set @@ -732,7 +731,7 @@ def test_specific_transformers_unexpected_behavior(): ], ), TableVectorizer( - low_card_cat_transformer=MinHashEncoder(), + low_cardinality_transformer=MinHashEncoder(), ), ], ) @@ -876,13 +875,13 @@ def test_column_by_column() -> None: # when applied column by column X = _get_clean_dataframe() table_vec_all_cols = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + high_cardinality_transformer=GapEncoder(n_components=2, random_state=0), cardinality_threshold=4, ) table_vec_all_cols.fit(X) for col in X.columns: table_vec_one_col = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + high_cardinality_transformer=GapEncoder(n_components=2, random_state=0), cardinality_threshold=4, ) table_vec_one_col.fit(X[[col]]) @@ -905,7 +904,7 @@ def test_column_by_column() -> None: @skip_if_no_parallel @pytest.mark.parametrize( - "high_card_cat_transformer", + "high_cardinality_transformer", # the gap encoder and the minhashencoder # should be parallelized on all columns # the one hot encoder should not be parallelized @@ -915,11 +914,11 @@ def test_column_by_column() -> None: MinHashEncoder(n_components=2), ], ) -def test_parallelism(high_card_cat_transformer) -> None: +def test_parallelism(high_cardinality_transformer) -> None: # Test that parallelism works X = _get_clean_dataframe() table_vec_no_parallel = TableVectorizer( - high_card_cat_transformer=high_card_cat_transformer, + high_cardinality_transformer=high_cardinality_transformer, cardinality_threshold=4, ) X_trans = table_vec_no_parallel.fit_transform(X) @@ -927,7 +926,7 @@ def test_parallelism(high_card_cat_transformer) -> None: for n_jobs in [None, 2, -1]: table_vec = TableVectorizer( n_jobs=n_jobs, - high_card_cat_transformer=high_card_cat_transformer, + high_cardinality_transformer=high_cardinality_transformer, cardinality_threshold=4, ) X_trans_parallel = table_vec.fit_transform(X) @@ -980,7 +979,7 @@ def __init__(self, n_jobs=None): table_vectorizer = TableVectorizer( 
numerical_transformer=DummyTransformerWithJobs(n_jobs=None), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=None, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs is None @@ -988,7 +987,7 @@ def __init__(self, n_jobs=None): table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=2), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=None, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2 @@ -998,7 +997,7 @@ def __init__(self, n_jobs=None): # when the underlying transformer `n_jobs` is not set explicitly. table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=None), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=2, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2 @@ -1008,7 +1007,7 @@ def __init__(self, n_jobs=None): # when the underlying transformer `n_jobs` is set explicitly. table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=4), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=2, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 4 @@ -1022,14 +1021,14 @@ def test_table_vectorizer_remainder_cloning(): df = pd.concat([df1, df2], axis=1) remainder = FunctionTransformer() table_vectorizer = TableVectorizer( - low_card_cat_transformer="remainder", - high_card_cat_transformer="remainder", + low_cardinality_transformer="remainder", + high_cardinality_transformer="remainder", numerical_transformer="remainder", datetime_transformer="remainder", remainder=remainder, ).fit(df) - assert table_vectorizer.low_card_cat_transformer_ is not remainder - assert table_vectorizer.high_card_cat_transformer_ is not remainder + assert table_vectorizer.low_cardinality_transformer_ is not remainder + assert table_vectorizer.high_cardinality_transformer_ is not remainder assert table_vectorizer.numerical_transformer_ is not remainder assert table_vectorizer.datetime_transformer_ is not remainder
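
What this patch enables, in practice: the default transformers are now estimator
instances on which scikit-learn can call ``get_params``/``set_params``, so nested
parameters of a :class:`TableVectorizer` inside a pipeline resolve during a grid
search. Below is a minimal sketch (not part of the patch itself); the toy
dataframe, the target, and the ``Ridge`` regressor are illustrative assumptions::

    import pandas as pd
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import make_pipeline

    from skrub import TableVectorizer

    # Toy data: a low-cardinality string column, a higher-cardinality string
    # column, and a numeric column (values are illustrative).
    X = pd.DataFrame(
        {
            "city": ["Paris", "London", "Madrid"] * 10,
            "job": [f"Job title {i % 15}" for i in range(30)],
            "year": list(range(1990, 2020)),
        }
    )
    y = X["year"].astype(float)

    # make_pipeline names the first step "tablevectorizer", which prefixes
    # the parameter names below.
    pipeline = make_pipeline(TableVectorizer(), Ridge())

    # Nested parameters such as the default GapEncoder's n_components now
    # resolve, because high_cardinality_transformer is a real estimator
    # instance rather than None.
    param_grid = {
        "tablevectorizer__cardinality_threshold": [5, 20],
        "tablevectorizer__high_cardinality_transformer__n_components": [5, 10],
    }
    search = GridSearchCV(pipeline, param_grid, cv=2).fit(X, y)
    print(search.best_params_)

Before this change, constructing the grid above raised an error, since the
default transformers were materialized only at fit time and ``set_params`` had
nothing to recurse into.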