From 4a4d686b000153993df053bba2e0e2a741b9292d Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 17:41:00 +0100 Subject: [PATCH 1/9] apply global sub-estimator default parameter --- skrub/__init__.py | 3 +- skrub/_table_vectorizer.py | 310 ++++++++++++--------------- skrub/tests/test_table_vectorizer.py | 87 ++++---- 3 files changed, 180 insertions(+), 220 deletions(-) diff --git a/skrub/__init__.py b/skrub/__init__.py index 868632080..ccb01dc72 100644 --- a/skrub/__init__.py +++ b/skrub/__init__.py @@ -13,7 +13,7 @@ from ._minhash_encoder import MinHashEncoder from ._select_cols import DropCols, SelectCols from ._similarity_encoder import SimilarityEncoder -from ._table_vectorizer import SuperVectorizer, TableVectorizer +from ._table_vectorizer import TableVectorizer from ._target_encoder import TargetEncoder check_dependencies() @@ -29,7 +29,6 @@ "GapEncoder", "MinHashEncoder", "SimilarityEncoder", - "SuperVectorizer", "TableVectorizer", "TargetEncoder", "deduplicate", diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index f7478d03c..c804b0571 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -6,27 +6,31 @@ import warnings from collections import Counter -from typing import Literal import numpy as np import pandas as pd import sklearn -from numpy.typing import ArrayLike from pandas._libs.tslibs.parsing import guess_datetime_format from pandas.core.dtypes.base import ExtensionDtype from scipy import sparse -from sklearn.base import TransformerMixin, clone +from sklearn.base import BaseEstimator, TransformerMixin, clone from sklearn.compose import ColumnTransformer from sklearn.compose._column_transformer import _get_transformer_list from sklearn.preprocessing import OneHotEncoder from sklearn.utils import Bunch -from sklearn.utils.deprecation import deprecated -from sklearn.utils.metaestimators import _BaseComposition from sklearn.utils.validation import check_is_fitted from skrub import DatetimeEncoder, 
GapEncoder from skrub._utils import parse_astype_error_message +HIGH_CARDINALITY_TRANSFORMER = GapEncoder(n_components=30) +LOW_CARDINALITY_TRANSFORMER = OneHotEncoder( + sparse_output=False, + handle_unknown="ignore", + drop="if_binary", +) +DATETIME_TRANSFORMER = DatetimeEncoder() + def _infer_date_format(date_column: pd.Series, n_trials: int = 100) -> str | None: """Infer the date format of a date column, @@ -146,10 +150,72 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se return ser -Transformer = TransformerMixin | Literal["drop", "remainder", "passthrough"] +def _clone_if_default(transformer, DEFAULT_TRANSFORMER): + return clone(transformer) if transformer is DEFAULT_TRANSFORMER else transformer + + +def _clone_during_fit(transformer, remainder, n_jobs): + if isinstance(transformer, sklearn.base.TransformerMixin): + return _propagate_n_jobs(clone(transformer), n_jobs) + elif transformer is None: + return "passthrough" + elif transformer == "remainder": + return remainder if isinstance(remainder, str) else clone(remainder) + elif transformer == "passthrough": + return transformer + else: + raise ValueError( + "'transformer' must be an instance of sklearn.base.TransformerMixin, " + f"None, 'remainder' or 'passthrough'. Got {transformer=!r}." + ) + + +def _check_specific_transformers(specific_transformers, n_jobs): + if (specific_transformers is None) or len(specific_transformers) == 0: + return [] + else: + first_item_length = len(specific_transformers[0]) + # Check that all tuples have the same length + for idx, tuple_ in enumerate(specific_transformers): + if len(tuple_) != first_item_length: + raise TypeError( + "Expected `specific_transformers` to be a list of " + "tuples with all the same length, got length " + f"{len(tuple_)} at index {idx} (elements at previous " + f"indices have {first_item_length} in length). 
" + ) + if first_item_length == 2: + # Unnamed assignments, transform to named + specific_transformers = _get_transformer_list(specific_transformers) + elif first_item_length == 3: + # Named assignments, no-op + pass + else: + raise TypeError( + "Expected `specific_transformers` to be a list of tuples " + "of length 2 or 3, got a list of tuples of length " + f"{first_item_length}. " + ) + + return [ + ( + (name, _propagate_n_jobs(clone(transformer), n_jobs), cols) + if isinstance(transformer, sklearn.base.TransformerMixin) + else (name, transformer, cols) + ) + for name, transformer, cols in specific_transformers + ] + + +def _propagate_n_jobs(transformer, n_jobs): + if n_jobs is not None and ( + hasattr(transformer, "n_jobs") and transformer.n_jobs is None + ): + transformer.set_params(n_jobs=n_jobs) + return transformer -class TableVectorizer(TransformerMixin, _BaseComposition): +class TableVectorizer(TransformerMixin, BaseEstimator): """Automatically transform a heterogeneous dataframe to a numerical array. Easily transforms a heterogeneous data table @@ -165,12 +231,12 @@ class TableVectorizer(TransformerMixin, _BaseComposition): under this value, the low cardinality categorical features, and above or equal, the high cardinality categorical features. Different transformers will be applied to these two groups, - defined by the parameters `low_card_cat_transformer` and - `high_card_cat_transformer` respectively. + defined by the parameters `low_cardinality_transformer` and + `high_cardinality_transformer` respectively. Note: currently, missing values are counted as a single unique value (so they count in the cardinality). - low_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional + low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional Transformer used on categorical/string features with low cardinality (threshold is defined by `cardinality_threshold`). 
Can either be a transformer object instance (e.g. OneHotEncoder), @@ -183,7 +249,7 @@ class TableVectorizer(TransformerMixin, _BaseComposition): Features classified under this category are imputed based on the strategy defined with `impute_missing`. - high_card_cat_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional + high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional Transformer used on categorical/string features with high cardinality (threshold is defined by `cardinality_threshold`). Can either be a transformer object instance @@ -361,48 +427,36 @@ class TableVectorizer(TransformerMixin, _BaseComposition): ('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])] """ # noqa: E501 - transformers_: list[tuple[str, Transformer, list[str]]] - types_: dict[str, type] - imputed_columns_: list[str] - low_card_cat_transformer_: Transformer - high_card_cat_transformer_: Transformer - numerical_transformer_: Transformer - datetime_transformer_: Transformer - specific_transformers_: list[tuple[str, Transformer, list[str, int]]] - - _transformer_to_input_indices: dict[str, list[int]] - - # Override required parameters - _required_parameters = [] - def __init__( self, *, - cardinality_threshold: int = 40, - low_card_cat_transformer: Transformer | None = None, - high_card_cat_transformer: Transformer | None = None, - numerical_transformer: Transformer | None = None, - datetime_transformer: Transformer | None = None, - specific_transformers: list[ - tuple[Transformer, list[str | int]] - | tuple[str, Transformer, list[str, int]] - ] - | None = None, - auto_cast: bool = True, - impute_missing: Literal["auto", "force", "skip"] = "auto", + cardinality_threshold=40, + low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER, + high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER, + datetime_transformer=DATETIME_TRANSFORMER, + numerical_transformer=None, + specific_transformers=None, 
+ auto_cast=True, + impute_missing="auto", # The next parameters are inherited from ColumnTransformer - remainder: Literal["drop", "passthrough"] | TransformerMixin = "passthrough", - sparse_threshold: float = 0.0, - n_jobs: int = None, + remainder="passthrough", + sparse_threshold=0.0, + n_jobs=None, transformer_weights=None, - verbose: bool = False, - verbose_feature_names_out: bool = False, + verbose=False, + verbose_feature_names_out=False, ): self.cardinality_threshold = cardinality_threshold - self.low_card_cat_transformer = low_card_cat_transformer - self.high_card_cat_transformer = high_card_cat_transformer + self.low_cardinality_transformer = _clone_if_default( + low_cardinality_transformer, LOW_CARDINALITY_TRANSFORMER + ) + self.high_cardinality_transformer = _clone_if_default( + high_cardinality_transformer, HIGH_CARDINALITY_TRANSFORMER + ) + self.datetime_transformer = _clone_if_default( + datetime_transformer, DATETIME_TRANSFORMER + ) self.numerical_transformer = numerical_transformer - self.datetime_transformer = datetime_transformer self.specific_transformers = specific_transformers self.auto_cast = auto_cast self.impute_missing = impute_missing @@ -415,26 +469,7 @@ def __init__( self.verbose = verbose self.verbose_feature_names_out = verbose_feature_names_out - def _more_tags(self) -> dict: - """ - Used internally by sklearn to ease the estimator checks. 
- """ - return { - "X_types": ["2darray", "string"], - "allow_nan": [True], - "_xfail_checks": { - "check_complex_data": "Passthrough complex columns as-is.", - }, - } - - def _propagate_n_jobs(self, transformer): - if self.n_jobs is not None and ( - hasattr(transformer, "n_jobs") and transformer.n_jobs is None - ): - transformer.set_params(n_jobs=self.n_jobs) - return transformer - - def _clone_transformers(self) -> None: + def _clone_transformers(self): """ For each of the different transformers that can be passed, create the corresponding variable name with a trailing underscore, @@ -444,105 +479,23 @@ def _clone_transformers(self) -> None: Note: typos are not detected here, they are left in and are detected down the line in ColumnTransformer.fit_transform. """ - if isinstance(self.low_card_cat_transformer, sklearn.base.TransformerMixin): - self.low_card_cat_transformer_ = clone(self.low_card_cat_transformer) - elif self.low_card_cat_transformer is None: - # sklearn is lenient and lets us use both - # `handle_unknown="infrequent_if_exist"` and `drop="if_binary"` - # at the same time - self.low_card_cat_transformer_ = OneHotEncoder( - drop="if_binary", handle_unknown="infrequent_if_exist" - ) - elif self.low_card_cat_transformer == "remainder": - self.low_card_cat_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.low_card_cat_transformer_ = self.low_card_cat_transformer - self._propagate_n_jobs(self.low_card_cat_transformer_) - - if isinstance(self.high_card_cat_transformer, sklearn.base.TransformerMixin): - self.high_card_cat_transformer_ = clone(self.high_card_cat_transformer) - elif self.high_card_cat_transformer is None: - self.high_card_cat_transformer_ = GapEncoder(n_components=30) - elif self.high_card_cat_transformer == "remainder": - self.high_card_cat_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - 
self.high_card_cat_transformer_ = self.high_card_cat_transformer - self._propagate_n_jobs(self.high_card_cat_transformer_) - - if isinstance(self.numerical_transformer, sklearn.base.TransformerMixin): - self.numerical_transformer_ = clone(self.numerical_transformer) - elif self.numerical_transformer is None: - self.numerical_transformer_ = "passthrough" - elif self.numerical_transformer == "remainder": - self.numerical_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) - ) - else: - self.numerical_transformer_ = self.numerical_transformer - self._propagate_n_jobs(self.numerical_transformer_) - - if isinstance(self.datetime_transformer, sklearn.base.TransformerMixin): - self.datetime_transformer_ = clone(self.datetime_transformer) - elif self.datetime_transformer is None: - self.datetime_transformer_ = DatetimeEncoder() - elif self.datetime_transformer == "remainder": - self.datetime_transformer_ = ( - self.remainder - if isinstance(self.remainder, str) - else clone(self.remainder) + for transformer_name in [ + "high_cardinality_transformer", + "low_cardinality_transformer", + "datetime_transformer", + "numerical_transformer", + ]: + transformer = _clone_during_fit( + getattr(self, transformer_name), + remainder=self.remainder, + n_jobs=self.n_jobs, ) - else: - self.datetime_transformer_ = self.datetime_transformer - self._propagate_n_jobs(self.datetime_transformer_) - - if (self.specific_transformers is None) or len(self.specific_transformers) == 0: - self.specific_transformers_ = [] - else: - first_item_length = len(self.specific_transformers[0]) - # Check all tuples are the same length - for i, tup in enumerate(self.specific_transformers): - if len(tup) != first_item_length: - raise TypeError( - "Expected `specific_transformers` to be a list of " - "tuples with all the same length, got length " - f"{len(tup)} at index {i} (elements at previous " - f"indices have {first_item_length} in length). 
" - ) - if first_item_length == 2: - # Unnamed assignments, transform to named - named_specific_transformers = _get_transformer_list( - self.specific_transformers - ) - elif first_item_length == 3: - # Named assignments - named_specific_transformers = self.specific_transformers - else: - raise TypeError( - "Expected `specific_transformers` to be a list of tuples " - "of length 2 or 3, got a list of tuples of length " - f"{first_item_length}. " - ) + setattr(self, f"{transformer_name}_", transformer) - self.specific_transformers_ = [ - ( - (name, self._propagate_n_jobs(clone(transformer)), cols) - if isinstance(transformer, sklearn.base.TransformerMixin) - else (name, transformer, cols) - ) - for name, transformer, cols in named_specific_transformers - ] - - # TODO: check that the provided transformers are valid + self.specific_transformers_ = _check_specific_transformers( + self.specific_transformers, + self.n_jobs, + ) def _auto_cast(self, X: pd.DataFrame) -> pd.DataFrame: """Takes a dataframe and tries to convert its columns to their best possible @@ -742,7 +695,7 @@ def _check_X(self, X): ) return X - def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer": + def fit(self, X, y=None): """Fit all transformers using X. Parameters @@ -764,7 +717,7 @@ def fit(self, X: ArrayLike, y: ArrayLike = None) -> "TableVectorizer": self.fit_transform(X, y=y) return self - def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: + def fit_transform(self, X, y=None): """Fit all transformers, transform the data, and concatenate the results. In practice, it (1) converts features to their best possible types @@ -850,11 +803,15 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: # Next part: construct the transformers # Create the list of all the transformers. 
- all_transformers: list[tuple[str, Transformer, list[str]]] = [ + all_transformers = [ ("numeric", self.numerical_transformer_, numeric_columns), ("datetime", self.datetime_transformer_, datetime_columns), - ("low_card_cat", self.low_card_cat_transformer_, low_card_cat_columns), - ("high_card_cat", self.high_card_cat_transformer_, high_card_cat_columns), + ("low_card_cat", self.low_cardinality_transformer_, low_card_cat_columns), + ( + "high_card_cat", + self.high_cardinality_transformer_, + high_card_cat_columns, + ), *self.specific_transformers_, ] # We will now filter this list, by keeping only the ones with: @@ -902,7 +859,7 @@ def fit_transform(self, X: ArrayLike, y: ArrayLike = None) -> ArrayLike: return X_enc - def transform(self, X: ArrayLike) -> ArrayLike: + def transform(self, X): """Transform `X` by applying the fitted transformers on the columns. Parameters @@ -993,9 +950,14 @@ def output_indices_(self) -> dict[str, slice]: """ return self._column_transformer.output_indices_ - -@deprecated("Use TableVectorizer instead.") -class SuperVectorizer(TableVectorizer): - """Deprecated name of TableVectorizer.""" - - pass + def _more_tags(self) -> dict: + """ + Used internally by sklearn to ease the estimator checks. 
+ """ + return { + "X_types": ["2darray", "string"], + "allow_nan": [True], + "_xfail_checks": { + "check_complex_data": "Passthrough complex columns as-is.", + }, + } diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index ffe41155b..9830387a1 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -7,7 +7,7 @@ from sklearn.utils._testing import assert_array_equal, skip_if_no_parallel from sklearn.utils.validation import check_is_fitted -from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer +from skrub import GapEncoder, MinHashEncoder, TableVectorizer from skrub._table_vectorizer import _infer_date_format from skrub.tests.utils import transformers_list_equal @@ -189,7 +189,7 @@ def _test_possibilities(X) -> None: vectorizer_base = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) # Warning: order-dependant @@ -234,7 +234,7 @@ def _test_possibilities(X) -> None: vectorizer_cast = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) X_str = X.astype("object") @@ -357,7 +357,7 @@ def test_with_arrays() -> None: vectorizer = TableVectorizer( cardinality_threshold=4, # we must have n_samples = 5 >= n_components - high_card_cat_transformer=GapEncoder(n_components=2), + high_cardinality_transformer=GapEncoder(n_components=2), numerical_transformer=StandardScaler(), ) @@ -377,23 +377,28 @@ def test_get_feature_names_out() -> None: vec_w_pass.fit(X) # In this test, order matters. If it doesn't, convert to set. 
- expected_feature_names_pass = [ - "int", - "float", - "str1_public", - "str2_chef", - "str2_lawyer", - "str2_manager", - "str2_officer", - "str2_teacher", - "cat1_yes", - "cat2_20K+", - "cat2_30K+", - "cat2_40K+", - "cat2_50K+", - "cat2_60K+", - ] - assert vec_w_pass.get_feature_names_out().tolist() == expected_feature_names_pass + expected_feature_names_pass = np.array( + [ + "int", + "float", + "str1_public", + "str2_chef", + "str2_lawyer", + "str2_manager", + "str2_officer", + "str2_teacher", + "cat1_yes", + "cat2_20K+", + "cat2_30K+", + "cat2_40K+", + "cat2_50K+", + "cat2_60K+", + ] + ) + assert_array_equal( + vec_w_pass.get_feature_names_out(), + expected_feature_names_pass, + ) vec_w_drop = TableVectorizer(remainder="drop") vec_w_drop.fit(X) @@ -477,8 +482,8 @@ def test_passthrough() -> None: X_clean = _get_clean_dataframe() tv = TableVectorizer( - low_card_cat_transformer="passthrough", - high_card_cat_transformer="passthrough", + low_cardinality_transformer="passthrough", + high_cardinality_transformer="passthrough", datetime_transformer="passthrough", numerical_transformer="passthrough", impute_missing="skip", @@ -514,12 +519,6 @@ def test_check_fitted_table_vectorizer() -> None: tv.transform(X) -def test_check_name_change() -> None: - """Test that using SuperVectorizer raises a deprecation warning""" - with pytest.warns(FutureWarning): - SuperVectorizer() - - def test_handle_unknown() -> None: """ Test that new categories encountered in the test set @@ -729,7 +728,7 @@ def test_specific_transformers_unexpected_behavior(): ], ), TableVectorizer( - low_card_cat_transformer=MinHashEncoder(), + low_cardinality_transformer=MinHashEncoder(), ), ], ) @@ -869,13 +868,13 @@ def test_column_by_column() -> None: # when applied column by column X = _get_clean_dataframe() table_vec_all_cols = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + high_cardinality_transformer=GapEncoder(n_components=2, random_state=0), 
cardinality_threshold=4, ) table_vec_all_cols.fit(X) for col in X.columns: table_vec_one_col = TableVectorizer( - high_card_cat_transformer=GapEncoder(n_components=2, random_state=0), + high_cardinality_transformer=GapEncoder(n_components=2, random_state=0), cardinality_threshold=4, ) table_vec_one_col.fit(X[[col]]) @@ -898,7 +897,7 @@ def test_column_by_column() -> None: @skip_if_no_parallel @pytest.mark.parametrize( - "high_card_cat_transformer", + "high_cardinality_transformer", # the gap encoder and the minhashencoder # should be parallelized on all columns # the one hot encoder should not be parallelized @@ -908,11 +907,11 @@ def test_column_by_column() -> None: MinHashEncoder(n_components=2), ], ) -def test_parallelism(high_card_cat_transformer) -> None: +def test_parallelism(high_cardinality_transformer) -> None: # Test that parallelism works X = _get_clean_dataframe() table_vec_no_parallel = TableVectorizer( - high_card_cat_transformer=high_card_cat_transformer, + high_cardinality_transformer=high_cardinality_transformer, cardinality_threshold=4, ) X_trans = table_vec_no_parallel.fit_transform(X) @@ -920,7 +919,7 @@ def test_parallelism(high_card_cat_transformer) -> None: for n_jobs in [None, 2, -1]: table_vec = TableVectorizer( n_jobs=n_jobs, - high_card_cat_transformer=high_card_cat_transformer, + high_cardinality_transformer=high_cardinality_transformer, cardinality_threshold=4, ) X_trans_parallel = table_vec.fit_transform(X) @@ -973,7 +972,7 @@ def __init__(self, n_jobs=None): table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=None), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=None, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs is None @@ -981,7 +980,7 @@ def __init__(self, n_jobs=None): table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=2), - 
low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=None, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2 @@ -991,7 +990,7 @@ def __init__(self, n_jobs=None): # when the underlying transformer `n_jobs` is not set explicitly. table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=None), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=2, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2 @@ -1001,7 +1000,7 @@ def __init__(self, n_jobs=None): # when the underlying transformer `n_jobs` is set explicitly. table_vectorizer = TableVectorizer( numerical_transformer=DummyTransformerWithJobs(n_jobs=4), - low_card_cat_transformer=DummyTransformerWithJobs(n_jobs=None), + low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None), n_jobs=2, ).fit(X) assert table_vectorizer.named_transformers_["numeric"].n_jobs == 4 @@ -1015,14 +1014,14 @@ def test_table_vectorizer_remainder_cloning(): df = pd.concat([df1, df2], axis=1) remainder = FunctionTransformer() table_vectorizer = TableVectorizer( - low_card_cat_transformer="remainder", - high_card_cat_transformer="remainder", + low_cardinality_transformer="remainder", + high_cardinality_transformer="remainder", numerical_transformer="remainder", datetime_transformer="remainder", remainder=remainder, ).fit(df) - assert table_vectorizer.low_card_cat_transformer_ is not remainder - assert table_vectorizer.high_card_cat_transformer_ is not remainder + assert table_vectorizer.low_cardinality_transformer_ is not remainder + assert table_vectorizer.high_cardinality_transformer_ is not remainder assert table_vectorizer.numerical_transformer_ is not remainder assert table_vectorizer.datetime_transformer_ is not remainder From db14b9b5af66c2d28668d037b7bb478add42e518 Mon Sep 
17 00:00:00 2001 From: Vincent M Date: Thu, 2 Nov 2023 18:23:31 +0100 Subject: [PATCH 2/9] fix docstring --- skrub/_table_vectorizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index c804b0571..72b5637bd 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -433,8 +433,8 @@ def __init__( cardinality_threshold=40, low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER, high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER, - datetime_transformer=DATETIME_TRANSFORMER, numerical_transformer=None, + datetime_transformer=DATETIME_TRANSFORMER, specific_transformers=None, auto_cast=True, impute_missing="auto", From 22f681decb629ea6a56996c60e1362e6eb3ca902 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sat, 4 Nov 2023 12:17:04 +0100 Subject: [PATCH 3/9] fix docstring --- skrub/_table_vectorizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 7e2b6f6f1..6bc158b22 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -421,7 +421,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator): >>> tv.transformers_ [('numeric', 'passthrough', ['year_first_hired']), \ ('datetime', DatetimeEncoder(), ['date_first_hired']), \ -('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='infrequent_if_exist'), \ +('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore', \ +sparse_output=False), \ ['gender', 'department', 'department_name', 'assignment_category']), \ ('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])] """ # noqa: E501 From a62785153de3270d0b6097612f7539ce685f62f9 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sat, 4 Nov 2023 12:22:02 +0100 Subject: [PATCH 4/9] remove pytest exception --- skrub/_table_vectorizer.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git 
a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 6bc158b22..273047568 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -236,7 +236,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator): Note: currently, missing values are counted as a single unique value (so they count in the cardinality). - low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional + low_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \ + or Transformer, optional Transformer used on categorical/string features with low cardinality (threshold is defined by `cardinality_threshold`). Can either be a transformer object instance (e.g. OneHotEncoder), @@ -249,7 +250,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator): Features classified under this category are imputed based on the strategy defined with `impute_missing`. - high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional + high_cardinality_transformer : {'drop', 'remainder', 'passthrough'} \ + or Transformer, optional Transformer used on categorical/string features with high cardinality (threshold is defined by `cardinality_threshold`). Can either be a transformer object instance @@ -261,7 +263,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator): Features classified under this category are imputed based on the strategy defined with `impute_missing`. - numerical_transformer : {'drop', 'remainder', 'passthrough'} or Transformer, optional + numerical_transformer : {'drop', 'remainder', 'passthrough'} \ + or Transformer, optional Transformer used on numerical features. Can either be a transformer object instance (e.g. StandardScaler), a Pipeline containing the preprocessing steps, @@ -283,7 +286,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator): Features classified under this category are not imputed at all (regardless of `impute_missing`). 
- specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} or Transformer, list of str or int), optional + specific_transformers : list of tuples ({'drop', 'remainder', 'passthrough'} or \ + Transformer, list of str or int) or (str, {'drop', 'remainder', 'passthrough'} \ + or Transformer, list of str or int), optional On top of the default column type classification (see parameters above), this parameter allows you to manually specify transformers for specific columns. @@ -382,7 +387,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator): See Also -------- GapEncoder : - Encodes dirty categories (strings) by constructing latent topics with continuous encoding. + Encodes dirty categories (strings) by constructing latent topics with \ + continuous encoding. MinHashEncoder : Encode string columns as a numeric array with the minhash method. SimilarityEncoder : @@ -425,7 +431,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): sparse_output=False), \ ['gender', 'department', 'department_name', 'assignment_category']), \ ('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])] - """ # noqa: E501 + """ def __init__( self, From c3f56364c7fe7bf821eee9d5798ce99b34db8224 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Sat, 4 Nov 2023 12:57:17 +0100 Subject: [PATCH 5/9] add changes --- CHANGES.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 5d547cfd6..28dac1370 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,6 +15,10 @@ development and backward compatibility is not ensured. Major changes ------------- +* Pipelines including :class:`TableVectorizer` can now be grid-searched, since + we can now call `set_params` on the default transformers of :class:`TableVectorizer`. + :pr:`814` by :user:`Vincent Maladiere <Vincent-Maladiere>` + * Some parameters of :class:`Joiner` have changed. 
The goal is to harmonize parameters across all estimator that perform join(-like) operations, as discussed in `#751 `_. From 28e4c0d3df3390c4b3769c796137d463071261bd Mon Sep 17 00:00:00 2001 From: Vincent M Date: Mon, 6 Nov 2023 20:10:04 +0100 Subject: [PATCH 6/9] Update skrub/_table_vectorizer.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Dockès --- skrub/_table_vectorizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 273047568..566d26346 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -150,8 +150,8 @@ def _replace_missing_in_cat_col(ser: pd.Series, value: str = "missing") -> pd.Se return ser -def _clone_if_default(transformer, DEFAULT_TRANSFORMER): - return clone(transformer) if transformer is DEFAULT_TRANSFORMER else transformer +def _clone_if_default(transformer, default_transformer): + return clone(transformer) if transformer is default_transformer else transformer def _clone_during_fit(transformer, remainder, n_jobs): From 8d5c639deeac624123402b459bdc1f08b275be1e Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 8 Nov 2023 11:10:50 +0100 Subject: [PATCH 7/9] remove the None conversion to passthrough --- skrub/_table_vectorizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index 566d26346..dc844eb8c 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -157,8 +157,6 @@ def _clone_if_default(transformer, default_transformer): def _clone_during_fit(transformer, remainder, n_jobs): if isinstance(transformer, sklearn.base.TransformerMixin): return _propagate_n_jobs(clone(transformer), n_jobs) - elif transformer is None: - return "passthrough" elif transformer == "remainder": return remainder if isinstance(remainder, str) else clone(remainder) elif transformer == "passthrough": 
@@ -166,7 +164,7 @@ def _clone_during_fit(transformer, remainder, n_jobs): else: raise ValueError( "'transformer' must be an instance of sklearn.base.TransformerMixin, " - f"None, 'remainder' or 'passthrough'. Got {transformer=!r}." + f"'remainder' or 'passthrough'. Got {transformer=!r}." ) From cebd9cbc1a64df05f03ff02e121a003c0092d746 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Wed, 8 Nov 2023 11:20:57 +0100 Subject: [PATCH 8/9] add passthrough as default for numerical_transformer --- skrub/_table_vectorizer.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/skrub/_table_vectorizer.py b/skrub/_table_vectorizer.py index dc844eb8c..7832ca0e1 100644 --- a/skrub/_table_vectorizer.py +++ b/skrub/_table_vectorizer.py @@ -217,10 +217,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator): """Automatically transform a heterogeneous dataframe to a numerical array. Easily transforms a heterogeneous data table - (such as a :obj:`~pandas.DataFrame`) to a numerical array for machine - learning. For this it transforms each column depending on its data type. - It provides a simplified interface for the ColumnTransformer ; - more documentation of attributes and functions are available in its doc. + (such as a :obj:`pandas.DataFrame`) to a numerical array for machine + learning. To do so, the TableVectorizer transforms each column depending + on its data type. Parameters ---------- @@ -242,9 +241,9 @@ class TableVectorizer(TransformerMixin, BaseEstimator): a Pipeline containing the preprocessing steps, 'drop' for dropping the columns, 'remainder' for applying `remainder`, - 'passthrough' to return the unencoded columns, - or `None` to use the default transformer - (OneHotEncoder(handle_unknown="ignore", drop="if_binary")). + or 'passthrough' to return the unencoded columns. + The default transformer is \ + (OneHotEncoder(handle_unknown="ignore", drop="if_binary")).
Features classified under this category are imputed based on the strategy defined with `impute_missing`. @@ -256,8 +255,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator): (e.g. GapEncoder), a Pipeline containing the preprocessing steps, 'drop' for dropping the columns, 'remainder' for applying `remainder`, - 'passthrough' to return the unencoded columns, - or `None` to use the default transformer (GapEncoder(n_components=30)). + or 'passthrough' to return the unencoded columns. + The default transformer is (GapEncoder(n_components=30)). Features classified under this category are imputed based on the strategy defined with `impute_missing`. @@ -268,8 +267,7 @@ class TableVectorizer(TransformerMixin, BaseEstimator): a Pipeline containing the preprocessing steps, 'drop' for dropping the columns, 'remainder' for applying `remainder`, - 'passthrough' to return the unencoded columns, - or `None` to use the default transformer (here nothing, so 'passthrough'). + or 'passthrough' to return the unencoded columns (default). Features classified under this category are not imputed at all (regardless of `impute_missing`). @@ -339,8 +337,8 @@ class TableVectorizer(TransformerMixin, BaseEstimator): n_jobs : int, default=None Number of jobs to run in parallel. This number of jobs will be dispatched to the underlying transformers, if those support parallelization and they do not - set specifically `n_jobs`. - ``None`` (the default) means 1 unless in a :fund:`joblib.parallel_config` + set specifically ``n_jobs``. + ``None`` (the default) means 1 unless in a :func:`joblib.parallel_config` context. ``-1`` means using all processors. transformer_weights : dict, default=None @@ -396,12 +394,12 @@ class TableVectorizer(TransformerMixin, BaseEstimator): ----- The column order of the input data is not guaranteed to be the same as the output data (returned by TableVectorizer.transform). - This is a due to the way the ColumnTransformer works. 
+ This is due to the way the underlying ColumnTransformer works. However, the output column order will always be the same for different - calls to TableVectorizer.transform on a same fitted TableVectorizer instance. + calls to ``TableVectorizer.transform`` on the same fitted TableVectorizer instance. For example, if input data has columns ['name', 'job', 'year'], then output columns might be shuffled, e.g. ['job', 'year', 'name'], but every call - to TableVectorizer.transform on this instance will return this order. + to ``TableVectorizer.transform`` on this instance will return this order. Examples -------- @@ -437,7 +435,7 @@ def __init__( cardinality_threshold=40, low_cardinality_transformer=LOW_CARDINALITY_TRANSFORMER, high_cardinality_transformer=HIGH_CARDINALITY_TRANSFORMER, - numerical_transformer=None, + numerical_transformer="passthrough", datetime_transformer=DATETIME_TRANSFORMER, specific_transformers=None, auto_cast=True, From a0e99780c444c7fc1941c3ca86b3626747b2081d Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 9 Nov 2023 11:53:00 +0100 Subject: [PATCH 9/9] fix precommit --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 116a67b26..c1cd568d4 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -18,7 +18,7 @@ Major changes * Pipelines including :class:`TableVectorizer` can now be grid-searched, since we can now call `set_params` on the default transformers of :class:`TableVectorizer`. :pr:`814` by :user:`Vincent Maladiere ` - + * :func:`to_datetime` is now available to support pandas.to_datetime over dataframes and 2d arrays. :pr:`784` by :user:`Vincent Maladiere `