Skip to content

Commit

Permalink
replacing high_card_cat and low_card_cat
Browse files Browse the repository at this point in the history
  • Loading branch information
Vincent-Maladiere committed Nov 9, 2023
1 parent 7d1f3c2 commit c75c235
Showing 7 changed files with 40 additions and 40 deletions.
4 changes: 2 additions & 2 deletions benchmarks/bench_gap_divergence.py
Original file line number Diff line number Diff line change
@@ -208,7 +208,7 @@ def benchmark(max_iter_e_step: int, dataset_name: str):
(
"encoding",
TableVectorizer(
high_card_cat_transformer=ModifiedGapEncoder(
high_cardinality_transformer=ModifiedGapEncoder(
min_iter=5,
max_iter=5,
max_iter_e_step=max_iter_e_step,
@@ -234,7 +234,7 @@ def benchmark(max_iter_e_step: int, dataset_name: str):
results = []
for pipeline, (_, cv_results) in zip(pipelines, cv_df.iterrows()):
for modified_gap_encoder in (
pipeline["encoding"].named_transformers_["high_card_cat"].fitted_models_
pipeline["encoding"].named_transformers_["high_cardinality"].fitted_models_
):
for gap_iter, inner_results in enumerate(
modified_gap_encoder.benchmark_results_
2 changes: 1 addition & 1 deletion benchmarks/bench_tablevectorizer_tuning.py
Original file line number Diff line number Diff line change
@@ -59,7 +59,7 @@ def benchmark(
):
tv = TableVectorizer(
cardinality_threshold=tv_cardinality_threshold,
high_card_cat_transformer=MinHashEncoder(n_components=minhash_n_components),
high_cardinality_transformer=MinHashEncoder(n_components=minhash_n_components),
)

dataset = dataset_map[dataset_name]
4 changes: 2 additions & 2 deletions benchmarks/run_on_openml_datasets.py
Original file line number Diff line number Diff line change
@@ -45,14 +45,14 @@

classification_pipeline = Pipeline(
[
("vectorizer", TableVectorizer(high_card_cat_transformer=MinHashEncoder())),
("vectorizer", TableVectorizer(high_cardinality_transformer=MinHashEncoder())),
("classifier", HistGradientBoostingClassifier()),
]
)

regression_pipeline = Pipeline(
[
("vectorizer", TableVectorizer(high_card_cat_transformer=MinHashEncoder())),
("vectorizer", TableVectorizer(high_cardinality_transformer=MinHashEncoder())),
("regressor", HistGradientBoostingRegressor()),
]
)
4 changes: 2 additions & 2 deletions examples/01_encodings.py
Original file line number Diff line number Diff line change
@@ -94,14 +94,14 @@
# - The |OneHotEncoder| for low cardinality string variables, the columns
# ``'gender'``, ``'department'``, ``'department_name'`` and ``'assignment_category'``.

tv.named_transformers_["low_card_cat"].get_feature_names_out()
tv.named_transformers_["low_cardinality"].get_feature_names_out()

###############################################################################
# - The |GapEncoder| for high cardinality string columns, ``'employee_position_title'``
# and ``'division'``. The |GapEncoder| is a powerful encoder that can handle dirty
# categorical columns.

tv.named_transformers_["high_card_cat"].get_feature_names_out()
tv.named_transformers_["high_cardinality"].get_feature_names_out()

###############################################################################
# - The |DatetimeEncoder| for the ``'date_first_hired'`` column. The |DatetimeEncoder|
10 changes: 5 additions & 5 deletions examples/FIXME/07_grid_searching_with_the_tablevectorizer.py
Original file line number Diff line number Diff line change
@@ -62,7 +62,7 @@
from skrub import MinHashEncoder

tv = TableVectorizer(
high_card_cat_transformer=MinHashEncoder(),
high_cardinality_transformer=MinHashEncoder(),
)
tv.fit(X)

@@ -101,8 +101,8 @@
# For that, we use the dunder separator, which indicates a nesting layer.
# That means that for tuning the parameter ``n_components`` of the
# |GapEncoder| saved in the |TableVectorizer| attribute
# ``high_card_cat_transformer``, we use the syntax
# ``tablevectorizer__high_card_cat_transformer__n_components``.
# ``high_cardinality_transformer``, we use the syntax
# ``tablevectorizer__high_cardinality_transformer__n_components``.
#
# We recommend using the 3-tuple syntax for the column-specific transformers,
# which allows us to give a name to the assignment (here ``mh_dep_name``).
@@ -114,7 +114,7 @@

pipeline = make_pipeline(
TableVectorizer(
high_card_cat_transformer=GapEncoder(),
high_cardinality_transformer=GapEncoder(),
specific_transformers=[
("mh_dep_name", MinHashEncoder(), ["department_name"]),
],
@@ -123,7 +123,7 @@
)

params = {
"tablevectorizer__high_card_cat_transformer__n_components": [10, 30, 50],
"tablevectorizer__high_cardinality_transformer__n_components": [10, 30, 50],
"tablevectorizer__mh_dep_name__n_components": [25, 50],
}

16 changes: 8 additions & 8 deletions skrub/_table_vectorizer.py
Original file line number Diff line number Diff line change
@@ -377,10 +377,10 @@ class TableVectorizer(TransformerMixin, BaseEstimator):
>>> tv.transformers_
[('numeric', 'passthrough', ['year_first_hired']), \
('datetime', DatetimeEncoder(), ['date_first_hired']), \
('low_card_cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore', \
('low_cardinality', OneHotEncoder(drop='if_binary', handle_unknown='ignore', \
sparse_output=False), \
['gender', 'department', 'department_name', 'assignment_category']), \
('high_card_cat', GapEncoder(n_components=30), ['division', 'employee_position_title'])]
('high_cardinality', GapEncoder(n_components=30), ['division', 'employee_position_title'])]
"""

def __init__(
@@ -721,23 +721,23 @@ def fit_transform(self, X, y=None):
).columns.to_list()

# Classify categorical columns by cardinality
low_card_cat_columns, high_card_cat_columns = [], []
low_cardinality_columns, high_cardinality_columns = [], []
for col in categorical_columns:
if X[col].nunique() < self.cardinality_threshold:
low_card_cat_columns.append(col)
low_cardinality_columns.append(col)
else:
high_card_cat_columns.append(col)
high_cardinality_columns.append(col)

# Next part: construct the transformers
# Create the list of all the transformers.
all_transformers = [
("numeric", self.numerical_transformer_, numeric_columns),
("datetime", self.datetime_transformer_, datetime_columns),
("low_card_cat", self.low_cardinality_transformer_, low_card_cat_columns),
("low_cardinality", self.low_cardinality_transformer_, low_cardinality_columns),
(
"high_card_cat",
"high_cardinality",
self.high_cardinality_transformer_,
high_card_cat_columns,
high_cardinality_columns,
),
*self.specific_transformers_,
]
40 changes: 20 additions & 20 deletions skrub/tests/test_table_vectorizer.py
Original file line number Diff line number Diff line change
@@ -197,15 +197,15 @@ def _test_possibilities(X) -> None:
# Warning: order-dependent
expected_transformers_df = {
"numeric": ["int", "float"],
"low_card_cat": ["str1", "cat1"],
"high_card_cat": ["str2", "cat2"],
"low_cardinality": ["str1", "cat1"],
"high_cardinality": ["str2", "cat2"],
}
vectorizer_base.fit_transform(X)
check_same_transformers(expected_transformers_df, vectorizer_base.transformers_)

# Test with higher cardinality threshold and no numeric transformer
expected_transformers_2 = {
"low_card_cat": ["str1", "str2", "cat1", "cat2"],
"low_cardinality": ["str1", "str2", "cat1", "cat2"],
"numeric": ["int", "float"],
}
vectorizer_default = TableVectorizer() # Using default values
@@ -216,8 +216,8 @@ def _test_possibilities(X) -> None:
arr = X.to_numpy()
# Instead of the columns names, we'll have the column indices.
expected_transformers_np_no_cast = {
"low_card_cat": [2, 4],
"high_card_cat": [3, 5],
"low_cardinality": [2, 4],
"high_cardinality": [3, 5],
"numeric": [0, 1],
}
vectorizer_base.fit_transform(arr)
@@ -227,7 +227,7 @@ def _test_possibilities(X) -> None:

# Test with single column dataframe
expected_transformers_series = {
"low_card_cat": ["cat1"],
"low_cardinality": ["cat1"],
}
vectorizer_base.fit_transform(X[["cat1"]])
check_same_transformers(expected_transformers_series, vectorizer_base.transformers_)
@@ -242,17 +242,17 @@ def _test_possibilities(X) -> None:
X_str = X.astype("object")
# With pandas
expected_transformers_plain = {
"high_card_cat": ["str2", "cat2"],
"low_card_cat": ["str1", "cat1"],
"high_cardinality": ["str2", "cat2"],
"low_cardinality": ["str1", "cat1"],
"numeric": ["int", "float"],
}
vectorizer_cast.fit_transform(X_str)
check_same_transformers(expected_transformers_plain, vectorizer_cast.transformers_)
# With numpy
expected_transformers_np_cast = {
"numeric": [0, 1],
"low_card_cat": [2, 4],
"high_card_cat": [3, 5],
"low_cardinality": [2, 4],
"high_cardinality": [3, 5],
}
vectorizer_cast.fit_transform(X_str.to_numpy())
check_same_transformers(
@@ -353,8 +353,8 @@ def test_with_arrays() -> None:
"""
expected_transformers = {
"numeric": [0, 1],
"low_card_cat": [2, 4],
"high_card_cat": [3, 5],
"low_cardinality": [2, 4],
"high_cardinality": [3, 5],
}
vectorizer = TableVectorizer(
cardinality_threshold=4,
@@ -580,15 +580,15 @@ def test_handle_unknown() -> None:
[
("numeric", "passthrough", ["int", "float"]),
("minhashencoder", "MinHashEncoder", ["str1", "str2"]),
("low_card_cat", "OneHotEncoder", ["cat1", "cat2"]),
("low_cardinality", "OneHotEncoder", ["cat1", "cat2"]),
],
),
(
("mh_cat1", MinHashEncoder(), ["cat1"]),
[
("numeric", "passthrough", ["int", "float"]),
("mh_cat1", "MinHashEncoder", ["cat1"]),
("low_card_cat", "OneHotEncoder", ["str1", "str2", "cat2"]),
("low_cardinality", "OneHotEncoder", ["str1", "str2", "cat2"]),
],
),
],
@@ -673,7 +673,7 @@ def test_mixed_types() -> None:
table_vec.fit_transform(df)
expected_transformers_df = {
"numeric": ["int_str", "float_str", "int_float"],
"low_card_cat": ["bool_str"],
"low_cardinality": ["bool_str"],
}
check_same_transformers(expected_transformers_df, table_vec.transformers_)

@@ -684,7 +684,7 @@ def test_mixed_types() -> None:
table_vec.fit_transform(X)
expected_transformers_array = {
"numeric": [0, 1, 2],
"low_card_cat": [3],
"low_cardinality": [3],
}
check_same_transformers(expected_transformers_array, table_vec.transformers_)

@@ -898,15 +898,15 @@ def __init__(self, n_jobs=None):
n_jobs=None,
).fit(X)
assert table_vectorizer.named_transformers_["numeric"].n_jobs is None
assert table_vectorizer.named_transformers_["low_card_cat"].n_jobs is None
assert table_vectorizer.named_transformers_["low_cardinality"].n_jobs is None

table_vectorizer = TableVectorizer(
numerical_transformer=DummyTransformerWithJobs(n_jobs=2),
low_cardinality_transformer=DummyTransformerWithJobs(n_jobs=None),
n_jobs=None,
).fit(X)
assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2
assert table_vectorizer.named_transformers_["low_card_cat"].n_jobs is None
assert table_vectorizer.named_transformers_["low_cardinality"].n_jobs is None

# 2. Case where `TableVectorizer.n_jobs` is not `None` and we should propagate
# when the underlying transformer `n_jobs` is not set explicitly.
@@ -916,7 +916,7 @@ def __init__(self, n_jobs=None):
n_jobs=2,
).fit(X)
assert table_vectorizer.named_transformers_["numeric"].n_jobs == 2
assert table_vectorizer.named_transformers_["low_card_cat"].n_jobs == 2
assert table_vectorizer.named_transformers_["low_cardinality"].n_jobs == 2

# 3. Case where `TableVectorizer.n_jobs` is not `None` and we should not propagate
# when the underlying transformer `n_jobs` is set explicitly.
@@ -926,7 +926,7 @@ def __init__(self, n_jobs=None):
n_jobs=2,
).fit(X)
assert table_vectorizer.named_transformers_["numeric"].n_jobs == 4
assert table_vectorizer.named_transformers_["low_card_cat"].n_jobs == 2
assert table_vectorizer.named_transformers_["low_cardinality"].n_jobs == 2


def test_table_vectorizer_remainder_cloning():

0 comments on commit c75c235

Please sign in to comment.