diff --git a/CHANGES.rst b/CHANGES.rst index cb73deaa8..b39510274 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -50,6 +50,9 @@ Bug fixes :user:`Jérôme Dockès <jeromedockes>` and the matplotlib issue can be tracked [here](https://github.com/matplotlib/matplotlib/issues/25041). +* Improve the performance of :func:`deduplicate` by removing some unnecessary + computations. :pr:`1193` by :user:`Jérôme Dockès <jeromedockes>`. + Maintenance ----------- * Make `skrub` compatible with scikit-learn 1.6. diff --git a/skrub/_dataframe/tests/test_common.py b/skrub/_dataframe/tests/test_common.py index b95998767..a9830ea05 100644 --- a/skrub/_dataframe/tests/test_common.py +++ b/skrub/_dataframe/tests/test_common.py @@ -500,13 +500,11 @@ def test_to_datetime(df_module): s = df_module.make_column("", ["01/02/2020", "02/01/2021", "bad"]) with pytest.raises(ValueError): ns.to_datetime(s, "%m/%d/%Y", True) - df_module.assert_column_equal( - ns.to_datetime(s, "%m/%d/%Y", False), - df_module.make_column("", [datetime(2020, 1, 2), datetime(2021, 2, 1), None]), + assert ns.to_list(ns.to_datetime(s, "%m/%d/%Y", False)) == ns.to_list( + df_module.make_column("", [datetime(2020, 1, 2), datetime(2021, 2, 1), None]) ) - df_module.assert_column_equal( - ns.to_datetime(s, "%d/%m/%Y", False), - df_module.make_column("", [datetime(2020, 2, 1), datetime(2021, 1, 2), None]), + assert ns.to_list(ns.to_datetime(s, "%d/%m/%Y", False)) == ns.to_list( + df_module.make_column("", [datetime(2020, 2, 1), datetime(2021, 1, 2), None]) ) dt_col = ns.col(df_module.example_dataframe, "datetime-col") assert ns.to_datetime(dt_col, None) is dt_col diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index 0e4198705..fea47ded0 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -128,7 +128,7 @@ class DatetimeEncoder(SingleColumnTransformer): 0 2024-05-13 12:05:36 1 NaT 2 2024-05-15 13:46:02 - Name: login, dtype: datetime64[ns] + Name: login, dtype: datetime64[...]
>>> from skrub import DatetimeEncoder >>> DatetimeEncoder().fit_transform(login) @@ -231,7 +231,7 @@ class DatetimeEncoder(SingleColumnTransformer): 0 2024-05-13 07:05:36-03:00 1 NaT 2 2024-05-15 08:46:02-03:00 - Name: login, dtype: datetime64[ns, America/Sao_Paulo] + Name: login, dtype: datetime64[..., America/Sao_Paulo] >>> encoder.transform(login_sp)['login_hour'] 0 7.0 1 NaN diff --git a/skrub/_deduplicate.py b/skrub/_deduplicate.py index ce6426bd4..3aeb5dc7b 100644 --- a/skrub/_deduplicate.py +++ b/skrub/_deduplicate.py @@ -72,7 +72,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None): int number of clusters that maximize the silhouette score. """ - max_clusters = distance_mat.shape[0] + max_clusters = Z.shape[0] n_clusters = np.arange(2, max_clusters) # silhouette score needs a redundant distance matrix redundant_dist = squareform(distance_mat) diff --git a/skrub/_on_each_column.py b/skrub/_on_each_column.py index 68705d6c2..fd736d6e6 100644 --- a/skrub/_on_each_column.py +++ b/skrub/_on_each_column.py @@ -38,7 +38,7 @@ class RejectColumn(ValueError): >>> df = pd.DataFrame(dict(a=['2020-02-02'], b=[12.5])) >>> ToDatetime().fit_transform(df['a']) 0 2020-02-02 - Name: a, dtype: datetime64[ns] + Name: a, dtype: datetime64[...] >>> ToDatetime().fit_transform(df['b']) Traceback (most recent call last): ... @@ -340,7 +340,7 @@ class OnEachColumn(TransformerMixin, BaseEstimator): dtype: object >>> ToDatetime().fit_transform(df["birthday"]) 0 2024-01-29 - Name: birthday, dtype: datetime64[ns] + Name: birthday, dtype: datetime64[...] >>> ToDatetime().fit_transform(df["city"]) Traceback (most recent call last): ... @@ -373,7 +373,7 @@ class OnEachColumn(TransformerMixin, BaseEstimator): datetime column. >>> transformed.dtypes - birthday datetime64[ns] + birthday datetime64[...] 
city object dtype: object >>> to_datetime.transformers_ diff --git a/skrub/_selectors/_selectors.py b/skrub/_selectors/_selectors.py index 08fd58a94..a3a9a8a80 100644 --- a/skrub/_selectors/_selectors.py +++ b/skrub/_selectors/_selectors.py @@ -312,8 +312,8 @@ def any_date(): 0 2020-03-02 10:30:00 2020-03-02 10:30:00+00:00 2020-03-02 10:30:00 >>> df.dtypes - dt datetime64[ns] - tzdt datetime64[ns, UTC] + dt datetime64[...] + tzdt datetime64[..., UTC] str_ object dtype: object diff --git a/skrub/_to_datetime.py b/skrub/_to_datetime.py index 943bf4906..0a21a28f0 100644 --- a/skrub/_to_datetime.py +++ b/skrub/_to_datetime.py @@ -145,7 +145,7 @@ class ToDatetime(SingleColumnTransformer): 0 2024-05-05 13:17:52 1 NaT 2 2024-05-07 13:17:52 - Name: when, dtype: datetime64[ns] + Name: when, dtype: datetime64[...] The attributes ``format_``, ``output_dtype_``, ``output_time_zone_`` record information about the conversion result. @@ -153,7 +153,7 @@ class ToDatetime(SingleColumnTransformer): >>> to_dt.format_ '%Y-%m-%dT%H:%M:%S' >>> to_dt.output_dtype_ - dtype('<M8[ns]') + dtype('<M8[...]') >>> to_dt.output_time_zone_ is None True @@ -164,7 +164,7 @@ class ToDatetime(SingleColumnTransformer): 0 2024-05-05 13:17:52 1 NaT 2 2024-05-07 13:17:52 - Name: when, dtype: datetime64[ns] + Name: when, dtype: datetime64[...]
>>> ToDatetime(format="%d/%m/%Y").fit_transform(s) Traceback (most recent call last): @@ -179,7 +179,7 @@ class ToDatetime(SingleColumnTransformer): 0 2024-05-05 13:17:52+02:00 1 NaT 2 2024-05-07 13:17:52+02:00 - Name: when, dtype: datetime64[ns, Europe/Paris] + Name: when, dtype: datetime64[..., Europe/Paris] >>> to_dt.fit_transform(s) is s True @@ -188,7 +188,7 @@ class ToDatetime(SingleColumnTransformer): >>> to_dt.format_ is None True >>> to_dt.output_dtype_ - datetime64[ns, Europe/Paris] + datetime64[..., Europe/Paris] >>> to_dt.output_time_zone_ 'Europe/Paris' @@ -220,13 +220,13 @@ class ToDatetime(SingleColumnTransformer): 0 2024-05-05 13:17:52 1 NaT 2 2024-05-07 13:17:52 - Name: when, dtype: datetime64[ns] + Name: when, dtype: datetime64[...] >>> s = pd.Series(["05/05/2024", None, "07/05/2024"], name="when") >>> to_dt.transform(s) 0 NaT 1 NaT 2 NaT - Name: when, dtype: datetime64[ns] + Name: when, dtype: datetime64[...] **Time zones** @@ -237,7 +237,7 @@ class ToDatetime(SingleColumnTransformer): >>> to_dt.fit_transform(s) 0 2020-01-01 02:00:00+00:00 1 2020-01-01 01:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[..., UTC] >>> to_dt.format_ '%Y-%m-%dT%H:%M:%S%z' >>> to_dt.output_time_zone_ @@ -249,7 +249,7 @@ class ToDatetime(SingleColumnTransformer): >>> to_dt.fit_transform(s) 0 2020-01-01 04:00:00 1 2020-01-01 04:00:00 - dtype: datetime64[ns] + dtype: datetime64[...] >>> to_dt.output_time_zone_ is None True @@ -262,10 +262,10 @@ class ToDatetime(SingleColumnTransformer): >>> s_paris 0 2024-05-07 14:24:49+02:00 1 2024-05-06 14:24:49+02:00 - dtype: datetime64[ns, Europe/Paris] + dtype: datetime64[..., Europe/Paris] >>> to_dt = ToDatetime().fit(s_paris) >>> to_dt.output_dtype_ - datetime64[ns, Europe/Paris] + datetime64[..., Europe/Paris] Here our converter is set to output datetimes with nanosecond resolution, localized in "Europe/Paris". 
@@ -276,7 +276,7 @@ class ToDatetime(SingleColumnTransformer): >>> s_london 0 2024-05-07 13:24:49+01:00 1 2024-05-06 13:24:49+01:00 - dtype: datetime64[ns, Europe/London] + dtype: datetime64[..., Europe/London] Here the timezone is "Europe/London" and the times are offset by 1 hour. During ``transform`` datetimes will be converted to the original dtype and the @@ -285,7 +285,7 @@ class ToDatetime(SingleColumnTransformer): >>> to_dt.transform(s_london) 0 2024-05-07 14:24:49+02:00 1 2024-05-06 14:24:49+02:00 - dtype: datetime64[ns, Europe/Paris] + dtype: datetime64[..., Europe/Paris] Moreover, we may have to transform a timezone-naive column whereas the transformer was fitted on a timezone-aware column. Note that is somewhat a @@ -296,7 +296,7 @@ class ToDatetime(SingleColumnTransformer): >>> s_naive 0 2024-05-07 12:24:49 1 2024-05-06 12:24:49 - dtype: datetime64[ns] + dtype: datetime64[...] In this case, we make the arbitrary choice to assume that the timezone-naive datetimes are in UTC. @@ -304,7 +304,7 @@ class ToDatetime(SingleColumnTransformer): >>> to_dt.transform(s_naive) 0 2024-05-07 14:24:49+02:00 1 2024-05-06 14:24:49+02:00 - dtype: datetime64[ns, Europe/Paris] + dtype: datetime64[..., Europe/Paris] Conversely, a transformer fitted on a timezone-naive column can convert timezone-aware columns. Here also, we assume the naive datetimes were in UTC. @@ -313,7 +313,7 @@ class ToDatetime(SingleColumnTransformer): >>> to_dt.transform(s_london) 0 2024-05-07 12:24:49 1 2024-05-06 12:24:49 - dtype: datetime64[ns] + dtype: datetime64[...] **``%d/%m/%Y`` vs ``%m/%d/%Y``** @@ -324,7 +324,7 @@ class ToDatetime(SingleColumnTransformer): >>> s = pd.Series(["05/23/2024"]) >>> to_dt.fit_transform(s) 0 2024-05-23 - dtype: datetime64[ns] + dtype: datetime64[...] >>> to_dt.format_ '%m/%d/%Y' @@ -334,7 +334,7 @@ class ToDatetime(SingleColumnTransformer): >>> s = pd.Series(["23/05/2024"]) >>> to_dt.fit_transform(s) 0 2024-05-23 - dtype: datetime64[ns] + dtype: datetime64[...] 
>>> to_dt.format_ '%d/%m/%Y' @@ -343,7 +343,7 @@ class ToDatetime(SingleColumnTransformer): >>> s = pd.Series(["03/05/2024"]) >>> to_dt.fit_transform(s) 0 2024-03-05 - dtype: datetime64[ns] + dtype: datetime64[...] >>> to_dt.format_ '%m/%d/%Y' diff --git a/skrub/_to_float32.py b/skrub/_to_float32.py index 26e8303ba..4cf1185a2 100644 --- a/skrub/_to_float32.py +++ b/skrub/_to_float32.py @@ -158,7 +158,7 @@ class ToFloat32(SingleColumnTransformer): >>> to_float.fit_transform(pd.to_datetime(pd.Series(['2024-05-13'], name='s'))) Traceback (most recent call last): ... - skrub._on_each_column.RejectColumn: Refusing to cast column 's' with dtype 'datetime64[ns]' to numbers. + skrub._on_each_column.RejectColumn: Refusing to cast column 's' with dtype 'datetime64[...]' to numbers. float32 columns are passed through: diff --git a/skrub/_to_str.py b/skrub/_to_str.py index 5cf7de2e3..e8ff19cac 100644 --- a/skrub/_to_str.py +++ b/skrub/_to_str.py @@ -100,7 +100,7 @@ class ToStr(SingleColumnTransformer): >>> to_str.fit_transform(pd.to_datetime(pd.Series(['2020-02-02']))) Traceback (most recent call last): ... - skrub._on_each_column.RejectColumn: Refusing to convert None with dtype 'datetime64[ns]' to strings. + skrub._on_each_column.RejectColumn: Refusing to convert None with dtype 'datetime64[...]' to strings. 
However, once a column has been accepted, the output of ``transform`` will always be strings: diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index a16cf17f1..ec1fc023b 100644 --- a/skrub/tests/test_table_vectorizer.py +++ b/skrub/tests/test_table_vectorizer.py @@ -235,11 +235,11 @@ def test_duplicate_column_names(): ( X, { - "pd_datetime": "datetime64[ns]", - "np_datetime": "datetime64[ns]", - "dmy-": "datetime64[ns]", - "ymd/": "datetime64[ns]", - "ymd/_hms:": "datetime64[ns]", + "pd_datetime": "datetime", + "np_datetime": "datetime", + "dmy-": "datetime", + "ymd/": "datetime", + "ymd/_hms:": "datetime", }, ), # Test other types detection @@ -285,7 +285,10 @@ def test_auto_cast(X, dict_expected_types): vectorizer = passthrough_vectorizer() X_trans = vectorizer.fit_transform(X) for col in X_trans.columns: - assert dict_expected_types[col] == X_trans[col].dtype + if dict_expected_types[col] == "datetime": + assert sbd.is_any_date(X_trans[col]) + else: + assert dict_expected_types[col] == X_trans[col].dtype def test_auto_cast_missing_categories():