Skip to content

Commit

Permalink
MAINT fix nightly builds (#1193)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeromedockes authored and TheooJ committed Dec 11, 2024
1 parent 299a2b9 commit 5edef7b
Show file tree
Hide file tree
Showing 10 changed files with 45 additions and 41 deletions.
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ Bug fixes
:user:`Jérôme Dockès <jeromedockes>` and the matplotlib issue can be tracked
`here <https://github.com/matplotlib/matplotlib/issues/25041>`_.

* Improve the performance of :func:`deduplicate` by removing some unnecessary
computations. :pr:`1193` by :user:`Jérôme Dockès <jeromedockes>`.

Maintenance
-----------
* Make `skrub` compatible with scikit-learn 1.6.
Expand Down
10 changes: 4 additions & 6 deletions skrub/_dataframe/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,13 +500,11 @@ def test_to_datetime(df_module):
s = df_module.make_column("", ["01/02/2020", "02/01/2021", "bad"])
with pytest.raises(ValueError):
ns.to_datetime(s, "%m/%d/%Y", True)
df_module.assert_column_equal(
ns.to_datetime(s, "%m/%d/%Y", False),
df_module.make_column("", [datetime(2020, 1, 2), datetime(2021, 2, 1), None]),
assert ns.to_list(ns.to_datetime(s, "%m/%d/%Y", False)) == ns.to_list(
df_module.make_column("", [datetime(2020, 1, 2), datetime(2021, 2, 1), None])
)
df_module.assert_column_equal(
ns.to_datetime(s, "%d/%m/%Y", False),
df_module.make_column("", [datetime(2020, 2, 1), datetime(2021, 1, 2), None]),
assert ns.to_list(ns.to_datetime(s, "%d/%m/%Y", False)) == ns.to_list(
df_module.make_column("", [datetime(2020, 2, 1), datetime(2021, 1, 2), None])
)
dt_col = ns.col(df_module.example_dataframe, "datetime-col")
assert ns.to_datetime(dt_col, None) is dt_col
Expand Down
4 changes: 2 additions & 2 deletions skrub/_datetime_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ class DatetimeEncoder(SingleColumnTransformer):
0 2024-05-13 12:05:36
1 NaT
2 2024-05-15 13:46:02
Name: login, dtype: datetime64[ns]
Name: login, dtype: datetime64[...]
>>> from skrub import DatetimeEncoder
>>> DatetimeEncoder().fit_transform(login)
Expand Down Expand Up @@ -231,7 +231,7 @@ class DatetimeEncoder(SingleColumnTransformer):
0 2024-05-13 07:05:36-03:00
1 NaT
2 2024-05-15 08:46:02-03:00
Name: login, dtype: datetime64[ns, America/Sao_Paulo]
Name: login, dtype: datetime64[..., America/Sao_Paulo]
>>> encoder.transform(login_sp)['login_hour']
0 7.0
1 NaN
Expand Down
2 changes: 1 addition & 1 deletion skrub/_deduplicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def _guess_clusters(Z, distance_mat, n_jobs=None):
int
number of clusters that maximize the silhouette score.
"""
max_clusters = distance_mat.shape[0]
max_clusters = Z.shape[0]
n_clusters = np.arange(2, max_clusters)
# silhouette score needs a redundant distance matrix
redundant_dist = squareform(distance_mat)
Expand Down
6 changes: 3 additions & 3 deletions skrub/_on_each_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class RejectColumn(ValueError):
>>> df = pd.DataFrame(dict(a=['2020-02-02'], b=[12.5]))
>>> ToDatetime().fit_transform(df['a'])
0 2020-02-02
Name: a, dtype: datetime64[ns]
Name: a, dtype: datetime64[...]
>>> ToDatetime().fit_transform(df['b'])
Traceback (most recent call last):
...
Expand Down Expand Up @@ -340,7 +340,7 @@ class OnEachColumn(TransformerMixin, BaseEstimator):
dtype: object
>>> ToDatetime().fit_transform(df["birthday"])
0 2024-01-29
Name: birthday, dtype: datetime64[ns]
Name: birthday, dtype: datetime64[...]
>>> ToDatetime().fit_transform(df["city"])
Traceback (most recent call last):
...
Expand Down Expand Up @@ -373,7 +373,7 @@ class OnEachColumn(TransformerMixin, BaseEstimator):
datetime column.
>>> transformed.dtypes
birthday datetime64[ns]
birthday datetime64[...]
city object
dtype: object
>>> to_datetime.transformers_
Expand Down
4 changes: 2 additions & 2 deletions skrub/_selectors/_selectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,8 @@ def any_date():
0 2020-03-02 10:30:00 2020-03-02 10:30:00+00:00 2020-03-02 10:30:00
>>> df.dtypes
dt datetime64[ns]
tzdt datetime64[ns, UTC]
dt datetime64[...]
tzdt datetime64[..., UTC]
str_ object
dtype: object
Expand Down
38 changes: 19 additions & 19 deletions skrub/_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,15 @@ class ToDatetime(SingleColumnTransformer):
0 2024-05-05 13:17:52
1 NaT
2 2024-05-07 13:17:52
Name: when, dtype: datetime64[ns]
Name: when, dtype: datetime64[...]
The attributes ``format_``, ``output_dtype_``, ``output_time_zone_``
record information about the conversion result.
>>> to_dt.format_
'%Y-%m-%dT%H:%M:%S'
>>> to_dt.output_dtype_
dtype('<M8[ns]')
dtype('<M8[...]')
>>> to_dt.output_time_zone_ is None
True
Expand All @@ -164,7 +164,7 @@ class ToDatetime(SingleColumnTransformer):
0 2024-05-05 13:17:52
1 NaT
2 2024-05-07 13:17:52
Name: when, dtype: datetime64[ns]
Name: when, dtype: datetime64[...]
>>> ToDatetime(format="%d/%m/%Y").fit_transform(s)
Traceback (most recent call last):
Expand All @@ -179,7 +179,7 @@ class ToDatetime(SingleColumnTransformer):
0 2024-05-05 13:17:52+02:00
1 NaT
2 2024-05-07 13:17:52+02:00
Name: when, dtype: datetime64[ns, Europe/Paris]
Name: when, dtype: datetime64[..., Europe/Paris]
>>> to_dt.fit_transform(s) is s
True
Expand All @@ -188,7 +188,7 @@ class ToDatetime(SingleColumnTransformer):
>>> to_dt.format_ is None
True
>>> to_dt.output_dtype_
datetime64[ns, Europe/Paris]
datetime64[..., Europe/Paris]
>>> to_dt.output_time_zone_
'Europe/Paris'
Expand Down Expand Up @@ -220,13 +220,13 @@ class ToDatetime(SingleColumnTransformer):
0 2024-05-05 13:17:52
1 NaT
2 2024-05-07 13:17:52
Name: when, dtype: datetime64[ns]
Name: when, dtype: datetime64[...]
>>> s = pd.Series(["05/05/2024", None, "07/05/2024"], name="when")
>>> to_dt.transform(s)
0 NaT
1 NaT
2 NaT
Name: when, dtype: datetime64[ns]
Name: when, dtype: datetime64[...]
**Time zones**
Expand All @@ -237,7 +237,7 @@ class ToDatetime(SingleColumnTransformer):
>>> to_dt.fit_transform(s)
0 2020-01-01 02:00:00+00:00
1 2020-01-01 01:00:00+00:00
dtype: datetime64[ns, UTC]
dtype: datetime64[..., UTC]
>>> to_dt.format_
'%Y-%m-%dT%H:%M:%S%z'
>>> to_dt.output_time_zone_
Expand All @@ -249,7 +249,7 @@ class ToDatetime(SingleColumnTransformer):
>>> to_dt.fit_transform(s)
0 2020-01-01 04:00:00
1 2020-01-01 04:00:00
dtype: datetime64[ns]
dtype: datetime64[...]
>>> to_dt.output_time_zone_ is None
True
Expand All @@ -262,10 +262,10 @@ class ToDatetime(SingleColumnTransformer):
>>> s_paris
0 2024-05-07 14:24:49+02:00
1 2024-05-06 14:24:49+02:00
dtype: datetime64[ns, Europe/Paris]
dtype: datetime64[..., Europe/Paris]
>>> to_dt = ToDatetime().fit(s_paris)
>>> to_dt.output_dtype_
datetime64[ns, Europe/Paris]
datetime64[..., Europe/Paris]
Here our converter is set to output datetimes with nanosecond resolution,
localized in "Europe/Paris".
Expand All @@ -276,7 +276,7 @@ class ToDatetime(SingleColumnTransformer):
>>> s_london
0 2024-05-07 13:24:49+01:00
1 2024-05-06 13:24:49+01:00
dtype: datetime64[ns, Europe/London]
dtype: datetime64[..., Europe/London]
Here the timezone is "Europe/London" and the times are offset by 1 hour. During
``transform`` datetimes will be converted to the original dtype and the
Expand All @@ -285,7 +285,7 @@ class ToDatetime(SingleColumnTransformer):
>>> to_dt.transform(s_london)
0 2024-05-07 14:24:49+02:00
1 2024-05-06 14:24:49+02:00
dtype: datetime64[ns, Europe/Paris]
dtype: datetime64[..., Europe/Paris]
Moreover, we may have to transform a timezone-naive column whereas the
transformer was fitted on a timezone-aware column. Note that this is somewhat a
Expand All @@ -296,15 +296,15 @@ class ToDatetime(SingleColumnTransformer):
>>> s_naive
0 2024-05-07 12:24:49
1 2024-05-06 12:24:49
dtype: datetime64[ns]
dtype: datetime64[...]
In this case, we make the arbitrary choice to assume that the timezone-naive
datetimes are in UTC.
>>> to_dt.transform(s_naive)
0 2024-05-07 14:24:49+02:00
1 2024-05-06 14:24:49+02:00
dtype: datetime64[ns, Europe/Paris]
dtype: datetime64[..., Europe/Paris]
Conversely, a transformer fitted on a timezone-naive column can convert
timezone-aware columns. Here also, we assume the naive datetimes were in UTC.
Expand All @@ -313,7 +313,7 @@ class ToDatetime(SingleColumnTransformer):
>>> to_dt.transform(s_london)
0 2024-05-07 12:24:49
1 2024-05-06 12:24:49
dtype: datetime64[ns]
dtype: datetime64[...]
**``%d/%m/%Y`` vs ``%m/%d/%Y``**
Expand All @@ -324,7 +324,7 @@ class ToDatetime(SingleColumnTransformer):
>>> s = pd.Series(["05/23/2024"])
>>> to_dt.fit_transform(s)
0 2024-05-23
dtype: datetime64[ns]
dtype: datetime64[...]
>>> to_dt.format_
'%m/%d/%Y'
Expand All @@ -334,7 +334,7 @@ class ToDatetime(SingleColumnTransformer):
>>> s = pd.Series(["23/05/2024"])
>>> to_dt.fit_transform(s)
0 2024-05-23
dtype: datetime64[ns]
dtype: datetime64[...]
>>> to_dt.format_
'%d/%m/%Y'
Expand All @@ -343,7 +343,7 @@ class ToDatetime(SingleColumnTransformer):
>>> s = pd.Series(["03/05/2024"])
>>> to_dt.fit_transform(s)
0 2024-03-05
dtype: datetime64[ns]
dtype: datetime64[...]
>>> to_dt.format_
'%m/%d/%Y'
Expand Down
2 changes: 1 addition & 1 deletion skrub/_to_float32.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ class ToFloat32(SingleColumnTransformer):
>>> to_float.fit_transform(pd.to_datetime(pd.Series(['2024-05-13'], name='s')))
Traceback (most recent call last):
...
skrub._on_each_column.RejectColumn: Refusing to cast column 's' with dtype 'datetime64[ns]' to numbers.
skrub._on_each_column.RejectColumn: Refusing to cast column 's' with dtype 'datetime64[...]' to numbers.
float32 columns are passed through:
Expand Down
2 changes: 1 addition & 1 deletion skrub/_to_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class ToStr(SingleColumnTransformer):
>>> to_str.fit_transform(pd.to_datetime(pd.Series(['2020-02-02'])))
Traceback (most recent call last):
...
skrub._on_each_column.RejectColumn: Refusing to convert None with dtype 'datetime64[ns]' to strings.
skrub._on_each_column.RejectColumn: Refusing to convert None with dtype 'datetime64[...]' to strings.
However, once a column has been accepted, the output of ``transform`` will
always be strings:
Expand Down
15 changes: 9 additions & 6 deletions skrub/tests/test_table_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,11 +235,11 @@ def test_duplicate_column_names():
(
X,
{
"pd_datetime": "datetime64[ns]",
"np_datetime": "datetime64[ns]",
"dmy-": "datetime64[ns]",
"ymd/": "datetime64[ns]",
"ymd/_hms:": "datetime64[ns]",
"pd_datetime": "datetime",
"np_datetime": "datetime",
"dmy-": "datetime",
"ymd/": "datetime",
"ymd/_hms:": "datetime",
},
),
# Test other types detection
Expand Down Expand Up @@ -285,7 +285,10 @@ def test_auto_cast(X, dict_expected_types):
vectorizer = passthrough_vectorizer()
X_trans = vectorizer.fit_transform(X)
for col in X_trans.columns:
assert dict_expected_types[col] == X_trans[col].dtype
if dict_expected_types[col] == "datetime":
assert sbd.is_any_date(X_trans[col])
else:
assert dict_expected_types[col] == X_trans[col].dtype


def test_auto_cast_missing_categories():
Expand Down

0 comments on commit 5edef7b

Please sign in to comment.