From 2bda119b27f56541f6b7ab77e69236c2e66649d3 Mon Sep 17 00:00:00 2001 From: Vincent M Date: Thu, 9 Nov 2023 11:37:09 +0100 Subject: [PATCH] MAIN Improve `DatetimeEncoder` (#784) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jérôme Dockès --- CHANGES.rst | 9 + doc/api.rst | 10 +- doc/conf.py | 1 + examples/03_datetime_encoder.py | 181 +++--- skrub/__init__.py | 3 +- skrub/_agg_joiner.py | 19 +- skrub/_datetime_encoder.py | 791 +++++++++++++++++++------ skrub/tests/test_datetime_encoder.py | 826 +++++++++++++-------------- skrub/tests/test_table_vectorizer.py | 9 +- 9 files changed, 1128 insertions(+), 721 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c366c247c..4d6a29eb8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -15,6 +15,10 @@ development and backward compatibility is not ensured. Major changes ------------- +* :func:`to_datetime` is now available to support pandas.to_datetime + over dataframes and 2d arrays. + :pr:`784` by :user:`Vincent Maladiere ` + * Some parameters of :class:`Joiner` have changed. The goal is to harmonize parameters across all estimator that perform join(-like) operations, as discussed in `#751 `_. @@ -57,6 +61,11 @@ Major changes Minor changes ------------- +* :class:`DatetimeEncoder` doesn't remove constant features anymore. + It also supports an 'errors' argument to raise or coerce errors during + transform, and a 'add_total_seconds' argument to include the number of + seconds since Epoch. + :pr:`784` by :user:`Vincent Maladiere ` * Scaling of ``matching_score`` in :func:`fuzzy_join` is now between 0 and 1; it used to be between 0.5 and 1. Moreover, the division by 0 error that occurred when all rows had a perfect match has been fixed. :pr:`802` by :user:`Jérôme Dockès `. diff --git a/doc/api.rst b/doc/api.rst index 99acf2807..730fe37cb 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -79,7 +79,7 @@ This page lists all available functions and classes of `skrub`. .. 
raw:: html -

Other encoders

+

Dealing with dates

.. autosummary:: :toctree: generated/ @@ -89,6 +89,14 @@ This page lists all available functions and classes of `skrub`. DatetimeEncoder +.. autosummary:: + :toctree: generated/ + :template: function.rst + :nosignatures: + :caption: Converting datetime columns in a table + + to_datetime + .. raw:: html

Deduplication: merging variants of the same entry

diff --git a/doc/conf.py b/doc/conf.py index 710f4d69a..b1bccad12 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -504,6 +504,7 @@ def notebook_modification_function(notebook_content, notebook_filename): "SimilarityEncoder": "skrub.SimilarityEncoder", "DatetimeEncoder": "skrub.DatetimeEncoder", "deduplicate": "skrub.deduplicate", + "to_datetime": "skrub.to_datetime", "TableVectorizer": "skrub.TableVectorizer", "DatasetInfoOnly": "skrub.datasets._fetching.DatasetInfoOnly", "DatasetAll": "skrub.datasets._fetching.DatasetAll", diff --git a/examples/03_datetime_encoder.py b/examples/03_datetime_encoder.py index 358187a7b..d89dff662 100644 --- a/examples/03_datetime_encoder.py +++ b/examples/03_datetime_encoder.py @@ -34,6 +34,9 @@ .. |HGBR| replace:: :class:`~sklearn.ensemble.HistGradientBoostingRegressor` + +.. |to_datetime| replace:: + :func:`~skrub.to_datetime` """ @@ -46,19 +49,26 @@ # on the location, date and time of measurement. from pprint import pprint - import pandas as pd data = pd.read_csv( "https://raw.githubusercontent.com/pandas-dev/pandas" "/main/doc/data/air_quality_no2_long.csv" -) +).sort_values("date.utc") # Extract our input data (X) and the target column (y) y = data["value"] X = data[["city", "date.utc"]] X +############################################################################### +# We convert the dataframe date columns using |to_datetime|. Notice how +# we don't need to specify the columns to convert. +from skrub import to_datetime + +X = to_datetime(X) +X.dtypes + ############################################################################### # Encoding the features # ..................... @@ -73,14 +83,12 @@ # lower units, as they are probably unimportant. 
from sklearn.preprocessing import OneHotEncoder - -from skrub import DatetimeEncoder - from sklearn.compose import make_column_transformer +from skrub import DatetimeEncoder encoder = make_column_transformer( (OneHotEncoder(handle_unknown="ignore"), ["city"]), - (DatetimeEncoder(add_day_of_the_week=True, extract_until="minute"), ["date.utc"]), + (DatetimeEncoder(add_day_of_the_week=True, resolution="minute"), ["date.utc"]), remainder="drop", ) @@ -88,12 +96,9 @@ pprint(encoder.get_feature_names_out()) ############################################################################### -# We see that the encoder is working as expected: the "date.utc" column has -# been replaced by features extracting the month, day, hour, and day of the -# week information. -# -# Note the year and minute features are not present, this is because they -# have been removed by the encoder as they are constant the whole period. +# We see that the encoder is working as expected: the ``"date.utc"`` column has +# been replaced by features extracting the year, month, day, hour, minute, +# day of the week and total seconds since Epoch. 
############################################################################### # One-liner with the |TableVectorizer| @@ -104,8 +109,7 @@ from skrub import TableVectorizer -table_vec = TableVectorizer() -table_vec.fit_transform(X) +table_vec = TableVectorizer().fit(X) pprint(table_vec.get_feature_names_out()) ############################################################################### @@ -116,8 +120,7 @@ table_vec = TableVectorizer( datetime_transformer=DatetimeEncoder(add_day_of_the_week=True), -) -table_vec.fit_transform(X) +).fit(X) pprint(table_vec.get_feature_names_out()) ############################################################################### @@ -144,14 +147,9 @@ # ```py # from sklearn.experimental import enable_hist_gradient_boosting # ``` - -import numpy as np from sklearn.ensemble import HistGradientBoostingRegressor from sklearn.pipeline import make_pipeline -table_vec = TableVectorizer( - datetime_transformer=DatetimeEncoder(add_day_of_the_week=True), -) pipeline = make_pipeline(table_vec, HistGradientBoostingRegressor()) ############################################################################### @@ -164,11 +162,6 @@ # # Instead, we can use the |TimeSeriesSplit|, # which ensures that the test set is always in the future. - -sorted_indices = np.argsort(X["date.utc"]) -X = X.iloc[sorted_indices] -y = y.iloc[sorted_indices] - from sklearn.model_selection import TimeSeriesSplit, cross_val_score cross_val_score( @@ -185,82 +178,71 @@ # # The mean squared error is not obvious to interpret, so we compare # visually the prediction of our model with the actual values. 
- +import numpy as np import matplotlib.pyplot as plt -from matplotlib.dates import AutoDateFormatter, AutoDateLocator - -X_train = X[X["date.utc"] < "2019-06-01"] -X_test = X[X["date.utc"] >= "2019-06-01"] -y_train = y[X["date.utc"] < "2019-06-01"] -y_test = y[X["date.utc"] >= "2019-06-01"] +mask_train = X["date.utc"] < "2019-06-01" +X_train, X_test = X.loc[mask_train], X.loc[~mask_train] +y_train, y_test = y.loc[mask_train], y.loc[~mask_train] pipeline.fit(X_train, y_train) +y_pred = pipeline.predict(X_test) all_cities = X_test["city"].unique() -fig, axs = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) -fig.subplots_adjust(hspace=0.5) +fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) +for ax, city in zip(axes, all_cities): + mask_prediction = X_test["city"] == city + date_prediction = X_test.loc[mask_prediction]["date.utc"] + y_prediction = y_pred[mask_prediction] -for i, city in enumerate(all_cities): - axs[i].plot( - X.loc[X.city == city, "date.utc"], - y.loc[X.city == city], - label="Actual", - ) - axs[i].plot( - X_test.loc[X_test.city == city, "date.utc"], - pipeline.predict(X_test.loc[X_test.city == city]), - label="Predicted", + mask_reference = X["city"] == city + date_reference = X.loc[mask_reference]["date.utc"] + y_reference = y[mask_reference] + + ax.plot(date_reference, y_reference, label="Actual") + ax.plot(date_prediction, y_prediction, label="Predicted") + + ax.set( + ylabel="NO2", + title=city, ) - axs[i].set_title(city) - axs[i].set_ylabel("NO2") - xtick_locator = AutoDateLocator(maxticks=8) - xtick_formatter = AutoDateFormatter(xtick_locator) - axs[i].xaxis.set_major_locator(xtick_locator) - axs[i].xaxis.set_major_formatter(xtick_formatter) - axs[i].legend() + ax.legend() + +fig.subplots_adjust(hspace=0.5) plt.show() ############################################################################### # Let's zoom on a few days: -X_zoomed = X[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")] 
-y_zoomed = y[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")] - -X_train_zoomed = X_zoomed[X_zoomed["date.utc"] < "2019-06-03"] -X_test_zoomed = X_zoomed[X_zoomed["date.utc"] >= "2019-06-03"] +mask_zoom_reference = (X["date.utc"] >= "2019-06-01") & (X["date.utc"] < "2019-06-04") +mask_zoom_prediction = (X_test["date.utc"] >= "2019-06-01") & ( + X_test["date.utc"] < "2019-06-04" +) -y_train_zoomed = y[X["date.utc"] < "2019-06-03"] -y_test_zoomed = y[X["date.utc"] >= "2019-06-03"] +all_cities = ["Paris", "London"] +fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9)) +for ax, city in zip(axes, all_cities): + mask_prediction = (X_test["city"] == city) & mask_zoom_prediction + date_prediction = X_test.loc[mask_prediction]["date.utc"] + y_prediction = y_pred[mask_prediction] -zoomed_cities = X_test_zoomed["city"].unique() + mask_reference = (X["city"] == city) & mask_zoom_reference + date_reference = X.loc[mask_reference]["date.utc"] + y_reference = y[mask_reference] -fig, axs = plt.subplots(nrows=len(zoomed_cities), ncols=1, figsize=(12, 9)) -fig.subplots_adjust(hspace=0.5) + ax.plot(date_reference, y_reference, label="Actual") + ax.plot(date_prediction, y_prediction, label="Predicted") -for i, city in enumerate(zoomed_cities): - axs[i].plot( - X_zoomed.loc[X_zoomed["city"] == city, "date.utc"], - y_zoomed.loc[X_zoomed["city"] == city], - label="Actual", - ) - axs[i].plot( - X_test_zoomed.loc[X_test_zoomed["city"] == city, "date.utc"], - pipeline.predict(X_test_zoomed.loc[X_test_zoomed["city"] == city]), - label="Predicted", + ax.set( + ylabel="NO2", + title=city, ) - axs[i].set_title(city) - axs[i].set_ylabel("NO2") - - xtick_locator = AutoDateLocator(maxticks=8) - xtick_formatter = AutoDateFormatter(xtick_locator) - axs[i].xaxis.set_major_locator(xtick_locator) - axs[i].xaxis.set_major_formatter(xtick_formatter) + ax.legend() - axs[i].legend() plt.show() + 
############################################################################### # Features importance # ------------------- @@ -280,27 +262,28 @@ # In this case, we don't use a pipeline, because we want to compute the # importance of the features created by the DatetimeEncoder -X_ = table_vec.fit_transform(X) -reg = HistGradientBoostingRegressor().fit(X_, y) -result = permutation_importance(reg, X_, y, n_repeats=10, random_state=0) -std = result.importances_std -importances = result.importances_mean -indices = np.argsort(importances) -# Sort from least to most -indices = list(reversed(indices)) - -plt.figure(figsize=(12, 9)) -plt.title("Feature importances") -n = len(indices) -labels = np.array(table_vec.get_feature_names_out())[indices] -plt.barh(range(n), importances[indices], color="b", yerr=std[indices]) -plt.yticks(range(n), labels, size=15) -plt.tight_layout(pad=1) -plt.show() +X_transform = table_vec.fit_transform(X) +feature_names = table_vec.get_feature_names_out() + +model = HistGradientBoostingRegressor().fit(X_transform, y) +result = permutation_importance(model, X_transform, y, n_repeats=10, random_state=0) + +result = pd.DataFrame( + dict( + feature_names=feature_names, + std=result.importances_std, + importances=result.importances_mean, + ) +).sort_values("importances", ascending=False) + +result.plot.barh( + y="importances", x="feature_names", title="Feature Importances", figsize=(12, 9) +) +plt.tight_layout() ############################################################################### -# We can see that the hour of the day is the most important feature, -# which seems reasonable. +# We can see that the total seconds since Epoch and the hour of the day +# are the most important features, which seems reasonable. 
# # Conclusion # ---------- diff --git a/skrub/__init__.py b/skrub/__init__.py index 868632080..a55cc134a 100644 --- a/skrub/__init__.py +++ b/skrub/__init__.py @@ -5,7 +5,7 @@ from ._agg_joiner import AggJoiner, AggTarget from ._check_dependencies import check_dependencies -from ._datetime_encoder import DatetimeEncoder +from ._datetime_encoder import DatetimeEncoder, to_datetime from ._deduplicate import compute_ngram_distance, deduplicate from ._fuzzy_join import fuzzy_join from ._gap_encoder import GapEncoder @@ -34,6 +34,7 @@ "TargetEncoder", "deduplicate", "compute_ngram_distance", + "to_datetime", "AggJoiner", "AggTarget", "SelectCols", diff --git a/skrub/_agg_joiner.py b/skrub/_agg_joiner.py index ed369964b..0e461c3bf 100644 --- a/skrub/_agg_joiner.py +++ b/skrub/_agg_joiner.py @@ -155,7 +155,7 @@ class AggJoiner(BaseEstimator, TransformerMixin): airportId airportName company_mode_1 total_passengers_mean_1 0 1 Paris CDG AF 103.33... 1 2 NY JFK DL 80.00... - """ # noqa: E501 + """ def __init__( self, @@ -416,18 +416,19 @@ class AggTarget(BaseEstimator, TransformerMixin): ... "company": ["DL", "AF", "AF", "DL", "DL", "TR"], ... }) >>> y = np.array([1, 1, 0, 0, 1, 1]) - >>> join_agg = AggTarget( + >>> agg_target = AggTarget( ... main_key="company", ... operation=["mean", "max"], ... ) - >>> join_agg.fit_transform(X, y) + >>> agg_target.fit_transform(X, y) flightId from_airport ... y_0_max_target y_0_mean_target - 0 1 1 ... 1 0.66... - 1 2 1 ... 1 0.50... - 2 3 1 ... 1 0.50... - 3 4 2 ... 1 0.66... - 4 5 2 ... 1 0.66... - 5 6 2 ... 1 1.00... + 0 1 1 ... 1 0.666667 + 1 2 1 ... 1 0.500000 + 2 3 1 ... 1 0.500000 + 3 4 2 ... 1 0.666667 + 4 5 2 ... 1 0.666667 + 5 6 2 ... 
1 1.000000 + [6 rows x 6 columns] """ diff --git a/skrub/_datetime_encoder.py b/skrub/_datetime_encoder.py index dccb39301..21839c41d 100644 --- a/skrub/_datetime_encoder.py +++ b/skrub/_datetime_encoder.py @@ -1,14 +1,18 @@ -from typing import Literal +import warnings +from collections import defaultdict +from typing import Iterable import numpy as np import pandas as pd -from numpy.typing import ArrayLike, NDArray +from pandas._libs.tslibs.parsing import guess_datetime_format from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_array +from sklearn.utils.fixes import parse_version from sklearn.utils.validation import check_is_fitted -from skrub._utils import check_input +from ._dataframe._namespace import get_df_namespace -WORD_TO_ALIAS: dict[str, str] = { +WORD_TO_ALIAS = { "year": "Y", "month": "M", "day": "D", @@ -18,78 +22,490 @@ "microsecond": "us", "nanosecond": "N", } -TIME_LEVELS: list[str] = list(WORD_TO_ALIAS.keys()) -AcceptedTimeValues = Literal[ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", -] - - -class DatetimeEncoder(BaseEstimator, TransformerMixin): - """Transform each datetime column into several numeric columns \ - for temporal features (e.g. "year", "month", "day"...). - - Constant extracted features are dropped; for instance, if the year is - always the same in a feature, the extracted "year" column won't be added. +TIME_LEVELS = list(WORD_TO_ALIAS) + + +def _is_pandas_format_mixed_available(): + pandas_version = pd.__version__ + min_pandas_version = "2.0.0" + return parse_version(min_pandas_version) < parse_version(pandas_version) + + +MIXED_FORMAT = "mixed" if _is_pandas_format_mixed_available() else None + + +def to_datetime( + X, + errors="coerce", + **kwargs, +): + """Convert the columns of a dataframe or 2d array into a datetime representation. + + This function augments :func:`pandas.to_datetime` by supporting dataframes + and 2d array inputs. 
It only attempts to convert columns whose dtype is + object or string. Numeric columns are skipped and preserved in the output. + + Use the 'format' keyword to force a specific datetime format. See more details in + the parameters section. + + Parameters + ---------- + X : Pandas or Polars dataframe, 2d-array or any input accepted \ + by ``pd.to_datetime`` + The object to convert to a datetime. + + errors : {'coerce', 'raise'}, default 'coerce' + When set to 'raise', errors will be raised only when the following conditions + are satisfied, for each column ``X_col``: + - After converting to numpy, the column dtype is np.object_ or np.str_ + - Each entry of the column is datetime-parsable, i.e. + ``pd.to_datetime(X_col, format="mixed")`` doesn't raise an error. + This step is conservative, because e.g. + ``["2020-01-01", "hello", "2020-01-01"]`` + is not considered datetime-parsable, so we won't attempt to convert it. + - The column as a whole is not datetime-parsable, due to a clash of datetime + format, e.g. '2020/01/01' and '2020-01-01'. + + When set to ``'coerce'``, the entries of ``X_col`` that should have raised + an error are set to ``NaT`` instead. + You can choose which format to use with the keyword argument ``format``, as with + ``pd.to_datetime``, e.g. ``to_datetime(X_col, format='%Y/%m/%d')``. + Combined with ``errors='coerce'``, this will convert all entries that don't + match this format to ``NaT``. + + Note that the ``'ignore'`` option is not used and will raise an error. + + **kwargs : key, value mappings + Other keyword arguments are passed down to :func:`pandas.to_datetime`. + + One notable argument is 'format'. Setting a format overwrites + the datetime format guessing behavior of this function for all columns. + + Note that we don't encourage you to use the dayfirst or yearfirst arguments, since + their behavior is ambiguous and might not be applied at all. + + Moreover, this function raises an error if 'unit' is set to any value. 
+ This is because, in ``pandas.to_datetime``, 'unit' is specific to timestamps, + whereas in ``skrub.to_datetime`` we don't attempt to parse numeric columns. + + Returns + ------- + datetime + Return type depends on input. + - dataframes, series and 2d arrays return the same type + - otherwise return the same output as :func:`pandas.to_datetime`. + + See Also + -------- + :func:`pandas.to_datetime` + Convert argument to datetime. + + Examples + -------- + >>> X = pd.DataFrame(dict(a=[1, 2], b=["2021-01-01", "2021-02-02"])) + >>> X + a b + 0 1 2021-01-01 + 1 2 2021-02-02 + >>> to_datetime(X) + a b + 0 1 2021-01-01 + 1 2 2021-02-02 + """ + errors_options = ["coerce", "raise"] + if errors not in errors_options: + raise ValueError(f"errors options are {errors_options!r}, got {errors!r}.") + kwargs["errors"] = errors + + if "unit" in kwargs: + raise ValueError( + "'unit' is not a parameter of skrub.to_datetime; it is only meaningful " + "when applying pandas.to_datetime to a numerical column" + ) + + # dataframe + if hasattr(X, "__dataframe__"): + return _to_datetime_dataframe(X, **kwargs) + + # series, this attribute is available since Pandas 2.1.0 + elif hasattr(X, "__column_consortium_standard__"): + return _to_datetime_series(X, **kwargs) + + # 2d array + elif isinstance(X, Iterable) and np.asarray(X).ndim == 2: + X = _to_datetime_2d_array(np.asarray(X), **kwargs) + return np.vstack(X).T + + # 1d array + elif isinstance(X, Iterable) and np.asarray(X).ndim == 1: + return _to_datetime_1d_array(np.asarray(X), **kwargs) + + # scalar or unknown type + elif np.asarray(X).ndim == 0: + return _to_datetime_scalar(X, **kwargs) + + else: + raise TypeError( + "X must be a Dataframe, series, 2d array or any " + f"valid input for ``pd.to_datetime``. Got {X=!r}." + ) + + +def _to_datetime_dataframe(X, **kwargs): + """Dataframe specialization of ``_to_datetime_2d``. 
+ + Parameters + ---------- + X : Pandas or Polars dataframe + + Returns + ------- + X : Pandas or Polars dataframe + """ + skrub_px, _ = get_df_namespace(X) + index = getattr(X, "index", None) + X_split = [X[col].to_numpy() for col in X.columns] + X_split = _to_datetime_2d(X_split, **kwargs) + X_split = {col: X_split[col_idx] for col_idx, col in enumerate(X.columns)} + return skrub_px.make_dataframe(X_split, index=index) + + +def _to_datetime_series(X, **kwargs): + """Series specialization of :func:`pandas.to_datetime`. + + Parameters + ---------- + X : Pandas or Polars series + + Returns + ------- + X : Pandas or Polars series + """ + skrub_px, _ = get_df_namespace(X.to_frame()) + index = getattr(X, "index", None) + name = X.name + X_split = [X.to_numpy()] + X_split = _to_datetime_2d(X_split, **kwargs) + return skrub_px.make_series(X_split[0], index=index, name=name) + + +def _to_datetime_2d_array(X, **kwargs): + """2d array specialization of ``_to_datetime_2d``. + + Parameters + ---------- + X : ndarray of shape ``(n_samples, n_features)`` + + Returns + ------- + X_split : list of array, of shape ``n_features`` + """ + X_split = list(X.T) + return _to_datetime_2d(X_split, **kwargs) + + +def _to_datetime_1d_array(X, **kwargs): + X_split = [X] + X_split = _to_datetime_2d(X_split, **kwargs) + return np.asarray(X_split[0]) + + +def _to_datetime_scalar(X, **kwargs): + X_split = [np.atleast_1d(X)] + X_split = _to_datetime_2d(X_split, **kwargs) + return X_split[0][0] + + +def _to_datetime_2d( + X_split, + indices=None, + index_to_format=None, + format=None, + **kwargs, +): + """Convert datetime parsable columns from a 2d array or dataframe \ + to datetime format. + + The conversion is done inplace. + + Parameters + ---------- + X_split : list of 1d array of length n_features + The 2d input, chunked into a list of array. 
This format allows us + to treat each column individually and preserve their dtype, because + dataframe.to_numpy() casts all columns to object when at least one + column dtype is object. + + indices : list of int, default=None + Indices of the parsable columns to convert. + If None, indices are computed using the current input X. + + index_to_format : mapping of int to str, default=None + Dictionary mapping column indices to their datetime format. + It defines the format parameter for each column when calling + pd.to_datetime. + + If indices is None, ``index_to_format`` is computed using the + current input X. + If format is not None, all values of ``index_to_format`` are set + to format. + + format : str, default=None + When format is not None, it overwrites the values in index_to_format. + + Returns + ------- + X_split : list of 1d array of length n_features + """ + if indices is None: + indices, index_to_format = _get_datetime_column_indices(X_split) + + # format overwrites index_to_format + if format is not None: + index_to_format = {col_idx: format for col_idx in indices} + + for col_idx in indices: + X_split[col_idx] = pd.to_datetime( + X_split[col_idx], format=index_to_format[col_idx], **kwargs + ) + + return X_split + + +def _get_datetime_column_indices(X_split, dayfirst=True): + """Select the datetime parsable columns by their indices \ + and return their datetime format. + + Parameters + ---------- + X_split : list of 1d array of length n_features + + Returns + ------- + datetime_indices : list of int + List of parsable columns, identified by their indices. + + index_to_format: mapping of int to str + Dictionary mapping parsable column indices to their datetime format. 
+ """ + indices = [] + index_to_format = {} + + for col_idx, X_col in enumerate(X_split): + X_col = X_col[pd.notnull(X_col)] + + # convert pd.TimeStamp to np.datetime64 + if all(isinstance(val, pd.Timestamp) for val in X_col): + X_col = X_col.astype("datetime64") + + if _is_column_datetime_parsable(X_col): + indices.append(col_idx) + + if np.issubdtype(X_col.dtype, np.datetime64): + # We don't need to specify a parsing format + # for columns that are already of type datetime64. + datetime_format = None + else: + datetime_format = _guess_datetime_format(X_col) + + index_to_format[col_idx] = datetime_format + + return indices, index_to_format + + +def _is_column_datetime_parsable(X_col): + """Check whether a 1d array can be converted into a \ + :class:`pandas.DatetimeIndex`. + + Parameters + ---------- + X_col : array-like of shape ``(n_samples,)`` + + Returns + ------- + is_dt_parsable : bool + """ + # Remove columns of int, float or bool casted as object. + # Pandas < 2.0.0 raise a deprecation warning instead of an error. + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=DeprecationWarning) + try: + if np.array_equal(X_col, X_col.astype(np.float64)): + return False + except ValueError: + pass + + np_dtypes_candidates = [np.object_, np.str_, np.datetime64] + is_type_datetime_compatible = any( + np.issubdtype(X_col.dtype, np_dtype) for np_dtype in np_dtypes_candidates + ) + if is_type_datetime_compatible: + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + # format=mixed parses entries individually, + # avoiding ValueError when both date and datetime formats + # are present. + # At this stage, the format itself doesn't matter. + _ = pd.to_datetime(X_col, format=MIXED_FORMAT) + return True + except (pd.errors.ParserError, ValueError): + pass + return False + + +def _guess_datetime_format(X_col): + """Infer the format of a 1d array. 
+ + This function uses Pandas ``guess_datetime_format`` routine for both the + dayfirst and monthfirst cases, and selects whichever of the two yields + a single, unified format over the whole array. + + When both dayfirst and monthfirst formats are possible, we select + monthfirst by default. + + You can overwrite this behaviour by setting a format in the caller function. + Setting a format always takes precedence over inferring it using + ``_guess_datetime_format``. + + Parameters + ---------- + X_col : ndarray of shape ``(n_samples,)`` + X_col must only contain string objects without any missing value. + + Returns + ------- + datetime_format : str or None + """ + X_col = X_col.astype(np.object_) + vfunc = np.vectorize(guess_datetime_format) + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=UserWarning) + # pd.unique handles None + month_first_formats = pd.unique(vfunc(X_col, dayfirst=False)) + day_first_formats = pd.unique(vfunc(X_col, dayfirst=True)) + + if len(month_first_formats) == 1 and month_first_formats[0] is not None: + return str(month_first_formats[0]) + + elif len(day_first_formats) == 1 and day_first_formats[0] is not None: + return str(day_first_formats[0]) + + # special heuristic: when both date and datetime formats are + # present, allow the format to be mixed. + elif ( + len(month_first_formats) == 2 + and len(day_first_formats) == 2 + and len(month_first_formats[0]) != len(month_first_formats[1]) + ): + return MIXED_FORMAT + + else: + return None + + +def _is_column_date_only(X_col): + """Check whether a :obj:`pandas.DatetimeIndex` only contains dates. 
+ + Parameters + ---------- + X_col : pandas.DatetimeIndex of shape ``(n_samples,)`` + + Returns + ------- + is_date : bool + """ + return np.array_equal(X_col, X_col.normalize()) + + +def _datetime_to_total_seconds(X_col): + """ + Parameters + ---------- + X_col : DatetimeIndex of shape (n_samples,) + + Returns + ------- + X_col : ndarray of shape (n_samples) + """ + if X_col.tz is not None: + X_col = X_col.tz_convert("utc") + + # Total seconds since epoch + mask_notnull = X_col == X_col + + return np.where( + mask_notnull, + X_col.astype("int64") / 1e9, + np.nan, + ) + + +class DatetimeEncoder(TransformerMixin, BaseEstimator): + """Transforms each datetime column into several numeric columns \ + for temporal features (e.g year, month, day...). + If the dates are timezone aware, all the features extracted will correspond to the provided timezone. Parameters ---------- - extract_until : {"year", "month", "day", "hour", "minute", "second", + resolution : {"year", "month", "day", "hour", "minute", "second", "microsecond", "nanosecond", None}, default="hour" - Extract up to this granularity. - If all non-constant features have not been extracted, - add the "total_time" feature, which contains the time to epoch (in seconds). - For instance, if you specify "day", only "year", "month", "day" and - "total_time" features will be created. - If None, only the "total_time" feature will be created. + Extract up to this resolution. + E.g., ``resolution="day"`` generates the features "year", "month", + "day" only. + If ``None``, no such feature will be created (but day of the week and \ + total seconds may still be extracted, see below). + add_day_of_the_week : bool, default=False - Add day of the week feature (if day is extracted). - This is a numerical feature from 0 (Monday) to 6 (Sunday). + Add day of the week feature as a numerical feature + from 0 (Monday) to 6 (Sunday). + + add_total_seconds : bool, default=True + Add the total number of seconds since Epoch. 
+ + errors : {'coerce', 'raise'}, default="coerce" + During transform: + - If ``"coerce"``, then invalid parsing will be set as ``pd.NaT``. + - If ``"raise"``, then invalid parsing will raise an exception. Attributes ---------- - n_features_in_ : int - Number of features in the data seen during fit. + column_indices_ : list of int + Indices of the datetime-parsable columns. + + index_to_format_ : dict[int, str] + Mapping from column indices to their datetime formats. + + index_to_features_ : dict[int, list[str]] + Dictionary mapping the column indices to the list of datetime + features extracted for each column. + n_features_out_ : int Number of features of the transformed data. - features_per_column_ : mapping of int to list of str - Dictionary mapping the index of the original columns - to the list of features extracted for each column. - col_names_ : None or list of str - List of the names of the features of the input data, - if input data was a pandas DataFrame, otherwise None. See Also -------- GapEncoder : Encode dirty categories (strings) by constructing latent topics with continuous encoding. + MinHashEncoder : Encode string columns as a numeric array with the minhash method. + SimilarityEncoder : Encode string columns as a numeric array with n-gram string similarity. 
Examples -------- - >>> enc = DatetimeEncoder() - - Let's encode the following dates: - + >>> enc = DatetimeEncoder(add_total_seconds=False) >>> X = [['2022-10-15'], ['2021-12-25'], ['2020-05-18'], ['2019-10-15 12:00:00']] - >>> enc.fit(X) - DatetimeEncoder() + DatetimeEncoder(add_total_seconds=False) The encoder will output a transformed array - with four columns ("year", "month", "day" and "hour"): + with four columns ("year", "month", "day", "hour"): >>> enc.transform(X) array([[2022., 10., 15., 0.], @@ -98,80 +514,31 @@ class DatetimeEncoder(BaseEstimator, TransformerMixin): [2019., 10., 15., 12.]]) """ - n_features_in_: int - n_features_out_: int - features_per_column_: dict[int, list[str]] - col_names_: list[str] | None - def __init__( self, *, - extract_until: AcceptedTimeValues | None = "hour", - add_day_of_the_week: bool = False, + resolution="hour", + add_day_of_the_week=False, + add_total_seconds=True, + errors="coerce", ): - self.extract_until = extract_until + self.resolution = resolution self.add_day_of_the_week = add_day_of_the_week + self.add_total_seconds = add_total_seconds + self.errors = errors - def _more_tags(self): - """ - Used internally by sklearn to ease the estimator checks. - """ - return { - "X_types": ["2darray", "categorical"], - "allow_nan": True, - "_xfail_checks": {"check_dtype_object": "Specific datetime error."}, - } - - def _validate_keywords(self): - if self.extract_until not in TIME_LEVELS and self.extract_until is not None: - raise ValueError( - f'"extract_until" should be one of {TIME_LEVELS}, ' - f"got {self.extract_until}. 
" - ) - - @staticmethod - def _extract_from_date(date_series: pd.Series, feature: str): - if feature == "year": - return pd.DatetimeIndex(date_series).year.to_numpy() - elif feature == "month": - return pd.DatetimeIndex(date_series).month.to_numpy() - elif feature == "day": - return pd.DatetimeIndex(date_series).day.to_numpy() - elif feature == "hour": - return pd.DatetimeIndex(date_series).hour.to_numpy() - elif feature == "minute": - return pd.DatetimeIndex(date_series).minute.to_numpy() - elif feature == "second": - return pd.DatetimeIndex(date_series).second.to_numpy() - elif feature == "microsecond": - return pd.DatetimeIndex(date_series).microsecond.to_numpy() - elif feature == "nanosecond": - return pd.DatetimeIndex(date_series).nanosecond.to_numpy() - elif feature == "dayofweek": - return pd.DatetimeIndex(date_series).dayofweek.to_numpy() - elif feature == "total_time": - tz = pd.DatetimeIndex(date_series).tz - # Compute the time in seconds from the epoch time UTC - if tz is None: - return ( - pd.to_datetime(date_series) - pd.Timestamp("1970-01-01") - ) // pd.Timedelta("1s") - else: - return ( - pd.DatetimeIndex(date_series).tz_convert("utc") - - pd.Timestamp("1970-01-01", tz="utc") - ) // pd.Timedelta("1s") - - def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder": - """Fit the instance to ``X``. + def fit(self, X, y=None): + """Fit the instance to X. - In practice, just check keywords and input validity, - and stores which extracted features are not constant. + Select datetime-parsable columns and generate the list of + datetime feature to extract. Parameters ---------- - X : array-like, shape (``n_samples``, ``n_features``) - Data where each column is a datetime feature. + X : array-like, shape ``(n_samples, n_features)`` + Input data. Columns that can't be converted into + ``pandas.DatetimeIndex`` and numerical values will + be dropped. y : None Unused, only here for compatibility. 
@@ -180,95 +547,144 @@ def fit(self, X: ArrayLike, y=None) -> "DatetimeEncoder": DatetimeEncoder Fitted DatetimeEncoder instance (self). """ - self._validate_keywords() - if isinstance(X, pd.DataFrame): - self.col_names_ = X.columns.to_list() - else: - self.col_names_ = None - X = check_input(X) - # Features to extract for each column, after removing constant features - self.features_per_column_ = {} - for i in range(X.shape[1]): - self.features_per_column_[i] = [] - # Check which columns are constant - for i in range(X.shape[1]): - if self.extract_until is None: - if np.nanstd(self._extract_from_date(X[:, i], "total_time")) > 0: - self.features_per_column_[i].append("total_time") - else: - for feature in TIME_LEVELS: - if np.nanstd(self._extract_from_date(X[:, i], feature)) > 0: - if TIME_LEVELS.index(feature) <= TIME_LEVELS.index( - self.extract_until - ): - self.features_per_column_[i].append(feature) - # we add a total_time feature, which contains the full - # time to epoch, if there is at least one - # feature that has not been extracted and is not constant - if TIME_LEVELS.index(feature) > TIME_LEVELS.index( - self.extract_until - ): - self.features_per_column_[i].append("total_time") - break - # Add day of the week feature if needed - if ( - self.add_day_of_the_week - and np.nanstd(self._extract_from_date(X[:, i], "dayofweek")) > 0 - ): - self.features_per_column_[i].append("dayofweek") - - self.n_features_in_ = X.shape[1] - self.n_features_out_ = len( - np.concatenate(list(self.features_per_column_.values())) + if self.resolution not in TIME_LEVELS and self.resolution is not None: + raise ValueError( + f"'resolution' options are {TIME_LEVELS}, got {self.resolution!r}." + ) + + errors_options = ["coerce", "raise"] + if self.errors not in errors_options: + raise ValueError( + f"'errors' options are {errors_options!r}, got {self.errors!r}." 
+ ) + + self._check_feature_names(X, reset=True) + self._check_n_features(X, reset=True) + X = check_array( + X, ensure_2d=True, force_all_finite=False, dtype=None, copy=False ) + self._select_datetime_cols(X) + return self - def transform(self, X: ArrayLike, y=None) -> NDArray: + def _select_datetime_cols(self, X): + """Select datetime-parsable columns and generate the list of + datetime feature to extract. + + If the input only contains dates (and no datetimes), only the features + ["year", "month", "day"] will be filtered with resolution. + + Parameters + ---------- + X : array-like of shape ``(n_samples, n_features)`` + """ + if self.resolution is None: + levels = [] + else: + idx_level = TIME_LEVELS.index(self.resolution) + levels = TIME_LEVELS[: idx_level + 1] + + X_split = np.hsplit(X, X.shape[1]) + self.column_indices_, self.index_to_format_ = _get_datetime_column_indices( + X_split + ) + + self.index_to_features_ = defaultdict(list) + self.n_features_out_ = 0 + + for col_idx in self.column_indices_: + X_col = pd.DatetimeIndex(X[:, col_idx]) + if _is_column_date_only(X_col): + # Keep only date attributes + levels = [ + level for level in levels if level in ["year", "month", "day"] + ] + + self.index_to_features_[col_idx] += levels + self.n_features_out_ += len(levels) + + if self.add_total_seconds: + self.index_to_features_[col_idx].append("total_seconds") + self.n_features_out_ += 1 + + if self.add_day_of_the_week: + self.index_to_features_[col_idx].append("day_of_week") + self.n_features_out_ += 1 + + def transform(self, X, y=None): """Transform ``X`` by replacing each datetime column with \ corresponding numerical features. Parameters ---------- - X : array-like, shape (``n_samples``, ``n_features``) + X : array-like of shape ``(n_samples, n_features)`` The data to transform, where each column is a datetime feature. y : None Unused, only here for compatibility. 
Returns ------- - ndarray, shape (``n_samples``, ``n_features_out_``) + X_out : ndarray of shape ``(n_samples, n_features_out_)`` Transformed input. """ - check_is_fitted( - self, - attributes=["n_features_in_", "n_features_out_", "features_per_column_"], + check_is_fitted(self) + self._check_n_features(X, reset=False) + self._check_feature_names(X, reset=False) + + X = check_array( + X, + ensure_2d=True, + force_all_finite=False, + dtype=None, + copy=False, ) - X = check_input(X) - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"The number of features in the input data ({X.shape[1]}) " - "does not match the number of features " - f"seen during fit ({self.n_features_in_}). " - ) - # Create a new array with the extracted features, - # choosing only features that weren't constant during fit - X_ = np.empty((X.shape[0], self.n_features_out_), dtype=np.float64) - idx = 0 - for i in range(X.shape[1]): - for j, feature in enumerate(self.features_per_column_[i]): - X_[:, idx + j] = self._extract_from_date(X[:, i], feature) - idx += len(self.features_per_column_[i]) - return X_ - - def get_feature_names_out(self, input_features=None) -> list[str]: - """Return clean feature names. + X_split = _to_datetime_2d_array( + X, + indices=self.column_indices_, + index_to_format=self.index_to_format_, + errors=self.errors, + ) + + return self._extract_features(X_split) + + def _extract_features(self, X_split): + """Extract datetime features from the selected columns. + + Parameters + ---------- + X_split : list of 1d array of length n_features + + Returns + ------- + X_out : ndarray of shape ``(n_samples, n_features_out_)`` + """ + # X_out must be of dtype float64 otherwise np.nan will overflow + # to large negative numbers. 
+ X_out = np.empty((X_split[0].shape[0], self.n_features_out_), dtype=np.float64) + offset_idx = 0 + for col_idx in self.column_indices_: + X_col = X_split[col_idx] + features = self.index_to_features_[col_idx] + for feat_idx, feature in enumerate(features): + if feature == "total_seconds": + X_feature = _datetime_to_total_seconds(X_col) + else: + X_feature = getattr(X_col, feature).to_numpy() + X_out[:, offset_idx + feat_idx] = X_feature + + offset_idx += len(features) + + return X_out + + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. Feature names are formatted like: "_" if the original data has column names, otherwise with format "_" where `` is one of {"year", "month", "day", "hour", "minute", "second", - "microsecond", "nanosecond", "dayofweek"}. + "microsecond", "nanosecond", "day_of_week"}. Parameters ---------- @@ -277,12 +693,23 @@ def get_feature_names_out(self, input_features=None) -> list[str]: Returns ------- - list of str + feature_names : list of str List of feature names. """ + check_is_fitted(self, "index_to_features_") feature_names = [] - for i in self.features_per_column_.keys(): - prefix = str(i) if self.col_names_ is None else self.col_names_[i] - for feature in self.features_per_column_[i]: - feature_names.append(f"{prefix}_{feature}") + columns = getattr(self, "feature_names_in_", list(range(self.n_features_in_))) + for col_idx, features in self.index_to_features_.items(): + column = columns[col_idx] + feature_names += [f"{column}_{feat}" for feat in features] return feature_names + + def _more_tags(self): + """ + Used internally by sklearn to ease the estimator checks. 
+ """ + return { + "X_types": ["2darray", "categorical"], + "allow_nan": True, + "_xfail_checks": {"check_dtype_object": "Specific datetime error."}, + } diff --git a/skrub/tests/test_datetime_encoder.py b/skrub/tests/test_datetime_encoder.py index fa7e93a93..3881eac67 100644 --- a/skrub/tests/test_datetime_encoder.py +++ b/skrub/tests/test_datetime_encoder.py @@ -1,494 +1,464 @@ +from copy import deepcopy +from itertools import product + import numpy as np import pandas as pd import pytest -from sklearn.exceptions import NotFittedError +from numpy.testing import assert_allclose, assert_array_equal +from pandas.testing import assert_frame_equal + +from skrub._datetime_encoder import ( + TIME_LEVELS, + DatetimeEncoder, + _is_pandas_format_mixed_available, + to_datetime, +) -from skrub._datetime_encoder import DatetimeEncoder +NANOSECONDS_FORMAT = ( + "%Y-%m-%d %H:%M:%S.%f" if _is_pandas_format_mixed_available() else None +) +MSG_MIN_PANDAS_SKIP = "Pandas format=mixed is not available" -def get_date_array() -> np.array: - return np.array( +def get_date(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"]), - pd.to_datetime(["2021-02-03", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2022-01-01", "2020-12-25", "2022-01-03"]), - pd.to_datetime(["2023-02-03", "2020-02-04", "2023-02-05"]), - ] + ["2020-01-01", "2020-01-02", "2020-01-03"], + ["2021-02-03", "2020-02-04", "2021-02-05"], + ["2022-01-01", "2020-12-25", "2022-01-03"], + ["2023-02-03", "2020-02-04", "2023-02-05"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_constant_date_array() -> np.array: - return np.array( +def get_datetime(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - pd.to_datetime(["2020-01-01", "2020-02-04", "2021-02-05"]), - ] + ["2020-01-01 10:12:01", 
"2020-01-02 10:23:00", "2020-01-03 10:00:00"], + ["2021-02-03 12:45:23", "2020-02-04 22:12:00", "2021-02-05 12:00:00"], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", "2022-01-03 11:00:00"], + ["2023-02-03 11:12:12", "2020-02-04 08:32:00", "2023-02-05 23:00:00"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_array() -> np.array: - return np.array( +def get_nanoseconds(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime( - [ - "2020-01-01 10:12:01", - "2020-01-02 10:23:00", - "2020-01-03 10:00:00", - ], - ), - pd.to_datetime( - [ - "2021-02-03 12:45:23", - "2020-02-04 22:12:00", - "2021-02-05 12:00:00", - ], - ), - pd.to_datetime( - [ - "2022-01-01 23:23:43", - "2020-12-25 11:12:00", - "2022-01-03 11:00:00", - ], - ), - pd.to_datetime( - [ - "2023-02-03 11:12:12", - "2020-02-04 08:32:00", - "2023-02-05 23:00:00", - ], - ), - ] + ["2020-08-24 15:55:30.123456789", "2020-08-24 15:55:30.123456789"], + ["2020-08-20 14:56:31.987654321", "2021-07-20 14:56:31.987654321"], + ["2020-08-20 14:57:32.123987654", "2023-09-20 14:57:32.123987654"], + ["2020-08-20 14:58:33.987123456", "2023-09-20 14:58:33.987123456"], + ], ) + if as_array: + return df.to_numpy() + return df -def get_datetime_array_nanoseconds() -> np.array: - return np.array( +def get_nan_datetime(as_array=False): + df = pd.DataFrame( [ - pd.to_datetime( - [ - # constant year and month - # for the first feature - "2020-08-24 15:55:30.123456789", - "2020-08-24 15:55:30.123456789", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:56:31.987654321", - "2021-07-20 14:56:31.987654321", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:57:32.123987654", - "2023-09-20 14:57:32.123987654", - ], - ), - pd.to_datetime( - [ - "2020-08-20 14:58:33.987123456", - "2023-09-20 14:58:33.987123456", - ], - ), - ] + ["2020-01-01 10:12:01", None, "2020-01-03 10:00:00"], + [np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"], + ["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NA], + ], ) + if 
as_array: + return df.to_numpy() + return df -def get_dirty_datetime_array() -> np.array: - return np.array( +def get_tz_datetime(as_array=False): + # The equivalent dtype is "datetime64[ns, Asia/Kolkata]" + df = pd.DataFrame( [ - np.array( - pd.to_datetime( - [ - "2020-01-01 10:12:01", - "2020-01-02 10:23:00", - "2020-01-03 10:00:00", - ] - ) - ), - np.array( - pd.to_datetime([np.nan, "2020-02-04 22:12:00", "2021-02-05 12:00:00"]) - ), - np.array( - pd.to_datetime(["2022-01-01 23:23:43", "2020-12-25 11:12:00", pd.NaT]) - ), - np.array( - pd.to_datetime( - [ - "2023-02-03 11:12:12", - "2020-02-04 08:32:00", - "2023-02-05 23:00:00", - ] - ) - ), - ] + ["2020-01-01 10:12:01+05:30"], + ["2021-02-03 12:45:23+05:30"], + ["2022-01-01 23:23:43+05:30"], + ["2023-02-03 11:12:12+05:30"], + ], + ) + if as_array: + return df.to_numpy() + return df + + +def get_mixed_type_dataframe(): + return pd.DataFrame( + dict( + a=["2020-01-01", "2020-02-04", "2021-02-05"], + b=["yo", "ya", "yu"], + c=[1, 2, 3], + d=["1", "2", "3"], + e=["01/01/2023", "03/01/2023", "14/01/2023"], + f=[True, False, True], + ) ) -def get_datetime_with_TZ_array() -> pd.DataFrame: - res = pd.DataFrame( - [ - pd.to_datetime(["2020-01-01 10:12:01"]), - pd.to_datetime(["2021-02-03 12:45:23"]), - pd.to_datetime(["2022-01-01 23:23:43"]), - pd.to_datetime(["2023-02-03 11:12:12"]), - ] +def get_mixed_datetime_format(as_array=False): + df = pd.DataFrame( + dict( + a=[ + "2022-10-15", + "2021-12-25", + "2020-05-18", + "2019-10-15 12:00:00", + ] + ) ) - for col in res.columns: - res[col] = pd.DatetimeIndex(res[col]).tz_localize("Asia/Kolkata") - return res - - -def test_fit() -> None: - # Dates - X = get_date_array() - enc = DatetimeEncoder() - expected_features_per_column_ = { - 0: ["year", "month", "day"], - 1: ["month", "day"], - 2: ["year", "month", "day"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - X = get_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - 
expected_features_per_column_ = { - 0: ["year", "month", "day", "dayofweek"], - 1: ["month", "day", "dayofweek"], - 2: ["year", "month", "day", "dayofweek"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # Datetimes - X = get_datetime_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "total_time", "dayofweek"], - 1: ["month", "day", "hour", "total_time", "dayofweek"], - 2: ["year", "month", "day", "hour", "dayofweek"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # we check that the features are extracted until `extract_until` - # that constant feature are not extracted - # and that the total_time feature is extracted if needed - X = get_datetime_array() - enc = DatetimeEncoder(extract_until="minute") - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "minute", "total_time"], - 1: ["month", "day", "hour", "minute"], - 2: ["year", "month", "day", "hour"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # extract_until="nanosecond" - X = get_datetime_array_nanoseconds() - enc = DatetimeEncoder(extract_until="nanosecond") - expected_features_per_column_ = { - # constant year and month - # for first feature - 0: [ - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ], - 1: [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - # Dirty Datetimes - X = get_dirty_datetime_array() - enc = DatetimeEncoder() - expected_features_per_column_ = { - 0: ["year", "month", "day", "hour", "total_time"], - 1: ["month", "day", "hour", "total_time"], - 2: ["year", "month", "day", "hour"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ + if as_array: + return 
df.to_numpy() + return df - # Datetimes with TZ - X = get_datetime_with_TZ_array() - enc = DatetimeEncoder() - expected_features_per_column_ = {0: ["year", "month", "day", "hour", "total_time"]} - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - # Feature names - # Without column names - X = get_datetime_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_feature_names = [ - "0_year", - "0_month", - "0_day", - "0_hour", - "0_total_time", - "0_dayofweek", - "1_month", - "1_day", - "1_hour", - "1_total_time", - "1_dayofweek", - "2_year", - "2_month", - "2_day", - "2_hour", - "2_dayofweek", - ] +@pytest.mark.parametrize("as_array", [True, False]) +@pytest.mark.parametrize( + "get_data_func, features, format", + [ + (get_date, TIME_LEVELS[: TIME_LEVELS.index("day") + 1], "%Y-%m-%d"), + (get_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S"), + (get_tz_datetime, TIME_LEVELS, "%Y-%m-%d %H:%M:%S%z"), + (get_nanoseconds, TIME_LEVELS, NANOSECONDS_FORMAT), + ], +) +@pytest.mark.parametrize( + "add_total_seconds, add_day_of_the_week", + list(product([True, False], [True, False])), +) +@pytest.mark.parametrize("resolution", TIME_LEVELS) +def test_fit( + as_array, + get_data_func, + features, + format, + add_total_seconds, + add_day_of_the_week, + resolution, +): + X = get_data_func(as_array=as_array) + enc = DatetimeEncoder( + add_day_of_the_week=add_day_of_the_week, + add_total_seconds=add_total_seconds, + resolution=resolution, + ) enc.fit(X) - assert enc.get_feature_names_out() == expected_feature_names - # With column names - X = get_datetime_array() - X = pd.DataFrame(X) - X.columns = ["col1", "col2", "col3"] - enc = DatetimeEncoder(add_day_of_the_week=True) + total_seconds = ["total_seconds"] if add_total_seconds else [] + day_of_week = ["day_of_week"] if add_day_of_the_week else [] + + if resolution in features: + features_ = features[: features.index(resolution) + 1] + else: + features_ = deepcopy(features) + + features_ += 
total_seconds + day_of_week + columns = range(X.shape[1]) + + expected_index_to_features = {col: features_ for col in columns} + expected_index_to_format = {col: format for col in columns} + expected_n_features_out = len(features_) * X.shape[1] expected_feature_names = [ - "col1_year", - "col1_month", - "col1_day", - "col1_hour", - "col1_total_time", - "col1_dayofweek", - "col2_month", - "col2_day", - "col2_hour", - "col2_total_time", - "col2_dayofweek", - "col3_year", - "col3_month", - "col3_day", - "col3_hour", - "col3_dayofweek", + f"{col}_{feature}" for col in columns for feature in features_ ] - enc.fit(X) + + assert enc.index_to_features_ == expected_index_to_features + assert enc.index_to_format_ == expected_index_to_format + assert enc.n_features_out_ == expected_n_features_out assert enc.get_feature_names_out() == expected_feature_names -def test_transform() -> None: - # Dates - X = get_date_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( - [ - [2020, 1, 1, 2, 1, 2, 3, 2020, 1, 3, 4], - [2021, 2, 3, 2, 2, 4, 1, 2021, 2, 5, 4], - [2022, 1, 1, 5, 12, 25, 4, 2022, 1, 3, 0], - [2023, 2, 3, 4, 2, 4, 1, 2023, 2, 5, 6], - ] +def test_format_nan(): + X = get_nan_datetime() + enc = DatetimeEncoder().fit(X) + expected_index_to_format = { + 0: "%Y-%m-%d %H:%M:%S", + 1: "%Y-%m-%d %H:%M:%S", + 2: "%Y-%m-%d %H:%M:%S", + } + assert enc.index_to_format_ == expected_index_to_format + + +def test_format_nz(): + X = get_tz_datetime() + enc = DatetimeEncoder().fit(X) + assert enc.index_to_format_ == {0: "%Y-%m-%d %H:%M:%S%z"} + + +def test_resolution_none(): + X = get_datetime() + enc = DatetimeEncoder( + resolution=None, + add_total_seconds=False, ) enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) - enc = DatetimeEncoder(add_day_of_the_week=False) - expected_result = np.array( - [ - [2020, 1, 1, 1, 2, 2020, 1, 3], - [2021, 2, 3, 2, 4, 2021, 2, 5], - [2022, 1, 1, 12, 25, 2022, 1, 3], - [2023, 2, 3, 2, 
4, 2023, 2, 5], - ] + assert enc.index_to_features_ == {0: [], 1: [], 2: []} + assert enc.n_features_out_ == 0 + assert enc.get_feature_names_out() == [] + + +def test_transform_date(): + X = get_date() + enc = DatetimeEncoder( + add_total_seconds=False, ) - enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) + X_trans = enc.fit_transform(X) - enc = DatetimeEncoder(add_day_of_the_week=True) expected_result = np.array( [ - [2020, 1, 1, 2, 1, 2, 3, 2020, 1, 3, 4], - [2021, 2, 3, 2, 2, 4, 1, 2021, 2, 5, 4], - [2022, 1, 1, 5, 12, 25, 4, 2022, 1, 3, 0], - [2023, 2, 3, 4, 2, 4, 1, 2023, 2, 5, 6], + [2020, 1, 1, 2020, 1, 2, 2020, 1, 3], + [2021, 2, 3, 2020, 2, 4, 2021, 2, 5], + [2022, 1, 1, 2020, 12, 25, 2022, 1, 3], + [2023, 2, 3, 2020, 2, 4, 2023, 2, 5], ] ) - enc.fit(X) - assert np.allclose(enc.transform(X), expected_result, equal_nan=True) + X_trans = enc.transform(X) + assert_array_equal(X_trans, expected_result) - # Datetimes - X = get_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - # Check that the "total_time" feature is working - expected_result = np.array( + +def test_transform_datetime(): + X = get_datetime() + enc = DatetimeEncoder( + resolution="second", + add_total_seconds=False, + ) + X_trans = enc.fit_transform(X) + expected_X_trans = np.array( [ - [2020, 1, 1, 10, 0, 2], - [2021, 2, 3, 12, 0, 2], - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], + [2020, 1, 1, 10, 12, 1, 2020, 1, 2, 10, 23, 0, 2020, 1, 3, 10, 0, 0], + [2021, 2, 3, 12, 45, 23, 2020, 2, 4, 22, 12, 0, 2021, 2, 5, 12, 0, 0], + [2022, 1, 1, 23, 23, 43, 2020, 12, 25, 11, 12, 0, 2022, 1, 3, 11, 0, 0], + [2023, 2, 3, 11, 12, 12, 2020, 2, 4, 8, 32, 0, 2023, 2, 5, 23, 0, 0], ] - ).astype(np.float64) - # Time from epochs in seconds - expected_result[:, 4] = (X.astype("int64") // 1e9).astype(np.float64).reshape(-1) + ) + assert_array_equal(X_trans, expected_X_trans) - enc.fit(X) - X_trans = enc.transform(X) - assert 
np.allclose(X_trans, expected_result, equal_nan=True) - - # Check if we find back the date from the time to epoch - assert ( - ( - pd.to_datetime(X_trans[:, 4], unit="s") - pd.to_datetime(X.reshape(-1)) - ).total_seconds() - == 0 - ).all() - - # Dirty datetimes - X = get_dirty_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( + +def test_transform_tz(): + X = get_tz_datetime() + enc = DatetimeEncoder( + add_total_seconds=True, + ) + X_trans = enc.fit_transform(X) + expected_X_trans = np.array( [ - [2020, 1, 1, 10, 0, 2], - [np.nan] * 6, - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], + [2020, 1, 1, 10, 1.57785372e09], + [2021, 2, 3, 12, 1.61233652e09], + [2022, 1, 1, 23, 1.64105962e09], + [2023, 2, 3, 11, 1.67540293e09], ] ) - # Time from epochs in seconds - expected_result[:, 4] = (X.astype("int64") // 1e9).astype(np.float64).reshape(-1) - expected_result[1, 4] = np.nan - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) - - # Datetimes with TZ - # If the dates are timezone-aware, all the feature extractions should - # be done in the provided timezone. - # But the full time to epoch should correspond to the true number of - # seconds between epoch time and the time of the date. 
- X = get_datetime_with_TZ_array() - enc = DatetimeEncoder(add_day_of_the_week=True) - expected_result = np.array( + assert_allclose(X_trans, expected_X_trans) + + +def test_transform_nan(): + X = get_nan_datetime() + enc = DatetimeEncoder( + add_total_seconds=True, + ) + X_trans = enc.fit_transform(X) + expected_X_trans = np.array( [ - [2020, 1, 1, 10, 0, 2], - [2021, 2, 3, 12, 0, 2], - [2022, 1, 1, 23, 0, 5], - [2023, 2, 3, 11, 0, 4], + [ + 2020, + 1, + 1, + 10, + 1.57787352e09, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2020, + 1, + 3, + 10, + 1.57804560e09, + ], + [ + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + 2020, + 2, + 4, + 22, + 1.58085432e09, + 2021, + 2, + 5, + 12, + 1.61252640e09, + ], + [ + 2022, + 1, + 1, + 23, + 1.64107942e09, + 2020, + 12, + 25, + 11, + 1.60889472e09, + np.nan, + np.nan, + np.nan, + np.nan, + np.nan, + ], ] - ).astype(np.float64) - # Time from epochs in seconds - expected_result[:, 4] = ( - (X.iloc[:, 0].view(dtype="int64") // 1e9) - .astype(np.float64) - .to_numpy() - .reshape(-1) ) - enc.fit(X) - X_trans = enc.transform(X) - assert np.allclose(X_trans, expected_result, equal_nan=True) + assert_allclose(X_trans, expected_X_trans) + + +def test_mixed_type_dataframe(): + X = get_mixed_type_dataframe() + enc = DatetimeEncoder().fit(X) + assert enc.index_to_format_ == {0: "%Y-%m-%d", 4: "%d/%m/%Y"} + + X_dt = to_datetime(X) + expected_dtypes = [ + np.dtype(" None: - time_levels = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - "nanosecond", - ] - X = get_datetime_array() - enc = DatetimeEncoder(extract_until=extract_until) - expected_features_per_column_ = { - # all features after seconds are constant - # we want total_time if we have not extracted all non-constant features - 0: time_levels[ - : min(time_levels.index(extract_until), time_levels.index("second")) + 1 - ] - + ( - ["total_time"] - if extract_until in ["year", "month", "day", "hour", "minute"] - else [] - ), - # constant 
after minute + year constant - 1: time_levels[ - 1 : min(time_levels.index(extract_until), time_levels.index("minute")) + 1 - ] - + (["total_time"] if extract_until in ["year", "month", "day", "hour"] else []), - # constant after hour - 2: time_levels[ - : min(time_levels.index(extract_until), time_levels.index("hour")) + 1 - ] - + (["total_time"] if extract_until in ["year", "month", "day"] else []), - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ - - -def test_extract_until_none() -> None: - X = get_dirty_datetime_array() - enc = DatetimeEncoder(extract_until=None) - expected_features_per_column_ = { - # all features after seconds are constant - # we want total_time if we have not extracted all non-constant features - 0: ["total_time"], - 1: ["total_time"], - 2: ["total_time"], - } - enc.fit(X) - assert enc.features_per_column_ == expected_features_per_column_ +def test_to_datetime_incorrect_skip(X): + assert_array_equal(to_datetime(X), X) - # check get_names_out - expected_feature_names = [ - "0_total_time", - "1_total_time", - "2_total_time", - ] - assert enc.get_feature_names_out() == expected_feature_names - # check with constant datetimes - X = get_constant_date_array() - enc = DatetimeEncoder(extract_until=None) - assert enc.fit_transform(X).shape[1] == 0 +def test_to_datetime_type_error(): + # 3d tensor + X = [[["2021-01-01"]]] + with pytest.raises(TypeError): + to_datetime(X) -def test_check_fitted_datetime_encoder() -> None: - """Test that calling transform before fit raises an error""" - X = get_datetime_array()[:, 0].reshape(-1, 1) - enc = DatetimeEncoder(add_day_of_the_week=True) - with pytest.raises(NotFittedError): - enc.transform(X) +def test_to_datetime_invalid_params(): + with pytest.raises(ValueError, match=r"(?=.*errors options)"): + to_datetime(2020, errors="skip") - # Check that it works after fit - enc.fit(X) - enc.transform(X) + with pytest.raises(ValueError, match=r"(?=.*not a parameter of skrub)"): + 
to_datetime(2020, unit="second") + + +@pytest.mark.skipif( + not _is_pandas_format_mixed_available(), + reason=MSG_MIN_PANDAS_SKIP, +) +def test_to_datetime_format_param(): + X_col = ["2021-01-01", "2021/01/01"] + + # without format (default) + out = to_datetime(X_col) + expected_out = np.array(["2021-01-01", "NaT"], dtype="datetime64[ns]") + assert_array_equal(out, expected_out) + + # with format + out = to_datetime(X_col, format="%Y/%m/%d") + expected_out = np.array(["NaT", "2021-01-01"], dtype="datetime64[ns]") + assert_array_equal(out, expected_out) + + +def test_mixed_datetime_format(): + df = get_mixed_datetime_format() + + df_dt = to_datetime(df) + expected_df_dt = pd.DataFrame( + dict( + a=[ + pd.Timestamp("2022-10-15"), + pd.Timestamp("2021-12-25"), + pd.Timestamp("2020-05-18"), + pd.Timestamp("2019-10-15 12:00:00"), + ] + ) + ) + assert_frame_equal(df_dt, expected_df_dt) + + series_dt = to_datetime(df["a"]) + expected_series_dt = expected_df_dt["a"] + assert_array_equal(series_dt, expected_series_dt) + + +@pytest.mark.skipif(not _is_pandas_format_mixed_available(), reason=MSG_MIN_PANDAS_SKIP) +def test_mix_of_unambiguous(): + X_col = ["2021/10/15", "01/14/2021"] + out = to_datetime(X_col) + expected_out = np.array( + [np.datetime64("2021-10-15"), np.datetime64("NaT")], + dtype="datetime64[ns]", + ) + assert_array_equal(out, expected_out) + + +def test_only_ambiguous(): + X_col = ["2021/10/10", "2020/01/02"] + out = to_datetime(X_col) + # monthfirst by default + expected_out = np.array(["2021-10-10", "2020-01-02"], dtype="datetime64[ns]") + assert_array_equal(out, expected_out) + + +def test_monthfirst_only(): + X_col = ["2021/02/02", "2021/01/15"] + out = to_datetime(X_col) + expected_out = np.array(["2021-02-02", "2021-01-15"], dtype="datetime64[ns]") + assert_array_equal(out, expected_out) diff --git a/skrub/tests/test_table_vectorizer.py b/skrub/tests/test_table_vectorizer.py index ffe41155b..333b4d7f9 100644 --- a/skrub/tests/test_table_vectorizer.py 
+++ b/skrub/tests/test_table_vectorizer.py @@ -8,9 +8,12 @@ from sklearn.utils.validation import check_is_fitted from skrub import GapEncoder, MinHashEncoder, SuperVectorizer, TableVectorizer +from skrub._datetime_encoder import _is_pandas_format_mixed_available from skrub._table_vectorizer import _infer_date_format from skrub.tests.utils import transformers_list_equal +MSG_PANDAS_DEPRECATED_WARNING = "Skip deprecation warning" + def check_same_transformers( expected_transformers: dict, actual_transformers: list @@ -788,7 +791,7 @@ def test_mixed_types() -> None: pd.DataFrame({"col1": [1.0, 2.0, np.nan]}), ), # All datetimes during fit, 1 category during transform - ( + pytest.param( pd.DataFrame( { "col1": [ @@ -816,6 +819,10 @@ def test_mixed_types() -> None: ] } ), + marks=pytest.mark.skipif( + not _is_pandas_format_mixed_available(), + reason=MSG_PANDAS_DEPRECATED_WARNING, + ), ), ], )