Improve DatetimeEncoder (skrub-data#784)
Co-authored-by: Jérôme Dockès <[email protected]>
Vincent-Maladiere and jeromedockes authored Nov 9, 2023
1 parent 77b1ccc commit 2bda119
Showing 9 changed files with 1,128 additions and 721 deletions.
9 changes: 9 additions & 0 deletions CHANGES.rst
@@ -15,6 +15,10 @@ development and backward compatibility is not ensured.
Major changes
-------------

* :func:`to_datetime` is now available; it extends ``pandas.to_datetime``
  to dataframes and 2D arrays.
  :pr:`784` by :user:`Vincent Maladiere <Vincent-Maladiere>`
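
  A minimal sketch of the new function (mirroring its use in the updated
  example later in this commit; the toy dataframe here is hypothetical):

  ```py
  import pandas as pd
  from skrub import to_datetime

  df = pd.DataFrame({"city": ["Paris"], "date.utc": ["2019-06-01 00:00:00+00:00"]})
  # Datetime-like columns are detected and converted; other columns are untouched.
  df = to_datetime(df)
  ```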

* Some parameters of :class:`Joiner` have changed. The goal is to harmonize
parameters across all estimators that perform join(-like) operations, as
discussed in `#751 <https://github.com/skrub-data/skrub/discussions/751>`_.
@@ -57,6 +61,11 @@ Major changes

Minor changes
-------------

* :class:`DatetimeEncoder` no longer removes constant features.
  It also supports an 'errors' argument to raise or coerce errors during
  transform, and an 'add_total_seconds' argument to include the number of
  seconds since Epoch.
  :pr:`784` by :user:`Vincent Maladiere <Vincent-Maladiere>`
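
  A hedged sketch of the new arguments (parameter names are taken from this
  entry; the ``errors`` values are an assumption, following the usual pandas
  ``'raise'``/``'coerce'`` convention):

  ```py
  from skrub import DatetimeEncoder

  encoder = DatetimeEncoder(
      add_total_seconds=True,  # append the number of seconds since Epoch
      errors="coerce",  # assumed value: coerce unparseable dates instead of raising
  )
  ```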

* Scaling of ``matching_score`` in :func:`fuzzy_join` is now between 0 and 1; it used to be between 0.5 and 1. Moreover, the division by 0 error that occurred when all rows had a perfect match has been fixed. :pr:`802` by :user:`Jérôme Dockès <jeromedockes>`.
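
  A hedged sketch of the rescaled score (``fuzzy_join`` and its
  ``return_score`` argument exist in skrub; the toy tables and exact keyword
  usage here are illustrative assumptions):

  ```py
  import pandas as pd
  from skrub import fuzzy_join

  left = pd.DataFrame({"name": ["Paris", "Londn"]})
  right = pd.DataFrame({"name": ["Paris", "London"]})
  # With return_score=True, a matching_score column is added;
  # it now lies in [0, 1] instead of [0.5, 1].
  joined = fuzzy_join(left, right, on="name", return_score=True)
  ```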

10 changes: 9 additions & 1 deletion doc/api.rst
@@ -79,7 +79,7 @@ This page lists all available functions and classes of `skrub`.

.. raw:: html

<h2>Other encoders</h2>
<h2>Dealing with dates</h2>

.. autosummary::
:toctree: generated/
@@ -89,6 +89,14 @@ This page lists all available functions and classes of `skrub`.

DatetimeEncoder

.. autosummary::
:toctree: generated/
:template: function.rst
:nosignatures:
:caption: Converting datetime columns in a table

to_datetime

.. raw:: html

<h2>Deduplication: merging variants of the same entry</h2>
1 change: 1 addition & 0 deletions doc/conf.py
@@ -504,6 +504,7 @@ def notebook_modification_function(notebook_content, notebook_filename):
"SimilarityEncoder": "skrub.SimilarityEncoder",
"DatetimeEncoder": "skrub.DatetimeEncoder",
"deduplicate": "skrub.deduplicate",
"to_datetime": "skrub.to_datetime",
"TableVectorizer": "skrub.TableVectorizer",
"DatasetInfoOnly": "skrub.datasets._fetching.DatasetInfoOnly",
"DatasetAll": "skrub.datasets._fetching.DatasetAll",
181 changes: 82 additions & 99 deletions examples/03_datetime_encoder.py
@@ -34,6 +34,9 @@
.. |HGBR| replace::
:class:`~sklearn.ensemble.HistGradientBoostingRegressor`
.. |to_datetime| replace::
:func:`~skrub.to_datetime`
"""


@@ -46,19 +49,26 @@
# on the location, date and time of measurement.

from pprint import pprint

import pandas as pd

data = pd.read_csv(
"https://raw.githubusercontent.com/pandas-dev/pandas"
"/main/doc/data/air_quality_no2_long.csv"
)
).sort_values("date.utc")
# Extract our input data (X) and the target column (y)
y = data["value"]
X = data[["city", "date.utc"]]

X

###############################################################################
# We convert the dataframe date columns using |to_datetime|. Notice how
# we don't need to specify the columns to convert.
from skrub import to_datetime

X = to_datetime(X)
X.dtypes

###############################################################################
# Encoding the features
# .....................
@@ -73,27 +83,22 @@
# lower units, as they are probably unimportant.

from sklearn.preprocessing import OneHotEncoder

from skrub import DatetimeEncoder

from sklearn.compose import make_column_transformer
from skrub import DatetimeEncoder

encoder = make_column_transformer(
(OneHotEncoder(handle_unknown="ignore"), ["city"]),
(DatetimeEncoder(add_day_of_the_week=True, extract_until="minute"), ["date.utc"]),
(DatetimeEncoder(add_day_of_the_week=True, resolution="minute"), ["date.utc"]),
remainder="drop",
)

X_enc = encoder.fit_transform(X)
pprint(encoder.get_feature_names_out())

###############################################################################
# We see that the encoder is working as expected: the "date.utc" column has
# been replaced by features extracting the month, day, hour, and day of the
# week information.
#
# Note the year and minute features are not present, this is because they
# have been removed by the encoder as they are constant the whole period.
# We see that the encoder is working as expected: the ``"date.utc"`` column has
# been replaced by features extracting the month, day, hour, minute, day of the
# week, and total seconds since Epoch information.

###############################################################################
# One-liner with the |TableVectorizer|
@@ -104,8 +109,7 @@

from skrub import TableVectorizer

table_vec = TableVectorizer()
table_vec.fit_transform(X)
table_vec = TableVectorizer().fit(X)
pprint(table_vec.get_feature_names_out())

###############################################################################
@@ -116,8 +120,7 @@

table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
table_vec.fit_transform(X)
).fit(X)
pprint(table_vec.get_feature_names_out())

###############################################################################
Expand All @@ -144,14 +147,9 @@
# ```py
# from sklearn.experimental import enable_hist_gradient_boosting
# ```

import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline

table_vec = TableVectorizer(
datetime_transformer=DatetimeEncoder(add_day_of_the_week=True),
)
pipeline = make_pipeline(table_vec, HistGradientBoostingRegressor())

###############################################################################
Expand All @@ -164,11 +162,6 @@
#
# Instead, we can use the |TimeSeriesSplit|,
# which ensures that the test set is always in the future.

sorted_indices = np.argsort(X["date.utc"])
X = X.iloc[sorted_indices]
y = y.iloc[sorted_indices]

from sklearn.model_selection import TimeSeriesSplit, cross_val_score

cross_val_score(
@@ -185,82 +178,71 @@
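The body of the ``cross_val_score`` call is collapsed by the diff view above.
A hedged sketch of what time-series cross-validation with this pipeline could
look like (the ``cv`` and ``scoring`` arguments are assumptions, not the
commit's actual call):

```py
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# Each split trains on the past and evaluates on the future.
scores = cross_val_score(
    pipeline,  # the TableVectorizer + HistGradientBoostingRegressor pipeline above
    X,
    y,
    cv=TimeSeriesSplit(n_splits=5),  # assumed number of splits
    scoring="neg_mean_squared_error",  # assumed metric; the example discusses MSE
)
```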
#
# The mean squared error is not easy to interpret, so we visually compare
# the predictions of our model with the actual values.

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import AutoDateFormatter, AutoDateLocator

X_train = X[X["date.utc"] < "2019-06-01"]
X_test = X[X["date.utc"] >= "2019-06-01"]

y_train = y[X["date.utc"] < "2019-06-01"]
y_test = y[X["date.utc"] >= "2019-06-01"]
mask_train = X["date.utc"] < "2019-06-01"
X_train, X_test = X.loc[mask_train], X.loc[~mask_train]
y_train, y_test = y.loc[mask_train], y.loc[~mask_train]

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

all_cities = X_test["city"].unique()

fig, axs = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9))
fig.subplots_adjust(hspace=0.5)
fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9))
for ax, city in zip(axes, all_cities):
mask_prediction = X_test["city"] == city
date_prediction = X_test.loc[mask_prediction]["date.utc"]
y_prediction = y_pred[mask_prediction]

for i, city in enumerate(all_cities):
axs[i].plot(
X.loc[X.city == city, "date.utc"],
y.loc[X.city == city],
label="Actual",
)
axs[i].plot(
X_test.loc[X_test.city == city, "date.utc"],
pipeline.predict(X_test.loc[X_test.city == city]),
label="Predicted",
mask_reference = X["city"] == city
date_reference = X.loc[mask_reference]["date.utc"]
y_reference = y[mask_reference]

ax.plot(date_reference, y_reference, label="Actual")
ax.plot(date_prediction, y_prediction, label="Predicted")

ax.set(
ylabel="NO2",
title=city,
)
axs[i].set_title(city)
axs[i].set_ylabel("NO2")
xtick_locator = AutoDateLocator(maxticks=8)
xtick_formatter = AutoDateFormatter(xtick_locator)
axs[i].xaxis.set_major_locator(xtick_locator)
axs[i].xaxis.set_major_formatter(xtick_formatter)
axs[i].legend()
ax.legend()

fig.subplots_adjust(hspace=0.5)
plt.show()

###############################################################################
# Let's zoom in on a few days:

X_zoomed = X[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")]
y_zoomed = y[(X["date.utc"] <= "2019-06-04") & (X["date.utc"] >= "2019-06-01")]

X_train_zoomed = X_zoomed[X_zoomed["date.utc"] < "2019-06-03"]
X_test_zoomed = X_zoomed[X_zoomed["date.utc"] >= "2019-06-03"]
mask_zoom_reference = (X["date.utc"] >= "2019-06-01") & (X["date.utc"] < "2019-06-04")
mask_zoom_prediction = (X_test["date.utc"] >= "2019-06-01") & (
X_test["date.utc"] < "2019-06-04"
)

y_train_zoomed = y[X["date.utc"] < "2019-06-03"]
y_test_zoomed = y[X["date.utc"] >= "2019-06-03"]
all_cities = ["Paris", "London"]
fig, axes = plt.subplots(nrows=len(all_cities), ncols=1, figsize=(12, 9))
for ax, city in zip(axes, all_cities):
mask_prediction = (X_test["city"] == city) & mask_zoom_prediction
date_prediction = X_test.loc[mask_prediction]["date.utc"]
y_prediction = y_pred[mask_prediction]

zoomed_cities = X_test_zoomed["city"].unique()
mask_reference = (X["city"] == city) & mask_zoom_reference
date_reference = X.loc[mask_reference]["date.utc"]
y_reference = y[mask_reference]

fig, axs = plt.subplots(nrows=len(zoomed_cities), ncols=1, figsize=(12, 9))
fig.subplots_adjust(hspace=0.5)
ax.plot(date_reference, y_reference, label="Actual")
ax.plot(date_prediction, y_prediction, label="Predicted")

for i, city in enumerate(zoomed_cities):
axs[i].plot(
X_zoomed.loc[X_zoomed["city"] == city, "date.utc"],
y_zoomed.loc[X_zoomed["city"] == city],
label="Actual",
)
axs[i].plot(
X_test_zoomed.loc[X_test_zoomed["city"] == city, "date.utc"],
pipeline.predict(X_test_zoomed.loc[X_test_zoomed["city"] == city]),
label="Predicted",
ax.set(
ylabel="NO2",
title=city,
)
axs[i].set_title(city)
axs[i].set_ylabel("NO2")

xtick_locator = AutoDateLocator(maxticks=8)
xtick_formatter = AutoDateFormatter(xtick_locator)
axs[i].xaxis.set_major_locator(xtick_locator)
axs[i].xaxis.set_major_formatter(xtick_formatter)
ax.legend()

axs[i].legend()
plt.show()


###############################################################################
# Feature importance
# ------------------
@@ -280,27 +262,28 @@

# In this case, we don't use a pipeline, because we want to compute the
# importance of the features created by the DatetimeEncoder
X_ = table_vec.fit_transform(X)
reg = HistGradientBoostingRegressor().fit(X_, y)
result = permutation_importance(reg, X_, y, n_repeats=10, random_state=0)
std = result.importances_std
importances = result.importances_mean
indices = np.argsort(importances)
# Sort from least to most
indices = list(reversed(indices))

plt.figure(figsize=(12, 9))
plt.title("Feature importances")
n = len(indices)
labels = np.array(table_vec.get_feature_names_out())[indices]
plt.barh(range(n), importances[indices], color="b", yerr=std[indices])
plt.yticks(range(n), labels, size=15)
plt.tight_layout(pad=1)
plt.show()
X_transform = table_vec.fit_transform(X)
feature_names = table_vec.get_feature_names_out()

model = HistGradientBoostingRegressor().fit(X_transform, y)
result = permutation_importance(model, X_transform, y, n_repeats=10, random_state=0)

result = pd.DataFrame(
dict(
feature_names=feature_names,
std=result.importances_std,
importances=result.importances_mean,
)
).sort_values("importances", ascending=False)

result.plot.barh(
y="importances", x="feature_names", title="Feature Importances", figsize=(12, 9)
)
plt.tight_layout()

###############################################################################
# We can see that the hour of the day is the most important feature,
# which seems reasonable.
# We can see that the total seconds since Epoch and the hour of the day
# are the most important features, which seems reasonable.
#
# Conclusion
# ----------
3 changes: 2 additions & 1 deletion skrub/__init__.py
@@ -5,7 +5,7 @@

from ._agg_joiner import AggJoiner, AggTarget
from ._check_dependencies import check_dependencies
from ._datetime_encoder import DatetimeEncoder
from ._datetime_encoder import DatetimeEncoder, to_datetime
from ._deduplicate import compute_ngram_distance, deduplicate
from ._fuzzy_join import fuzzy_join
from ._gap_encoder import GapEncoder
@@ -34,6 +34,7 @@
"TargetEncoder",
"deduplicate",
"compute_ngram_distance",
"to_datetime",
"AggJoiner",
"AggTarget",
"SelectCols",
