Skip to content

Commit

Permalink
[FIX] Hotfix to_datetime for timezone (#834)
Browse files Browse the repository at this point in the history
* hotfix datetime tz

* simplify _is_column_datetime_parsable logic
  • Loading branch information
Vincent-Maladiere authored Nov 22, 2023
1 parent c4b263b commit 8aa1283
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 19 deletions.
40 changes: 21 additions & 19 deletions skrub/_datetime_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
from pandas._libs.tslibs.parsing import guess_datetime_format
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.fixes import parse_version
Expand Down Expand Up @@ -314,7 +315,14 @@ def _get_datetime_column_indices(X_split, dayfirst=True):
for col_idx, X_col in enumerate(X_split):
X_col = X_col[pd.notnull(X_col)] # X_col is a numpy array

if _is_column_datetime_parsable(X_col):
if is_numeric_dtype(X_col):
continue

elif is_datetime64_any_dtype(X_col):
indices.append(col_idx)
index_to_format[col_idx] = None

elif _is_column_datetime_parsable(X_col):
indices.append(col_idx)

# _guess_datetime_format only accepts string columns.
Expand All @@ -339,7 +347,7 @@ def _is_column_datetime_parsable(X_col):
Parameters
----------
X_col : array-like of shape ``(n_samples,)``
X_col : array-like of shape ``(n_samples,)``, of dtype str or object.
Returns
-------
Expand All @@ -355,23 +363,17 @@ def _is_column_datetime_parsable(X_col):
except (ValueError, TypeError):
pass

np_dtypes_candidates = [np.object_, np.str_, np.datetime64]
is_type_datetime_compatible = any(
np.issubdtype(X_col.dtype, np_dtype) for np_dtype in np_dtypes_candidates
)
if is_type_datetime_compatible:
try:
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
# format=mixed parses entries individually,
# avoiding ValueError when both date and datetime formats
# are present.
# At this stage, the format itself doesn't matter.
_ = pd.to_datetime(X_col, format=MIXED_FORMAT)
return True
except (pd.errors.ParserError, ValueError, TypeError):
pass
return False
try:
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=UserWarning)
# format=mixed parses entries individually,
# avoiding ValueError when both date and datetime formats
# are present.
# At this stage, the format itself doesn't matter.
_ = pd.to_datetime(X_col, format=MIXED_FORMAT)
return True
except (pd.errors.ParserError, ValueError, TypeError):
return False


def _guess_datetime_format(X_col):
Expand Down
18 changes: 18 additions & 0 deletions skrub/tests/test_datetime_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pandas as pd
import pytest
from numpy.testing import assert_allclose, assert_array_equal
from pandas.api.types import is_datetime64_any_dtype
from pandas.testing import assert_frame_equal

from skrub._datetime_encoder import (
Expand Down Expand Up @@ -175,6 +176,23 @@ def test_fit(
assert enc.get_feature_names_out() == expected_feature_names


@pytest.mark.parametrize(
    "get_data_func, expected_datetime_columns",
    [
        (get_date, [0, 1, 2]),
        (get_datetime, [0, 1, 2]),
        (get_tz_datetime, [0]),
        (get_mixed_type_dataframe, ["a", "e"]),
    ],
)
def test_to_datetime(get_data_func, expected_datetime_columns):
    """Check which columns end up with a datetime dtype after ``to_datetime``.

    Parameters
    ----------
    get_data_func : callable
        Zero-argument fixture factory producing the input passed to
        ``to_datetime``.
    expected_datetime_columns : list
        Column labels expected to hold a datetime64 dtype (timezone-aware or
        not) once the converted output is wrapped in a DataFrame.
    """
    converted = pd.DataFrame(to_datetime(get_data_func()))

    # Collect, in column order, every label whose dtype is any flavour of
    # datetime64 so the comparison below is order-sensitive.
    detected_columns = []
    for label in converted.columns:
        if is_datetime64_any_dtype(converted[label]):
            detected_columns.append(label)

    assert_array_equal(detected_columns, expected_datetime_columns)


def test_format_nan():
X = get_nan_datetime()
enc = DatetimeEncoder().fit(X)
Expand Down

0 comments on commit 8aa1283

Please sign in to comment.