From 3e41ce7cd6d7bb28e6508ad1f45259c818ce8b1b Mon Sep 17 00:00:00 2001 From: Maxime Carbonneau-Leclerc Date: Fri, 15 Sep 2023 09:40:47 -0400 Subject: [PATCH] Maxi297/fix datetime format inference issue (#30442) --- .../airbyte_cdk/utils/datetime_format_inferrer.py | 12 ++++++++---- .../utils/test_datetime_format_inferrer.py | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py b/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py index 8e29a274d25d..cd423db9c201 100644 --- a/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py +++ b/airbyte-cdk/python/airbyte_cdk/utils/datetime_format_inferrer.py @@ -36,10 +36,14 @@ def _can_be_datetime(self, value: Any) -> bool: This is the case if the value is a string or an integer between 1_000_000_000 and 2_000_000_000 for seconds or between 1_000_000_000_000 and 2_000_000_000_000 for milliseconds. This is separate from the format check for performance reasons""" - for timestamp_range in self._timestamp_heuristic_ranges: - if isinstance(value, str) and (not value.isdecimal() or int(value) in timestamp_range): - return True - if isinstance(value, int) and value in timestamp_range: + if isinstance(value, (str, int)): + try: + value_as_int = int(value) + for timestamp_range in self._timestamp_heuristic_ranges: + if value_as_int in timestamp_range: + return True + except ValueError: + # given that it's not parsable as an int, it can represent a datetime with one of the self._formats return True return False diff --git a/airbyte-cdk/python/unit_tests/utils/test_datetime_format_inferrer.py b/airbyte-cdk/python/unit_tests/utils/test_datetime_format_inferrer.py index 68152184b66f..766007467184 100644 --- a/airbyte-cdk/python/unit_tests/utils/test_datetime_format_inferrer.py +++ b/airbyte-cdk/python/unit_tests/utils/test_datetime_format_inferrer.py @@ -22,6 +22,7 @@ ("timestamp_ms_match_string", [{"d": "1686058051000"}], {"d": "%ms"}), ("timestamp_no_match_integer", [{"d": 99}], {}), ("timestamp_no_match_string", [{"d": "99999999999999999999"}], {}), + ("timestamp_overflow", [{"d": f"{10**100}_100"}], {}), # this case was previously causing OverflowError hence this test ("simple_no_match", [{"d": "20220203"}], {}), ("multiple_match", [{"d": "2022-02-03", "e": "2022-02-03"}], {"d": "%Y-%m-%d", "e": "%Y-%m-%d"}), (