From d6da3331ff623d3f9e70f7a2fbdb2ae7cdca6f31 Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:09:36 +0100 Subject: [PATCH 01/10] Feat: iso_date auto-detection function --- dlt/common/schema/detections.py | 19 +++++++++++++++++++ dlt/common/schema/typing.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/dlt/common/schema/detections.py b/dlt/common/schema/detections.py index 574cb44c93..207c934091 100644 --- a/dlt/common/schema/detections.py +++ b/dlt/common/schema/detections.py @@ -36,6 +36,25 @@ def is_iso_timestamp(t: Type[Any], v: Any) -> Optional[TDataType]: return None +def is_iso_date(t: Type[Any], v: Any) -> Optional[TDataType]: + # only strings can be converted + if not issubclass(t, str): + return None + if not v: + return None + # don't cast iso timestamps as dates + if is_iso_timestamp(t,v): + return None + # strict autodetection of iso timestamps + try: + dtv = parse_iso_like_datetime(v) + if isinstance(dtv, datetime.date): + return "date" + except Exception: + pass + return None + + def is_large_integer(t: Type[Any], v: Any) -> Optional[TDataType]: # only ints can be converted if issubclass(t, int): diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 2cc057560c..ac17f0ae9f 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -25,7 +25,7 @@ """Known hints of a column used to declare hint regexes.""" TWriteDisposition = Literal["skip", "append", "replace", "merge"] TTableFormat = Literal["iceberg"] -TTypeDetections = Literal["timestamp", "iso_timestamp", "large_integer", "hexbytes_to_text", "wei_to_double"] +TTypeDetections = Literal["timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double"] TTypeDetectionFunc = Callable[[Type[Any], Any], Optional[TDataType]] TColumnNames = Union[str, Sequence[str]] """A string representing a column name or a list of""" From f85c8bb9bb8b7d636c98d68074230866ee0cc0cd Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:10:42 +0100 Subject: [PATCH 02/10] Feat: tests for iso_date auto-detection --- tests/common/schema/test_detections.py | 29 +++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/common/schema/test_detections.py b/tests/common/schema/test_detections.py index 3a74c6f368..197f1b2844 100644 --- a/tests/common/schema/test_detections.py +++ b/tests/common/schema/test_detections.py @@ -2,7 +2,7 @@ from dlt.common import pendulum, Decimal, Wei from dlt.common.schema.utils import autodetect_sc_type -from dlt.common.schema.detections import is_hexbytes_to_text, is_timestamp, is_iso_timestamp, is_large_integer, is_wei_to_double, _FLOAT_TS_RANGE, _NOW_TS +from dlt.common.schema.detections import is_hexbytes_to_text, is_timestamp, is_iso_timestamp, is_iso_date, is_large_integer, is_wei_to_double, _FLOAT_TS_RANGE, _NOW_TS def test_timestamp_detection() -> None: @@ -34,6 +34,31 @@ def test_iso_timestamp_detection() -> None: assert is_iso_timestamp(float, str(pendulum.now())) is None +def test_iso_date_detection() -> None: + assert is_iso_date(str, str(pendulum.now().date())) == "date" + assert is_iso_date(str, "1975-05-21") == "date" + + # dont auto-detect timestamps as dates + assert is_iso_date(str, str(pendulum.now())) is None + assert is_iso_date(str, "1975-05-21T22:00:00Z") is None + assert is_iso_date(str, "2022-06-01T00:48:35.040Z") is None + assert is_iso_date(str, "1975-0521T22:00:00Z") is None + assert is_iso_date(str, "2021-07-24 10:51") is None + + # times are not accepted + assert is_iso_date(str, "22:00:00") is None + # wrong formats + assert is_iso_date(str, "0-05-01") is None + assert is_iso_date(str, "") is None + assert is_iso_date(str, "1975-05") is None + assert is_iso_date(str, "1975") is None + assert is_iso_date(str, "01-12") is None + assert is_iso_date(str, "1975/05/01") is None + + # wrong type + assert is_iso_date(float, str(pendulum.now().date())) is None + + def test_detection_large_integer() -> None: assert is_large_integer(str, "A") is None assert is_large_integer(int, 2**64 // 2) == "wei" @@ -56,6 +81,8 @@ def test_detection_function() -> None: assert autodetect_sc_type(None, str, str(pendulum.now())) is None assert autodetect_sc_type(["iso_timestamp"], str, str(pendulum.now())) == "timestamp" assert autodetect_sc_type(["iso_timestamp"], float, str(pendulum.now())) is None + assert autodetect_sc_type(["iso_date"], str, str(pendulum.now().date())) == "date" + assert autodetect_sc_type(["iso_date"], float, str(pendulum.now().date())) is None assert autodetect_sc_type(["timestamp"], str, str(pendulum.now())) is None assert autodetect_sc_type(["timestamp", "iso_timestamp"], float, pendulum.now().timestamp()) == "timestamp" assert autodetect_sc_type(["timestamp", "large_integer"], int, 2**64) == "wei" From 4306d8fe5ad1cd46f61252ec27fa417743d51c3b Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:11:14 +0100 Subject: [PATCH 03/10] Feat: make iso_date autodetection a default behavior --- dlt/common/schema/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index f2075ce85d..2b7b26ab4f 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -698,4 +698,4 @@ def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: def standard_type_detections() -> List[TTypeDetections]: - return ["iso_timestamp"] + return ["iso_timestamp", "iso_date"] From d03b42f33e221388ed5dd64dabe6991b64cf2f99 Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:11:49 +0100 Subject: [PATCH 04/10] NB: update docs --- .../examples/archive/examples/schemas/dlt_quickstart.schema.yaml | 1 + docs/examples/archive/schemas/dlt_quickstart.schema.yaml | 1 + docs/technical/working_with_schemas.md | 1 + docs/website/docs/general-usage/schema.md | 1 + 4 files changed, 4 insertions(+) diff --git a/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml b/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml index 01cdb7f4ea..436aad7530 100644 --- a/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml +++ b/docs/examples/archive/examples/schemas/dlt_quickstart.schema.yaml @@ -109,6 +109,7 @@ settings: detections: - timestamp - iso_timestamp + - iso_date default_hints: not_null: - _dlt_id diff --git a/docs/examples/archive/schemas/dlt_quickstart.schema.yaml b/docs/examples/archive/schemas/dlt_quickstart.schema.yaml index 003868bb96..3994ef7433 100644 --- a/docs/examples/archive/schemas/dlt_quickstart.schema.yaml +++ b/docs/examples/archive/schemas/dlt_quickstart.schema.yaml @@ -84,6 +84,7 @@ normalizers: detections: - timestamp - iso_timestamp + - iso_date names: dlt.common.normalizers.names.snake_case json: module: dlt.common.normalizers.json.relational diff --git a/docs/technical/working_with_schemas.md b/docs/technical/working_with_schemas.md index d8f048c172..d94edb8727 100644 --- a/docs/technical/working_with_schemas.md +++ b/docs/technical/working_with_schemas.md @@ -124,6 +124,7 @@ settings: detections: - timestamp - iso_timestamp + - iso_date ``` ⛔ we may define `all_text` function that will generate string only schemas by telling `dlt` that all types should be coerced to strings. diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 3e690634f3..13347b952b 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -182,6 +182,7 @@ settings: detections: - timestamp - iso_timestamp + - iso_date ``` ### Column hint rules From c82614214f89a48d14a3aa6f63a717dafc26cad7 Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:12:05 +0100 Subject: [PATCH 05/10] fixup! Feat: tests for iso_date auto-detection --- tests/common/cases/schemas/github/issues.schema.json | 5 +++-- .../cases/schemas/sheets/google_spreadsheet_v4.schema.json | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/common/cases/schemas/github/issues.schema.json b/tests/common/cases/schemas/github/issues.schema.json index 2760a20db0..4c4f5425ae 100644 --- a/tests/common/cases/schemas/github/issues.schema.json +++ b/tests/common/cases/schemas/github/issues.schema.json @@ -1294,7 +1294,8 @@ "settings": { "detections": [ "timestamp", - "iso_timestamp" + "iso_timestamp", + "iso_date" ], "default_hints": { "not_null": [ @@ -1318,4 +1319,4 @@ "module": "dlt.common.normalizers.json.relational" } } -} \ No newline at end of file +} diff --git a/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json b/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json index b74a4a5c51..e3a1803371 100644 --- a/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json +++ b/tests/common/cases/schemas/sheets/google_spreadsheet_v4.schema.json @@ -387,11 +387,12 @@ "normalizers": { "detections": [ "timestamp", - "iso_timestamp" + "iso_timestamp", + "iso_date" ], "names": "dlt.common.normalizers.names.snake_case", "json": { "module": "dlt.common.normalizers.json.relational" } } -} \ No newline at end of file +} From 64e91b4424181a1af2b11c36ec6c114778cb5e4a Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:12:18 +0100 Subject: [PATCH 06/10] NB: Fix makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bd522c9ba3..85f67818ac 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ help: @echo " runs flake and mypy" @echo " test" @echo " tests all the components including destinations" - @echo " test-local" + @echo " test-load-local" @echo " tests all components unsing local destinations: duckdb and postgres" @echo " test-common" @echo " tests common components" From e6f5f5a44480966bd5a0b049be2c0aecb7f287db Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:26:22 +0100 Subject: [PATCH 07/10] Fix: linting --- tests/common/schema/test_detections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common/schema/test_detections.py b/tests/common/schema/test_detections.py index 197f1b2844..9cf9e480fe 100644 --- a/tests/common/schema/test_detections.py +++ b/tests/common/schema/test_detections.py @@ -44,7 +44,7 @@ def test_iso_date_detection() -> None: assert is_iso_date(str, "2022-06-01T00:48:35.040Z") is None assert is_iso_date(str, "1975-0521T22:00:00Z") is None assert is_iso_date(str, "2021-07-24 10:51") is None - + # times are not accepted assert is_iso_date(str, "22:00:00") is None # wrong formats From d19f739e0331550ffcdfea91e2e81c5e79f3aeb9 Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:43:00 +0100 Subject: [PATCH 08/10] Fix: support dates with reduced precision --- tests/common/schema/test_detections.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/common/schema/test_detections.py b/tests/common/schema/test_detections.py index 9cf9e480fe..92052a5688 100644 --- a/tests/common/schema/test_detections.py +++ b/tests/common/schema/test_detections.py @@ -37,6 +37,11 @@ def test_iso_timestamp_detection() -> None: def test_iso_date_detection() -> None: assert is_iso_date(str, str(pendulum.now().date())) == "date" assert is_iso_date(str, "1975-05-21") == "date" + assert is_iso_date(str, "19750521") == "date" + + # ISO-8601 allows dates with reduced precision + assert is_iso_date(str, "1975-05") == "date" + assert is_iso_date(str, "1975") == "date" # dont auto-detect timestamps as dates assert is_iso_date(str, str(pendulum.now())) is None @@ -48,10 +53,10 @@ def test_iso_date_detection() -> None: # times are not accepted assert is_iso_date(str, "22:00:00") is None # wrong formats + assert is_iso_date(str, "197505") is None assert is_iso_date(str, "0-05-01") is None assert is_iso_date(str, "") is None - assert is_iso_date(str, "1975-05") is None - assert is_iso_date(str, "1975") is None + assert is_iso_date(str, "75") is None assert is_iso_date(str, "01-12") is None assert is_iso_date(str, "1975/05/01") is None From fcd382e558b41edb97cbd2d1e71b7bf0ed1dea80 Mon Sep 17 00:00:00 2001 From: Skynet Date: Wed, 15 Nov 2023 22:43:32 +0100 Subject: [PATCH 09/10] fixup! Fix: support dates with reduced precision --- tests/common/schema/test_detections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common/schema/test_detections.py b/tests/common/schema/test_detections.py index 92052a5688..13cb09faec 100644 --- a/tests/common/schema/test_detections.py +++ b/tests/common/schema/test_detections.py @@ -38,7 +38,7 @@ def test_iso_date_detection() -> None: assert is_iso_date(str, str(pendulum.now().date())) == "date" assert is_iso_date(str, "1975-05-21") == "date" assert is_iso_date(str, "19750521") == "date" - + # ISO-8601 allows dates with reduced precision assert is_iso_date(str, "1975-05") == "date" assert is_iso_date(str, "1975") == "date" From 91b729308b07c08b5b7578907a5ff387aa74782f Mon Sep 17 00:00:00 2001 From: Skynet Date: Fri, 17 Nov 2023 18:07:24 +0100 Subject: [PATCH 10/10] Fix: don't make iso-date a default auto-detector --- dlt/common/schema/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 2b7b26ab4f..f2075ce85d 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -698,4 +698,4 @@ def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: def standard_type_detections() -> List[TTypeDetections]: - return ["iso_timestamp", "iso_date"] + return ["iso_timestamp"]