From 78d6d71e13238b7087bc54b7806f159e991d0af0 Mon Sep 17 00:00:00 2001 From: Dave Date: Sun, 27 Aug 2023 17:51:25 +0200 Subject: [PATCH 01/73] basic schema freezing --- dlt/normalize/configuration.py | 5 ++- dlt/normalize/normalize.py | 26 +++++++++++--- tests/load/test_freeze_schema.py | 58 ++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 6 deletions(-) create mode 100644 tests/load/test_freeze_schema.py diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index c4ed7aa89a..2f2f20dc86 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -1,15 +1,18 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Literal from dlt.common.configuration import configspec from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType from dlt.common.storages import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration +TSchemaUpdateMode = Literal["update-schema", "freeze-and-discard", "freeze-and-raise"] + @configspec class NormalizeConfiguration(PoolRunnerConfiguration): pool_type: TPoolType = "process" destination_capabilities: DestinationCapabilitiesContext = None # injectable + schema_update_mode: TSchemaUpdateMode = "update-schema" _schema_storage_config: SchemaStorageConfiguration _normalize_storage_config: NormalizeStorageConfiguration _load_storage_config: LoadStorageConfiguration diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 19a63e5f80..979083f826 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -67,6 +67,7 @@ def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Sc @staticmethod def w_normalize_files( + normalize_config: NormalizeConfiguration, normalize_storage_config: NormalizeStorageConfiguration, loader_storage_config: LoadStorageConfiguration, destination_caps: DestinationCapabilitiesContext, @@ -74,7 +75,6 @@ def w_normalize_files( load_id: str, extracted_items_files: Sequence[str], ) -> TWorkerRV: - schema_updates: List[TSchemaUpdate] = [] total_items = 0 row_counts: TRowCount = {} @@ -98,7 +98,7 @@ def w_normalize_files( items_count = 0 for line_no, line in enumerate(f): items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items) + partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items) schema_updates.append(partial_update) total_items += items_count merge_row_count(row_counts, r_counts) @@ -127,7 +127,7 @@ def w_normalize_files( return schema_updates, total_items, load_storage.closed_files(), row_counts @staticmethod - def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas: Dict[str, TTableSchemaColumns] = {} # quick access to column schema for writers below schema_update: TSchemaUpdate = {} schema_name = schema.name @@ -145,8 +145,23 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, row[k] = custom_pua_decode(v) # type: ignore # coerce row of values into schema 
table, generating partial table with new columns if any row, partial_table = schema.coerce_row(table_name, parent_table, row) + + # if there is a schema update and we froze schema and discaro additional data, clean up + if partial_table and config.schema_update_mode == "freeze-and-discard": + # do not create new tables + if table_name not in schema.tables: + continue + # pop unknown values + for item in list(row.keys()): + if item not in schema.tables[table_name]["columns"]: + row.pop(item) + + # if there is a schema update and we disallow any data not fitting the schema, raise! + elif partial_table and config.schema_update_mode == "freeze-and-raise": + raise Exception("Schema frozen!") + # theres a new table or new columns in existing table - if partial_table: + elif partial_table: # update schema and save the change schema.update_schema(partial_table) table_updates = schema_update.setdefault(table_name, []) @@ -196,7 +211,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM workers = self.pool._processes # type: ignore chunk_files = self.group_worker_files(files, workers) schema_dict: TStoredSchema = schema.to_dict() - config_tuple = (self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict) + config_tuple = (self.config, self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict) param_chunk = [[*config_tuple, load_id, files] for files in chunk_files] tasks: List[Tuple[AsyncResult[TWorkerRV], List[Any]]] = [] row_counts: TRowCount = {} @@ -249,6 +264,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV: result = Normalize.w_normalize_files( + self.config, self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py new file mode 100644 index 0000000000..72b26cc7c9 --- /dev/null +++ b/tests/load/test_freeze_schema.py @@ -0,0 +1,58 @@ +from typing import Dict +import yaml +import dlt, os, pytest +from dlt.common.utils import uniq_id + +from tests.pipeline.utils import assert_load_info +from tests.load.pipeline.utils import drop_active_pipeline_data, load_table_counts, load_tables_to_dicts +from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from dlt.pipeline.exceptions import PipelineStepFailed + +@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name) +def test_freeze_schema(destination_config: DestinationTestConfiguration) -> None: + + pipeline = destination_config.setup_pipeline("test_freeze_schema", dataset_name="freeze" + uniq_id()) + + @dlt.resource(name="items", write_disposition="append") + def load_items(): + global offset + for _, index in enumerate(range(0, 120), 1): + yield { + "id": index, + "name": f"item {index}" + } + + @dlt.resource(name="items", write_disposition="append") + def load_items_with_subitems(): + global offset + for _, index in enumerate(range(0, 120), 1): + yield { + "id": index, + "name": f"item {index}", + "new_attribute": "hello", + "sub_items": [{ + "id": index + 1000, + "name": f"sub item {index + 1000}" + },{ + "id": index + 2000, + "name": f"sub item {index + 2000}" + }] + } + + pipeline.run([load_items], loader_file_format=destination_config.file_format) + + # freeze 
pipeline, drop additional values + os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = "freeze-and-discard" + pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + + # schema was not migrated to contain new subtable + assert "items__sub_items" not in pipeline.default_schema.tables + # schema was not migrated to contain new attribute + assert "new_attribute" not in pipeline.default_schema.tables["items"]["columns"] + + # now raise on migration + os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = "freeze-and-raise" + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + assert isinstance(py_ex.value.__context__, Exception) + From 09d8c636d8836eb8f82b8e7cdc4d07e17013c9fd Mon Sep 17 00:00:00 2001 From: Dave Date: Sun, 27 Aug 2023 22:31:43 +0200 Subject: [PATCH 02/73] small changes --- dlt/normalize/exceptions.py | 9 +++++++++ dlt/normalize/normalize.py | 3 ++- tests/load/test_freeze_schema.py | 23 ++++++++++++++++------- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/dlt/normalize/exceptions.py b/dlt/normalize/exceptions.py index e69de29bb2..6b1d01c932 100644 --- a/dlt/normalize/exceptions.py +++ b/dlt/normalize/exceptions.py @@ -0,0 +1,9 @@ +from dlt.common.exceptions import DltException + +class NormalizeException(DltException): + def __init__(self, msg: str) -> None: + super().__init__(msg) + +class SchemaFrozenException(DltException): + def __init__(self, msg: str) -> None: + super().__init__(msg) \ No newline at end of file diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 979083f826..537ee149b8 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -20,6 +20,7 @@ from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import NormalizeInfo from dlt.common.utils import chunks, TRowCount, merge_row_count, increase_row_count +from dlt.normalize.exceptions import SchemaFrozenException from dlt.normalize.configuration import NormalizeConfiguration @@ -158,7 +159,7 @@ def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage # if there is a schema update and we disallow any data not fitting the schema, raise! 
elif partial_table and config.schema_update_mode == "freeze-and-raise": - raise Exception("Schema frozen!") + raise SchemaFrozenException(f"Trying to modify table {table_name} but schema is frozen.") # theres a new table or new columns in existing table elif partial_table: diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 72b26cc7c9..bf414ac7f5 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -1,12 +1,11 @@ -from typing import Dict -import yaml import dlt, os, pytest from dlt.common.utils import uniq_id -from tests.pipeline.utils import assert_load_info -from tests.load.pipeline.utils import drop_active_pipeline_data, load_table_counts, load_tables_to_dicts +from tests.load.pipeline.utils import load_table_counts from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration from dlt.pipeline.exceptions import PipelineStepFailed +from dlt.normalize.exceptions import SchemaFrozenException +from dlt.common.schema import utils @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name) def test_freeze_schema(destination_config: DestinationTestConfiguration) -> None: @@ -16,7 +15,7 @@ def test_freeze_schema(destination_config: DestinationTestConfiguration) -> None @dlt.resource(name="items", write_disposition="append") def load_items(): global offset - for _, index in enumerate(range(0, 120), 1): + for _, index in enumerate(range(0, 10), 1): yield { "id": index, "name": f"item {index}" @@ -25,7 +24,7 @@ def load_items(): @dlt.resource(name="items", write_disposition="append") def load_items_with_subitems(): global offset - for _, index in enumerate(range(0, 120), 1): + for _, index in enumerate(range(0, 10), 1): yield { "id": index, "name": f"item {index}", @@ -40,11 +39,21 @@ def load_items_with_subitems(): } pipeline.run([load_items], loader_file_format=destination_config.file_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + # check data + assert table_counts["items"] == 10 + schema_hash = utils.generate_version_hash(pipeline.default_schema.to_dict()) # freeze pipeline, drop additional values os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = "freeze-and-discard" pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + # check schema has not changed + assert schema_hash == utils.generate_version_hash(pipeline.default_schema.to_dict()) + # check data + assert table_counts["items"] == 20 + assert "items__sub_items" not in table_counts # schema was not migrated to contain new subtable assert "items__sub_items" not in pipeline.default_schema.tables # schema was not migrated to contain new attribute @@ -54,5 +63,5 @@ def load_items_with_subitems(): os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = "freeze-and-raise" with pytest.raises(PipelineStepFailed) as py_ex: pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) - assert isinstance(py_ex.value.__context__, Exception) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) From aec10b10812768766dab476a6c41052ee110c082 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 28 Aug 2023 15:58:50 +0200 Subject: [PATCH 03/73] temp --- dlt/common/schema/schema.py | 6 +++++ dlt/normalize/configuration.py | 2 +- 
dlt/normalize/normalize.py | 20 ++++++++++------ tests/load/test_freeze_schema.py | 39 ++++++++++++++++++++------------ 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index e5421b2e12..6211cda89c 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -323,6 +323,12 @@ def tables(self) -> TSchemaTables: @property def settings(self) -> TSchemaSettings: return self._settings + + @property + def has_data_columns(self) -> bool: + for table in self.data_tables(): + return True + return False def to_pretty_json(self, remove_defaults: bool = True) -> str: d = self.to_dict(remove_defaults=remove_defaults) diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index 2f2f20dc86..5fcce38cb9 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -5,7 +5,7 @@ from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType from dlt.common.storages import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration -TSchemaUpdateMode = Literal["update-schema", "freeze-and-discard", "freeze-and-raise"] +TSchemaUpdateMode = Literal["update-schema", "freeze-and-filter", "freeze-and-raise", "freeze-and-discard"] @configspec diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 537ee149b8..568d399028 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -86,6 +86,8 @@ def w_normalize_files( load_storage = LoadStorage(False, destination_caps.preferred_loader_file_format, LoadStorage.ALL_SUPPORTED_FILE_FORMATS, loader_storage_config) normalize_storage = NormalizeStorage(False, normalize_storage_config) + schema_has_columns = schema.has_data_columns + try: root_tables: Set[str] = set() populated_root_tables: Set[str] = set() @@ -99,7 +101,7 @@ def w_normalize_files( items_count = 0 for line_no, line in enumerate(f): items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items) + partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items, schema_has_columns) schema_updates.append(partial_update) total_items += items_count merge_row_count(row_counts, r_counts) @@ -128,7 +130,7 @@ def w_normalize_files( return schema_updates, total_items, load_storage.closed_files(), row_counts @staticmethod - def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem], schema_has_columns: bool) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas: Dict[str, TTableSchemaColumns] = {} # quick access to column schema for writers below schema_update: TSchemaUpdate = {} schema_name = schema.name @@ -146,19 +148,23 @@ def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage row[k] = custom_pua_decode(v) # type: ignore # coerce row of values into schema table, generating partial table with new columns if any row, partial_table = schema.coerce_row(table_name, parent_table, row) - - # if there is a schema update and we froze schema and discaro additional data, clean up - if partial_table and 
config.schema_update_mode == "freeze-and-discard": + + # if there is a schema update and we froze schema and filter additional data, clean up + if schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-filter": # do not create new tables - if table_name not in schema.tables: + if table_name not in schema.tables or not len(schema.tables[table_name].get("columns", {})): continue # pop unknown values for item in list(row.keys()): if item not in schema.tables[table_name]["columns"]: row.pop(item) + # if there is a schema update and we froze schema and discard additional rows, just continue + elif schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-filter": + continue + # if there is a schema update and we disallow any data not fitting the schema, raise! - elif partial_table and config.schema_update_mode == "freeze-and-raise": + elif schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-raise": raise SchemaFrozenException(f"Trying to modify table {table_name} but schema is frozen.") # theres a new table or new columns in existing table diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index bf414ac7f5..3c85fd9194 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -7,10 +7,16 @@ from dlt.normalize.exceptions import SchemaFrozenException from dlt.common.schema import utils +SCHEMA_UPDATE_MODES = ["update-schema", "freeze-and-filter", "freeze-and-raise", "freeze-and-discard"] + @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name) -def test_freeze_schema(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize("update_mode", SCHEMA_UPDATE_MODES) +def test_freeze_schema(update_mode: str, destination_config: DestinationTestConfiguration) -> None: - pipeline = destination_config.setup_pipeline("test_freeze_schema", dataset_name="freeze" + uniq_id()) + # freeze pipeline, drop additional values + # this will allow for the first run to create the schema, but will not accept further updates after that + os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = update_mode + pipeline = destination_config.setup_pipeline("test_freeze_schema_2", dataset_name="freeze" + uniq_id()) @dlt.resource(name="items", write_disposition="append") def load_items(): @@ -44,14 +50,25 @@ def load_items_with_subitems(): assert table_counts["items"] == 10 schema_hash = utils.generate_version_hash(pipeline.default_schema.to_dict()) - # freeze pipeline, drop additional values - os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = "freeze-and-discard" - pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - # check schema has not changed - assert schema_hash == utils.generate_version_hash(pipeline.default_schema.to_dict()) + + # on freeze and raise we expect an exception + if update_mode == "freeze-and-raise": + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) + else: + pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + + + + # check schema has not changed for frozen modes + if update_mode != "update-schema": + assert schema_hash == 
utils.generate_version_hash(pipeline.default_schema.to_dict()) + + return # check data + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert "items__sub_items" not in table_counts # schema was not migrated to contain new subtable @@ -59,9 +76,3 @@ def load_items_with_subitems(): # schema was not migrated to contain new attribute assert "new_attribute" not in pipeline.default_schema.tables["items"]["columns"] - # now raise on migration - os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = "freeze-and-raise" - with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) - From 9441dcbff42f19f7e3539d6f62649af8af8f7f32 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 28 Aug 2023 16:12:49 +0200 Subject: [PATCH 04/73] add new schema update mode --- dlt/normalize/normalize.py | 2 +- tests/load/test_freeze_schema.py | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 568d399028..6561592b81 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -160,7 +160,7 @@ def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage row.pop(item) # if there is a schema update and we froze schema and discard additional rows, just continue - elif schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-filter": + elif schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-discard": continue # if there is a schema update and we disallow any data not fitting the schema, raise! 
diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 3c85fd9194..2f4d78ade6 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -50,7 +50,6 @@ def load_items_with_subitems(): assert table_counts["items"] == 10 schema_hash = utils.generate_version_hash(pipeline.default_schema.to_dict()) - # on freeze and raise we expect an exception if update_mode == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: @@ -59,20 +58,19 @@ def load_items_with_subitems(): else: pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + # check data + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 if update_mode not in ["freeze-and-raise", "freeze-and-discard"] else 10 - - # check schema has not changed for frozen modes + # frozen schemas should not have changed if update_mode != "update-schema": assert schema_hash == utils.generate_version_hash(pipeline.default_schema.to_dict()) - - return - - # check data - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 20 - assert "items__sub_items" not in table_counts - # schema was not migrated to contain new subtable - assert "items__sub_items" not in pipeline.default_schema.tables - # schema was not migrated to contain new attribute - assert "new_attribute" not in pipeline.default_schema.tables["items"]["columns"] + assert "items__sub_items" not in table_counts + # schema was not migrated to contain new attribute + assert "new_attribute" not in pipeline.default_schema.tables["items"]["columns"] + # regular mode evolves the schema + else: + assert table_counts["items__sub_items"] == 20 + # schema was not migrated to contain new attribute + assert "new_attribute" in pipeline.default_schema.tables["items"]["columns"] From edad4ad76483c80b5e35cc8792eec7ba01d33b9c Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 31 Aug 2023 09:47:15 +0200 Subject: [PATCH 05/73] fix linting errors and one bug --- dlt/common/schema/schema.py | 4 ++-- dlt/normalize/normalize.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 6211cda89c..34c6bb6e03 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -323,11 +323,11 @@ def tables(self) -> TSchemaTables: @property def settings(self) -> TSchemaSettings: return self._settings - + @property def has_data_columns(self) -> bool: for table in self.data_tables(): - return True + return bool(table.get("columns", None)) return False def to_pretty_json(self, remove_defaults: bool = True) -> str: diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 6561592b81..3362ff160d 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -148,7 +148,7 @@ def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage row[k] = custom_pua_decode(v) # type: ignore # coerce row of values into schema table, generating partial table with new columns if any row, partial_table = schema.coerce_row(table_name, parent_table, row) - + # if there is a schema update and we froze schema and filter additional data, clean up if schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-filter": # do not create new tables From bef7ea4eb6a86c9e84a8aebe19ec9d6b931037f6 Mon Sep 17 00:00:00 2001 From: 
Dave Date: Thu, 31 Aug 2023 13:31:48 +0200 Subject: [PATCH 06/73] move freeze code to schema --- dlt/common/schema/exceptions.py | 5 ++ dlt/common/schema/schema.py | 36 ++++++++++++-- dlt/common/schema/typing.py | 2 + dlt/normalize/configuration.py | 6 +-- dlt/normalize/exceptions.py | 4 -- dlt/normalize/normalize.py | 83 +++++++++++++------------------- tests/load/test_freeze_schema.py | 6 +-- 7 files changed, 78 insertions(+), 64 deletions(-) diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 2245a77b61..d68dbc47a3 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -69,3 +69,8 @@ def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engi self.from_engine = from_engine self.to_engine = to_engine super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}") + + +class SchemaFrozenException(SchemaException): + def __init__(self, msg: str) -> None: + super().__init__(msg) \ No newline at end of file diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 34c6bb6e03..9f05aadf91 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -1,6 +1,6 @@ import yaml from copy import copy, deepcopy -from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast +from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast, Literal from dlt.common import json from dlt.common.utils import extend_list_deduplicated @@ -11,10 +11,13 @@ from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, TColumnSchemaBase, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TWriteDisposition) + TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TWriteDisposition, TSchemaUpdateMode) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict +from dlt.common.schema.exceptions import SchemaFrozenException + + class Schema: @@ -174,7 +177,32 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D updated_table_partial["columns"][new_col_name] = new_col_def return new_row, updated_table_partial - + + def check_schema_update(self, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, schema_update_mode: TSchemaUpdateMode) -> Tuple[DictStrAny, TPartialTableSchema]: + """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" + has_columns = self.has_data_columns + # if there is a schema update and we froze schema and filter additional data, clean up + if has_columns and partial_table and schema_update_mode == "freeze-and-trim": + # do not create new tables + if table_name not in self.tables or not len(self.tables[table_name].get("columns", {})): + return None, None + # pop unknown values + for item in list(row.keys()): + if item not in self.tables[table_name]["columns"]: + row.pop(item) + return row, None + + # if there is a schema update and we froze schema and discard 
additional rows, do nothing + elif has_columns and partial_table and schema_update_mode == "freeze-and-discard": + return None, None + + # if there is a schema update and we disallow any data not fitting the schema, raise! + elif has_columns and partial_table and schema_update_mode == "freeze-and-raise": + raise SchemaFrozenException(f"Trying to modify table {table_name} but schema is frozen.") + + return row, partial_table + + def update_schema(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: table_name = partial_table["name"] parent_table_name = partial_table.get("parent") @@ -327,7 +355,7 @@ def settings(self) -> TSchemaSettings: @property def has_data_columns(self) -> bool: for table in self.data_tables(): - return bool(table.get("columns", None)) + return bool(table.get("columns", None)) return False def to_pretty_json(self, remove_defaults: bool = True) -> str: diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 2a3f78fb62..06935d9629 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -104,3 +104,5 @@ class TStoredSchema(TypedDict, total=False): settings: Optional[TSchemaSettings] tables: TSchemaTables normalizers: TNormalizersConfig + +TSchemaUpdateMode = Literal["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index 5fcce38cb9..dfba9b16b3 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -4,15 +4,13 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType from dlt.common.storages import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration - -TSchemaUpdateMode = Literal["update-schema", "freeze-and-filter", "freeze-and-raise", "freeze-and-discard"] - +from dlt.common.schema.typing import TSchemaUpdateMode @configspec class NormalizeConfiguration(PoolRunnerConfiguration): pool_type: TPoolType = "process" destination_capabilities: DestinationCapabilitiesContext = None # injectable - schema_update_mode: TSchemaUpdateMode = "update-schema" + schema_update_mode: TSchemaUpdateMode = "evolve" _schema_storage_config: SchemaStorageConfiguration _normalize_storage_config: NormalizeStorageConfiguration _load_storage_config: LoadStorageConfiguration diff --git a/dlt/normalize/exceptions.py b/dlt/normalize/exceptions.py index 6b1d01c932..79da16b925 100644 --- a/dlt/normalize/exceptions.py +++ b/dlt/normalize/exceptions.py @@ -3,7 +3,3 @@ class NormalizeException(DltException): def __init__(self, msg: str) -> None: super().__init__(msg) - -class SchemaFrozenException(DltException): - def __init__(self, msg: str) -> None: - super().__init__(msg) \ No newline at end of file diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 3362ff160d..18ea8965c0 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -20,7 +20,6 @@ from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import NormalizeInfo from dlt.common.utils import chunks, TRowCount, merge_row_count, increase_row_count -from dlt.normalize.exceptions import SchemaFrozenException from dlt.normalize.configuration import NormalizeConfiguration @@ -86,8 +85,6 @@ def w_normalize_files( load_storage = LoadStorage(False, destination_caps.preferred_loader_file_format, LoadStorage.ALL_SUPPORTED_FILE_FORMATS, loader_storage_config) normalize_storage = 
NormalizeStorage(False, normalize_storage_config) - schema_has_columns = schema.has_data_columns - try: root_tables: Set[str] = set() populated_root_tables: Set[str] = set() @@ -101,7 +98,7 @@ def w_normalize_files( items_count = 0 for line_no, line in enumerate(f): items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items, schema_has_columns) + partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items) schema_updates.append(partial_update) total_items += items_count merge_row_count(row_counts, r_counts) @@ -130,7 +127,7 @@ def w_normalize_files( return schema_updates, total_items, load_storage.closed_files(), row_counts @staticmethod - def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem], schema_has_columns: bool) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas: Dict[str, TTableSchemaColumns] = {} # quick access to column schema for writers below schema_update: TSchemaUpdate = {} schema_name = schema.name @@ -142,50 +139,38 @@ def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) # do not process empty rows - if row: - # decode pua types - for k, v in row.items(): - row[k] = custom_pua_decode(v) # type: ignore - # coerce row of values into schema table, generating partial table with new columns if any - row, partial_table = schema.coerce_row(table_name, parent_table, row) - - # if there is a schema update and we froze schema and filter additional data, clean up - if schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-filter": - # do not create new tables - if table_name not in schema.tables or not len(schema.tables[table_name].get("columns", {})): - continue - # pop unknown values - for item in list(row.keys()): - if item not in schema.tables[table_name]["columns"]: - row.pop(item) - - # if there is a schema update and we froze schema and discard additional rows, just continue - elif schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-discard": - continue - - # if there is a schema update and we disallow any data not fitting the schema, raise! 
- elif schema_has_columns and partial_table and config.schema_update_mode == "freeze-and-raise": - raise SchemaFrozenException(f"Trying to modify table {table_name} but schema is frozen.") - - # theres a new table or new columns in existing table - elif partial_table: - # update schema and save the change - schema.update_schema(partial_table) - table_updates = schema_update.setdefault(table_name, []) - table_updates.append(partial_table) - # update our columns - column_schemas[table_name] = schema.get_table_columns(table_name) - # get current columns schema - columns = column_schemas.get(table_name) - if not columns: - columns = schema.get_table_columns(table_name) - column_schemas[table_name] = columns - # store row - # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock - load_storage.write_data_item(load_id, schema_name, table_name, row, columns) - # count total items - items_count += 1 - increase_row_count(row_counts, table_name, 1) + if not row: + continue + # decode pua types + for k, v in row.items(): + row[k] = custom_pua_decode(v) # type: ignore + # coerce row of values into schema table, generating partial table with new columns if any + row, partial_table = schema.coerce_row(table_name, parent_table, row) + # check update + row, partial_table = schema.check_schema_update(table_name, row, partial_table, config.schema_update_mode) + + if not row: + continue + + # theres a new table or new columns in existing table + if partial_table: + # update schema and save the change + schema.update_schema(partial_table) + table_updates = schema_update.setdefault(table_name, []) + table_updates.append(partial_table) + # update our columns + column_schemas[table_name] = schema.get_table_columns(table_name) + # get current columns schema + columns = column_schemas.get(table_name) + if not columns: + columns = schema.get_table_columns(table_name) + column_schemas[table_name] = columns + # store row + # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock + load_storage.write_data_item(load_id, schema_name, table_name, row, columns) + # count total items + items_count += 1 + increase_row_count(row_counts, table_name, 1) signals.raise_if_signalled() return schema_update, items_count, row_counts diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 2f4d78ade6..f2d25b9ad8 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -4,10 +4,10 @@ from tests.load.pipeline.utils import load_table_counts from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration from dlt.pipeline.exceptions import PipelineStepFailed -from dlt.normalize.exceptions import SchemaFrozenException +from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema import utils -SCHEMA_UPDATE_MODES = ["update-schema", "freeze-and-filter", "freeze-and-raise", "freeze-and-discard"] +SCHEMA_UPDATE_MODES = ["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name) @pytest.mark.parametrize("update_mode", SCHEMA_UPDATE_MODES) @@ -63,7 +63,7 @@ def load_items_with_subitems(): assert table_counts["items"] == 20 if update_mode not in ["freeze-and-raise", "freeze-and-discard"] else 10 # frozen schemas should not have changed - if update_mode != "update-schema": + 
if update_mode != "evolve": assert schema_hash == utils.generate_version_hash(pipeline.default_schema.to_dict()) assert "items__sub_items" not in table_counts # schema was not migrated to contain new attribute From fc6f08368f981081528c22cd36ccacde75b8f31e Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 4 Sep 2023 15:57:12 +0200 Subject: [PATCH 07/73] some work on schema evolution modes --- dlt/common/schema/schema.py | 29 +++++++++++++++++++---------- dlt/common/schema/typing.py | 15 ++++++++++++--- dlt/common/schema/utils.py | 11 +++++++++-- dlt/common/validation.py | 3 +++ dlt/extract/decorators.py | 9 +++++++-- dlt/extract/schema.py | 9 +++++---- dlt/normalize/configuration.py | 2 -- dlt/normalize/normalize.py | 14 ++++++-------- tests/load/test_freeze_schema.py | 17 ++++++++--------- 9 files changed, 69 insertions(+), 40 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 9f05aadf91..498db26fc2 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -11,7 +11,7 @@ from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, TColumnSchemaBase, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TWriteDisposition, TSchemaUpdateMode) + TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaEvolutionModes) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict @@ -177,13 +177,23 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D updated_table_partial["columns"][new_col_name] = new_col_def return new_row, updated_table_partial - - def check_schema_update(self, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, schema_update_mode: TSchemaUpdateMode) -> Tuple[DictStrAny, TPartialTableSchema]: + + def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" + + # for now we defined the schema as new if there are no data columns defined has_columns = self.has_data_columns + if not has_columns: + return row, partial_table + + # resolve evolution settings + table_with_settings = parent_table or table_name + evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", "evolve") + if isinstance(evolution_settings, str): + evolution_settings = TSchemaEvolutionModes(table=evolution_settings, column=evolution_settings, column_variant=evolution_settings) + # if there is a schema update and we froze schema and filter additional data, clean up - if has_columns and partial_table and schema_update_mode == "freeze-and-trim": - # do not create new tables + if evolution_settings["table"] == "freeze-and-trim": if table_name not in self.tables or not len(self.tables[table_name].get("columns", {})): return None, None # pop unknown values @@ -193,16 +203,15 @@ def check_schema_update(self, table_name: str, row: DictStrAny, partial_table: T return 
row, None # if there is a schema update and we froze schema and discard additional rows, do nothing - elif has_columns and partial_table and schema_update_mode == "freeze-and-discard": + elif evolution_settings["table"] == "freeze-and-discard": return None, None # if there is a schema update and we disallow any data not fitting the schema, raise! - elif has_columns and partial_table and schema_update_mode == "freeze-and-raise": + elif evolution_settings["table"] == "freeze-and-raise": raise SchemaFrozenException(f"Trying to modify table {table_name} but schema is frozen.") - + return row, partial_table - - + def update_schema(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: table_name = partial_table["name"] parent_table_name = partial_table.get("parent") diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 06935d9629..2a2871f644 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -61,6 +61,15 @@ class TColumnSchema(TColumnSchemaBase, total=False): TColumnName = NewType("TColumnName", str) SIMPLE_REGEX_PREFIX = "re:" +TSchemaEvolutionMode = Literal["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] + +class TSchemaEvolutionModes(TypedDict, total=False): + """TypedDict defining the schema update settings""" + table: TSchemaEvolutionMode + column: TSchemaEvolutionMode + column_variant: TSchemaEvolutionMode + +TSchemaEvolutionSettings = Union[TSchemaEvolutionMode, TSchemaEvolutionModes] class TRowFilters(TypedDict, total=True): excludes: Optional[List[TSimpleRegex]] @@ -72,7 +81,7 @@ class TTableSchema(TypedDict, total=False): name: Optional[str] description: Optional[str] write_disposition: Optional[TWriteDisposition] - table_sealed: Optional[bool] + schema_evolution_settings: Optional[TSchemaEvolutionSettings] parent: Optional[str] filters: Optional[TRowFilters] columns: TTableSchemaColumns @@ -86,8 +95,9 @@ class TPartialTableSchema(TTableSchema): TSchemaTables = Dict[str, TTableSchema] TSchemaUpdate = Dict[str, List[TPartialTableSchema]] + class TSchemaSettings(TypedDict, total=False): - schema_sealed: Optional[bool] + schema_evolution_settings: Optional[TSchemaEvolutionSettings] detections: Optional[List[TTypeDetections]] default_hints: Optional[Dict[TColumnHint, List[TSimpleRegex]]] preferred_types: Optional[Dict[TSimpleRegex, TDataType]] @@ -105,4 +115,3 @@ class TStoredSchema(TypedDict, total=False): tables: TSchemaTables normalizers: TNormalizersConfig -TSchemaUpdateMode = Literal["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 94efa975e8..f892a21354 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -16,7 +16,7 @@ from dlt.common.schema import detections from dlt.common.schema.typing import (SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, - TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition) + TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaEvolutionSettings, TSchemaEvolutionModes) from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, TablePropertiesConflictException, InvalidSchemaName) @@ -403,6 +403,10 @@ def 
merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPa if table.get('parent') is None and (resource := partial_table.get('resource')): table['resource'] = resource + partial_e_s = partial_table.get("schema_evolution_settings") + if partial_e_s: + table["schema_evolution_settings"] = partial_e_s + return diff_table @@ -568,7 +572,8 @@ def new_table( write_disposition: TWriteDisposition = None, columns: Sequence[TColumnSchema] = None, validate_schema: bool = False, - resource: str = None + resource: str = None, + schema_evolution_settings: TSchemaEvolutionSettings = None, ) -> TTableSchema: table: TTableSchema = { @@ -579,10 +584,12 @@ def new_table( table["parent"] = parent_table_name assert write_disposition is None assert resource is None + assert schema_evolution_settings is None else: # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name + table["schema_evolution_settings"] = schema_evolution_settings if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/common/validation.py b/dlt/common/validation.py index f1900c1b0e..36c7f8cac7 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -50,6 +50,9 @@ def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFil def verify_prop(pk: str, pv: Any, t: Any) -> None: if is_optional_type(t): + # pass if value actually is none + if pv is None: + return t = extract_optional_type(t) if is_literal_type(t): diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index dd756b1e6b..c5515f2fda 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -14,7 +14,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaEvolutionSettings from dlt.extract.utils import ensure_table_schema_columns_hint from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage @@ -200,6 +200,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> Callable[TResourceFunParams, DltResource]: @@ -215,6 +216,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: @@ -230,6 +232,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> DltResource: @@ -245,6 +248,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] 
= None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, depends_on: TUnboundDltResource = None, @@ -311,7 +315,8 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa write_disposition=write_disposition, columns=schema_columns, primary_key=primary_key, - merge_key=merge_key + merge_key=merge_key, + schema_evolution_settings=schema_evolution_settings ) return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, depends_on), incremental=incremental) diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index 709f5c8b0a..46f29c1d1e 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -3,7 +3,7 @@ from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns +from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaEvolutionSettings from dlt.common.typing import TDataItem from dlt.common.validation import validate_dict_ignoring_xkeys @@ -23,6 +23,7 @@ class TTableSchemaTemplate(TypedDict, total=False): primary_key: TTableHintTemplate[TColumnNames] merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] + schema_evolution_settings: TSchemaEvolutionSettings class DltResourceSchema: @@ -181,7 +182,8 @@ def new_table_template( write_disposition: TTableHintTemplate[TWriteDisposition] = None, columns: TTableHintTemplate[TTableSchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, - merge_key: TTableHintTemplate[TColumnNames] = None + merge_key: TTableHintTemplate[TColumnNames] = None, + schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, ) -> TTableSchemaTemplate: if not table_name: raise TableNameMissing() @@ -194,8 +196,7 @@ def new_table_template( column["name"] = name column_list.append(column) columns = column_list # type: ignore - - new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns) # type: ignore + new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_evolution_settings=schema_evolution_settings) # type: ignore if primary_key: new_template["primary_key"] = primary_key if merge_key: diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index dfba9b16b3..19a88f1639 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -4,13 +4,11 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType from dlt.common.storages import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration -from dlt.common.schema.typing import TSchemaUpdateMode @configspec class NormalizeConfiguration(PoolRunnerConfiguration): pool_type: TPoolType = "process" destination_capabilities: DestinationCapabilitiesContext = None # injectable - schema_update_mode: TSchemaUpdateMode = "evolve" 
_schema_storage_config: SchemaStorageConfiguration _normalize_storage_config: NormalizeStorageConfiguration _load_storage_config: LoadStorageConfiguration diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 18ea8965c0..27a9f46123 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -67,7 +67,6 @@ def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Sc @staticmethod def w_normalize_files( - normalize_config: NormalizeConfiguration, normalize_storage_config: NormalizeStorageConfiguration, loader_storage_config: LoadStorageConfiguration, destination_caps: DestinationCapabilitiesContext, @@ -98,7 +97,7 @@ def w_normalize_files( items_count = 0 for line_no, line in enumerate(f): items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = Normalize._w_normalize_chunk(normalize_config, load_storage, schema, load_id, root_table_name, items) + partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items) schema_updates.append(partial_update) total_items += items_count merge_row_count(row_counts, r_counts) @@ -127,7 +126,7 @@ def w_normalize_files( return schema_updates, total_items, load_storage.closed_files(), row_counts @staticmethod - def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas: Dict[str, TTableSchemaColumns] = {} # quick access to column schema for writers below schema_update: TSchemaUpdate = {} schema_name = schema.name @@ -146,9 +145,9 @@ def _w_normalize_chunk(config: NormalizeConfiguration, load_storage: LoadStorage row[k] = custom_pua_decode(v) # type: ignore # coerce row of values into schema table, generating partial table with new columns if any row, partial_table = schema.coerce_row(table_name, parent_table, row) - # check update - row, partial_table = schema.check_schema_update(table_name, row, partial_table, config.schema_update_mode) - + # if we detect a migration, the check update + if partial_table: + row, partial_table = schema.check_schema_update(parent_table, table_name, row, partial_table) if not row: continue @@ -203,7 +202,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM workers = self.pool._processes # type: ignore chunk_files = self.group_worker_files(files, workers) schema_dict: TStoredSchema = schema.to_dict() - config_tuple = (self.config, self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict) + config_tuple = (self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict) param_chunk = [[*config_tuple, load_id, files] for files in chunk_files] tasks: List[Tuple[AsyncResult[TWorkerRV], List[Any]]] = [] row_counts: TRowCount = {} @@ -256,7 +255,6 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV: result = Normalize.w_normalize_files( - self.config, self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 
f2d25b9ad8..86f2914e9e 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -1,5 +1,6 @@ import dlt, os, pytest from dlt.common.utils import uniq_id +import duckdb from tests.load.pipeline.utils import load_table_counts from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @@ -9,16 +10,14 @@ SCHEMA_UPDATE_MODES = ["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name) @pytest.mark.parametrize("update_mode", SCHEMA_UPDATE_MODES) -def test_freeze_schema(update_mode: str, destination_config: DestinationTestConfiguration) -> None: +def test_freeze_schema(update_mode: str) -> None: # freeze pipeline, drop additional values # this will allow for the first run to create the schema, but will not accept further updates after that - os.environ['NORMALIZE__SCHEMA_UPDATE_MODE'] = update_mode - pipeline = destination_config.setup_pipeline("test_freeze_schema_2", dataset_name="freeze" + uniq_id()) + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - @dlt.resource(name="items", write_disposition="append") + @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=update_mode) def load_items(): global offset for _, index in enumerate(range(0, 10), 1): @@ -27,7 +26,7 @@ def load_items(): "name": f"item {index}" } - @dlt.resource(name="items", write_disposition="append") + @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=update_mode) def load_items_with_subitems(): global offset for _, index in enumerate(range(0, 10), 1): @@ -44,7 +43,7 @@ def load_items_with_subitems(): }] } - pipeline.run([load_items], loader_file_format=destination_config.file_format) + pipeline.run([load_items]) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) # check data assert table_counts["items"] == 10 @@ -53,10 +52,10 @@ def load_items_with_subitems(): # on freeze and raise we expect an exception if update_mode == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + pipeline.run([load_items_with_subitems]) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run([load_items_with_subitems], loader_file_format=destination_config.file_format) + pipeline.run([load_items_with_subitems]) # check data table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) From b36a74fec802087475c87858d66fe37e3664ae81 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 4 Sep 2023 17:29:26 +0200 Subject: [PATCH 08/73] add tests --- dlt/common/schema/schema.py | 57 +++++--- dlt/common/validation.py | 5 +- dlt/pipeline/pipeline.py | 1 + tests/load/test_freeze_schema.py | 235 ++++++++++++++++++++++++++----- 4 files changed, 244 insertions(+), 54 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 498db26fc2..d0749bf2a8 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -18,7 +18,11 @@ from dlt.common.schema.exceptions import SchemaFrozenException - +DEFAULT_SCHEMA_EVOLUTION_MODES: TSchemaEvolutionModes = { + "table": "evolve", + "column": "evolve", + "column_variant": "evolve" +} class Schema: ENGINE_VERSION: ClassVar[int] 
= SCHEMA_ENGINE_VERSION @@ -181,6 +185,8 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" + assert partial_table + # for now we defined the schema as new if there are no data columns defined has_columns = self.has_data_columns if not has_columns: @@ -188,27 +194,42 @@ def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAn # resolve evolution settings table_with_settings = parent_table or table_name - evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", "evolve") + evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", DEFAULT_SCHEMA_EVOLUTION_MODES) if isinstance(evolution_settings, str): - evolution_settings = TSchemaEvolutionModes(table=evolution_settings, column=evolution_settings, column_variant=evolution_settings) + evolution_modes = TSchemaEvolutionModes(table=evolution_settings, column=evolution_settings, column_variant=evolution_settings) + else: + evolution_modes = evolution_settings + evolution_modes = {**DEFAULT_SCHEMA_EVOLUTION_MODES, **evolution_modes} # type: ignore - # if there is a schema update and we froze schema and filter additional data, clean up - if evolution_settings["table"] == "freeze-and-trim": - if table_name not in self.tables or not len(self.tables[table_name].get("columns", {})): - return None, None - # pop unknown values - for item in list(row.keys()): - if item not in self.tables[table_name]["columns"]: - row.pop(item) - return row, None + # default settings allow all evolutions + if evolution_modes == DEFAULT_SCHEMA_EVOLUTION_MODES: + return row, partial_table - # if there is a schema update and we froze schema and discard additional rows, do nothing - elif evolution_settings["table"] == "freeze-and-discard": - return None, None + table_exists = table_name in self.tables and len(self.tables[table_name].get("columns", {})) - # if there is a schema update and we disallow any data not fitting the schema, raise! - elif evolution_settings["table"] == "freeze-and-raise": - raise SchemaFrozenException(f"Trying to modify table {table_name} but schema is frozen.") + # check case where we have a new table + if not table_exists: + if evolution_modes == "freeze-and-trim": + return None, None + if evolution_modes["table"] in ["freeze-and-discard", "freeze-and-trim"]: + return None, None + if evolution_modes["table"] == "freeze-and-raise": + raise SchemaFrozenException(f"Trying to add table {table_name} but new tables are frozen.") + + # check columns + for item in list(row.keys()): + for item in list(row.keys()): + # if this is a new column for an existing table... 
+ if table_exists and item not in self.tables[table_name]["columns"]: + print("in_here " + item) + is_variant = item in partial_table["columns"] and partial_table["columns"][item].get("variant") + if evolution_modes["column"] == "freeze-and-trim" or (is_variant and evolution_modes["column_variant"] == "freeze-and-trim"): + row.pop(item) + partial_table["columns"].pop(item) + if evolution_modes["column"] == "freeze-and-discard" or (is_variant and evolution_modes["column_variant"] == "freeze-and-discard"): + return None, None + if evolution_modes["column"] == "freeze-and-raise" or (is_variant and evolution_modes["column_variant"] == "freeze-and-raise"): + raise SchemaFrozenException(f"Trying to add column {item} to table {table_name}  but columns are frozen.") return row, partial_table diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 36c7f8cac7..6e6ac62e39 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -55,7 +55,10 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: return t = extract_optional_type(t) - if is_literal_type(t): + # TODO: support for union types? + if pk == "schema_evolution_settings": + pass + elif is_literal_type(t): a_l = get_args(t) if pv not in a_l: raise DictValidationException(f"In {path}: field {pk} value {pv} not in allowed {a_l}", path, pk, pv) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 6cdc4ef309..93069aaf4a 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -289,6 +289,7 @@ def extract( storage.commit_extract_files(extract_id) return ExtractInfo(describe_extract_data(data)) except Exception as exc: + raise exc # TODO: provide metrics from extractor raise PipelineStepFailed(self, "extract", exc, ExtractInfo(describe_extract_data(data))) from exc diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 86f2914e9e..7f7ea2168d 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -1,6 +1,8 @@ import dlt, os, pytest +from dlt.common.schema.typing import TSchemaEvolutionSettings from dlt.common.utils import uniq_id import duckdb +from typing import Any from tests.load.pipeline.utils import load_table_counts from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @@ -8,68 +10,231 @@ from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema import utils -SCHEMA_UPDATE_MODES = ["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] +SCHEMA_EVOLUTION_SETTINGS = ["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] -@pytest.mark.parametrize("update_mode", SCHEMA_UPDATE_MODES) -def test_freeze_schema(update_mode: str) -> None: +def items(settings: TSchemaEvolutionSettings) -> Any: - # freeze pipeline, drop additional values - # this will allow for the first run to create the schema, but will not accept further updates after that - pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - - @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=update_mode) + @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) def load_items(): global offset for _, index in enumerate(range(0, 10), 1): yield { "id": index, + "some_int": 1, "name": f"item {index}" } - @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=update_mode) - def load_items_with_subitems(): + return load_items + +def 
items_with_variant(settings: TSchemaEvolutionSettings) -> Any: + + @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) + def load_items(): + global offset + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "name": f"item {index}", + "some_int": "hello" + } + + return load_items + +def items_with_new_column(settings: TSchemaEvolutionSettings) -> Any: + + @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) + def load_items(): + global offset + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "name": f"item {index}", + "new_col": "hello" + } + + return load_items + + +def items_with_subtable(settings: TSchemaEvolutionSettings) -> Any: + + @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) + def load_items(): global offset for _, index in enumerate(range(0, 10), 1): yield { "id": index, "name": f"item {index}", - "new_attribute": "hello", "sub_items": [{ "id": index + 1000, "name": f"sub item {index + 1000}" - },{ - "id": index + 2000, - "name": f"sub item {index + 2000}" }] } - pipeline.run([load_items]) + return load_items + +def new_items(settings: TSchemaEvolutionSettings) -> Any: + + @dlt.resource(name="new_items", write_disposition="append", schema_evolution_settings=settings) + def load_items(): + global offset + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "some_int": 1, + "name": f"item {index}" + } + + return load_items + +OLD_COLUMN_NAME = "name" +NEW_COLUMN_NAME = "new_col" +VARIANT_COLUMN_NAME = "some_int__v_text" +SUBITEMS_TABLE = "items__sub_items" +NEW_ITEMS_TABLE = "new_items" + +@pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) +def test_freeze_new_tables(evolution_setting: str) -> None: + + full_settings = { + "table": evolution_setting + } + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) + pipeline.run([items(full_settings)]) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - # check data assert table_counts["items"] == 10 - schema_hash = utils.generate_version_hash(pipeline.default_schema.to_dict()) + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # on freeze and raise we expect an exception - if update_mode == "freeze-and-raise": + pipeline.run([items_with_new_column(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + pipeline.run([items_with_variant(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # test adding new subtable + if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([load_items_with_subitems]) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) + pipeline.run([items_with_subtable(full_settings)]) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run([load_items_with_subitems]) + pipeline.run([items_with_subtable(full_settings)]) - # check data table_counts = load_table_counts(pipeline, *[t["name"] 
for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 20 if update_mode not in ["freeze-and-raise", "freeze-and-discard"] else 10 - - # frozen schemas should not have changed - if update_mode != "evolve": - assert schema_hash == utils.generate_version_hash(pipeline.default_schema.to_dict()) - assert "items__sub_items" not in table_counts - # schema was not migrated to contain new attribute - assert "new_attribute" not in pipeline.default_schema.tables["items"]["columns"] - # regular mode evolves the schema + assert table_counts["items"] == 30 if evolution_setting in ["freeze-and-raise"] else 40 + assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if evolution_setting in ["evolve"] else 0) + + # test adding new table + if evolution_setting == "freeze-and-raise": + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([new_items(full_settings)]) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) + else: + pipeline.run([new_items(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts.get("new_items", 0) == (10 if evolution_setting in ["evolve"] else 0) + + +@pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) +def test_freeze_new_columns(evolution_setting: str) -> None: + + full_settings = { + "column": evolution_setting + } + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + pipeline.run([items(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # subtable should work + pipeline.run([items_with_subtable(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[SUBITEMS_TABLE] == 10 + + # new should work + pipeline.run([new_items(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[NEW_ITEMS_TABLE] == 10 + + # test adding new column + if evolution_setting == "freeze-and-raise": + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([items_with_new_column(full_settings)]) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) + else: + pipeline.run([items_with_new_column(full_settings)]) + + if evolution_setting == "evolve": + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + else: + assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == (30 if evolution_setting in ["evolve", "freeze-and-trim"] else 20) + + + # test adding variant column + if evolution_setting == "freeze-and-raise": + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([items_with_variant(full_settings)]) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - assert table_counts["items__sub_items"] == 20 - # schema was not migrated to contain new attribute - assert "new_attribute" in pipeline.default_schema.tables["items"]["columns"] + pipeline.run([items_with_variant(full_settings)]) + + if evolution_setting == 
"evolve": + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + else: + assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == (40 if evolution_setting in ["evolve", "freeze-and-trim"] else 20) + + +@pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) +def test_freeze_variants(evolution_setting: str) -> None: + + full_settings = { + "column_variant": evolution_setting + } + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + pipeline.run([items(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # subtable should work + pipeline.run([items_with_subtable(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[SUBITEMS_TABLE] == 10 + + # new should work + pipeline.run([new_items(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[NEW_ITEMS_TABLE] == 10 + + # test adding new column + pipeline.run([items_with_new_column(full_settings)]) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # test adding variant column + if evolution_setting == "freeze-and-raise": + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([items_with_variant(full_settings)]) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) + else: + pipeline.run([items_with_variant(full_settings)]) + + if evolution_setting == "evolve": + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + else: + assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == (40 if evolution_setting in ["evolve", "freeze-and-trim"] else 30) From 56598273f4243af12388e8612511713a620b18c9 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 4 Sep 2023 17:37:11 +0200 Subject: [PATCH 09/73] small tests change --- dlt/pipeline/pipeline.py | 1 - tests/load/test_freeze_schema.py | 14 +++++++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 93069aaf4a..6cdc4ef309 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -289,7 +289,6 @@ def extract( storage.commit_extract_files(extract_id) return ExtractInfo(describe_extract_data(data)) except Exception as exc: - raise exc # TODO: provide metrics from extractor raise PipelineStepFailed(self, "extract", exc, ExtractInfo(describe_extract_data(data))) from exc diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 7f7ea2168d..d82e2defe9 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -16,7 +16,6 @@ def items(settings: TSchemaEvolutionSettings) -> Any: 
@dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) def load_items(): - global offset for _, index in enumerate(range(0, 10), 1): yield { "id": index, @@ -30,7 +29,6 @@ def items_with_variant(settings: TSchemaEvolutionSettings) -> Any: @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) def load_items(): - global offset for _, index in enumerate(range(0, 10), 1): yield { "id": index, @@ -44,7 +42,6 @@ def items_with_new_column(settings: TSchemaEvolutionSettings) -> Any: @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) def load_items(): - global offset for _, index in enumerate(range(0, 10), 1): yield { "id": index, @@ -59,7 +56,6 @@ def items_with_subtable(settings: TSchemaEvolutionSettings) -> Any: @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) def load_items(): - global offset for _, index in enumerate(range(0, 10), 1): yield { "id": index, @@ -76,7 +72,6 @@ def new_items(settings: TSchemaEvolutionSettings) -> Any: @dlt.resource(name="new_items", write_disposition="append", schema_evolution_settings=settings) def load_items(): - global offset for _, index in enumerate(range(0, 10), 1): yield { "id": index, @@ -103,6 +98,9 @@ def test_freeze_new_tables(evolution_setting: str) -> None: table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { + "table": evolution_setting + } pipeline.run([items_with_new_column(full_settings)]) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) @@ -148,6 +146,9 @@ def test_freeze_new_columns(evolution_setting: str) -> None: table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { + "column": evolution_setting + } # subtable should work pipeline.run([items_with_subtable(full_settings)]) @@ -204,6 +205,9 @@ def test_freeze_variants(evolution_setting: str) -> None: table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { + "column_variant": evolution_setting + } # subtable should work pipeline.run([items_with_subtable(full_settings)]) From 894875f3577a491367023b7ecbb6b8ae2060a400 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 5 Sep 2023 10:49:56 +0200 Subject: [PATCH 10/73] small fix --- dlt/common/schema/schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 879b8161e9..b213088305 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -196,7 +196,7 @@ def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAn # resolve evolution settings table_with_settings = parent_table or table_name - evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", 
DEFAULT_SCHEMA_EVOLUTION_MODES) + evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", {}) or DEFAULT_SCHEMA_EVOLUTION_MODES if isinstance(evolution_settings, str): evolution_modes = TSchemaEvolutionModes(table=evolution_settings, column=evolution_settings, column_variant=evolution_settings) else: From 8bccfe50a3d02d5d364c406b5c97816dbb074853 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 5 Sep 2023 13:31:28 +0200 Subject: [PATCH 11/73] fix some tests --- dlt/common/schema/schema.py | 36 +- dlt/common/schema/typing.py | 2 +- dlt/common/schema/utils.py | 9 + dlt/extract/decorators.py | 9 + dlt/pipeline/__init__.py | 6 +- .../cases/schemas/eth/ethereum_schema_v7.yml | 1104 +++++++++++++++++ tests/common/schema/test_schema.py | 8 +- tests/common/schema/test_versioning.py | 8 +- tests/common/storages/test_schema_storage.py | 10 +- tests/common/test_validation.py | 2 +- tests/common/utils.py | 2 +- tests/extract/test_decorators.py | 4 +- tests/load/pipeline/test_restore_state.py | 4 +- tests/load/test_freeze_schema.py | 34 +- tests/pipeline/test_dlt_versions.py | 6 +- 15 files changed, 1198 insertions(+), 46 deletions(-) create mode 100644 tests/common/cases/schemas/eth/ethereum_schema_v7.yml diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index b213088305..0c8543064e 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -11,7 +11,7 @@ from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaEvolutionModes, TColumnSchemaBase) + TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaEvolutionModes, TSchemaEvolutionSettings) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict @@ -184,6 +184,26 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial + def resolve_evolution_settings_for_table(self, parent_table: str, table_name: str) -> TSchemaEvolutionModes: + # find table settings + table_with_settings = parent_table or table_name + table_evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", {}) or {} + if isinstance(table_evolution_settings, str): + table_evolution_modes = TSchemaEvolutionModes(table=table_evolution_settings, column=table_evolution_settings, column_variant=table_evolution_settings) + else: + table_evolution_modes = table_evolution_settings + + # find schema settings + schema_evolution_settings = self._settings.get("schema_evolution_settings", {}) or {} + if isinstance(schema_evolution_settings, str): + schema_evolution_modes = TSchemaEvolutionModes(table=schema_evolution_settings, column=schema_evolution_settings, column_variant=schema_evolution_settings) + else: + schema_evolution_modes = schema_evolution_settings + + # resolve to correct settings dict + return {**DEFAULT_SCHEMA_EVOLUTION_MODES, **schema_evolution_modes, **table_evolution_modes} # type: ignore + + def check_schema_update(self, 
parent_table: str, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" @@ -194,16 +214,9 @@ def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAn if not has_columns: return row, partial_table - # resolve evolution settings - table_with_settings = parent_table or table_name - evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", {}) or DEFAULT_SCHEMA_EVOLUTION_MODES - if isinstance(evolution_settings, str): - evolution_modes = TSchemaEvolutionModes(table=evolution_settings, column=evolution_settings, column_variant=evolution_settings) - else: - evolution_modes = evolution_settings - evolution_modes = {**DEFAULT_SCHEMA_EVOLUTION_MODES, **evolution_modes} # type: ignore + evolution_modes = self.resolve_evolution_settings_for_table(parent_table, table_name) - # default settings allow all evolutions + # default settings allow all evolutions, skipp all else if evolution_modes == DEFAULT_SCHEMA_EVOLUTION_MODES: return row, partial_table @@ -415,6 +428,9 @@ def update_normalizers(self) -> None: normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) + def set_schema_evolution_settings(self, settings: TSchemaEvolutionSettings) -> None: + self._settings["schema_evolution_settings"] = settings + def _infer_column(self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False) -> TColumnSchema: column_schema = TColumnSchema( name=k, diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index fc904becf0..65ec6075a0 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -11,7 +11,7 @@ # current version of schema engine -SCHEMA_ENGINE_VERSION = 6 +SCHEMA_ENGINE_VERSION = 7 # dlt tables VERSION_TABLE_NAME = "_dlt_version" diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index f892a21354..21c0142453 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -283,6 +283,15 @@ def migrate_filters(group: str, filters: List[str]) -> None: # replace loads table schema_dict["tables"][LOADS_TABLE_NAME] = load_table() from_engine = 6 + if from_engine == 6 and to_engine > 6: + # migrate from sealed properties to schema evolution settings + schema_dict["settings"].pop("schema_sealed", None) + schema_dict["settings"]["schema_evolution_settings"] = "evolve" + for table in schema_dict["tables"].values(): + table.pop("table_sealed", None) + table["schema_evolution_settings"] = {} + schema_dict["tables"][LOADS_TABLE_NAME] = load_table() + from_engine = 7 schema_dict["engine_version"] = from_engine if from_engine != to_engine: diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index c5515f2fda..37c3e229ad 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -52,6 +52,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_evolution_settings: TSchemaEvolutionSettings = None, spec: Type[BaseConfiguration] = None ) -> Callable[TSourceFunParams, DltSource]: ... 
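The resolve_evolution_settings_for_table method added in the hunk above layers table-level settings over schema-level settings over the built-in defaults, and accepts either a single mode string or a per-entity dict. A self-contained sketch of that precedence logic, assuming only the mode names used in this series; resolve_modes and _as_modes are illustrative stand-ins, not part of dlt:

from typing import Dict, Optional, Union

TModes = Dict[str, str]  # keys: "table", "column", "column_variant"
DEFAULT_MODES: TModes = {"table": "evolve", "column": "evolve", "column_variant": "evolve"}

def _as_modes(settings: Union[str, TModes, None]) -> TModes:
    # a single string applies the same mode to new tables, new columns and variant columns
    if not settings:
        return {}
    if isinstance(settings, str):
        return {"table": settings, "column": settings, "column_variant": settings}
    return dict(settings)

def resolve_modes(schema_settings: Union[str, TModes, None],
                  table_settings: Union[str, TModes, None]) -> TModes:
    # table (resource) settings win over schema (source) settings, which win over the defaults
    return {**DEFAULT_MODES, **_as_modes(schema_settings), **_as_modes(table_settings)}

assert resolve_modes(None, None) == DEFAULT_MODES
assert resolve_modes("freeze-and-raise", None)["column"] == "freeze-and-raise"
assert resolve_modes("freeze-and-raise", {"table": "evolve"})["table"] == "evolve"

When every resolved mode is "evolve", check_schema_update returns the row and partial table untouched, which is why the default path stays on the fast lane in the diff above.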
@@ -65,6 +66,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_evolution_settings: TSchemaEvolutionSettings = None, spec: Type[BaseConfiguration] = None ) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, DltSource]]: ... @@ -77,6 +79,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_evolution_settings: TSchemaEvolutionSettings = None, spec: Type[BaseConfiguration] = None ) -> Any: """A decorator that transforms a function returning one or more `dlt resources` into a `dlt source` in order to load it with `dlt`. @@ -151,6 +154,8 @@ def decorator(f: Callable[TSourceFunParams, Any]) -> Callable[TSourceFunParams, @wraps(conf_f) def _wrap(*args: Any, **kwargs: Any) -> DltSource: + nonlocal schema + # make schema available to the source with Container().injectable_context(SourceSchemaInjectableContext(schema)): # configurations will be accessed in this section in the source @@ -164,6 +169,10 @@ def _wrap(*args: Any, **kwargs: Any) -> DltSource: if inspect.isgenerator(rv): rv = list(rv) + # prepare schema + schema = schema.clone(update_normalizers=True) + schema.set_schema_evolution_settings(schema_evolution_settings) + # convert to source s = DltSource.from_data(name, source_section, schema.clone(update_normalizers=True), rv) # apply hints diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index df4314cf0d..99bd53968a 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -175,7 +175,8 @@ def run( table_name: str = None, write_disposition: TWriteDisposition = None, columns: Sequence[TColumnSchema] = None, - schema: Schema = None + schema: Schema = None, + loader_file_format: TLoaderFileFormat = None ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. 
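Taken together, the decorator and pipeline changes in this patch let evolution modes be declared where the data is defined. A minimal usage sketch of the API as it stands at this point in the series, assuming the keyword names used in the tests above and an in-memory duckdb destination as in those tests; exact names may still change in later patches:

import dlt
import duckdb

# a single mode string covers new tables, new columns and variant columns alike;
# a dict such as {"table": "freeze-and-discard"} sets each aspect separately
@dlt.resource(name="items", write_disposition="append", schema_evolution_settings="freeze-and-raise")
def items():
    for index in range(10):
        yield {"id": index, "name": f"item {index}"}

@dlt.resource(name="items", write_disposition="append", schema_evolution_settings="freeze-and-raise")
def items_with_new_column():
    for index in range(10):
        yield {"id": index, "name": f"item {index}", "new_col": "hello"}

pipeline = dlt.pipeline(pipeline_name="freeze_demo", destination="duckdb",
                        credentials=duckdb.connect(":memory:"))
pipeline.run([items])                  # first run creates the schema
pipeline.run([items_with_new_column])  # expected to fail: SchemaFrozenException wrapped in PipelineStepFailed

The same argument on dlt.source acts as a fallback for every resource in the source; per the resolution order introduced above, a resource-level (table) setting overrides the source-level (schema) setting, which in turn overrides the permissive defaults.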
@@ -235,7 +236,8 @@ def run( table_name=table_name, write_disposition=write_disposition, columns=columns, - schema=schema + schema=schema, + loader_file_format=loader_file_format ) # plug default tracking module diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml new file mode 100644 index 0000000000..02212de8d8 --- /dev/null +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -0,0 +1,1104 @@ +version: 14 +version_hash: MuiE8s34Wub6EKLDCIIygZ3nfwXgIwtNDiyji7A6pwY= +engine_version: 7 +name: ethereum +tables: + _dlt_loads: + name: _dlt_loads + columns: + load_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: load_id + data_type: text + schema_name: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: schema_name + data_type: text + status: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: status + data_type: bigint + inserted_at: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: inserted_at + data_type: timestamp + schema_version_hash: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: schema_version_hash + data_type: text + write_disposition: skip + resource: _dlt_loads + schema_evolution_settings: null + description: Created by DLT. Tracks completed loads + _dlt_version: + columns: + version: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: version + engine_version: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: engine_version + inserted_at: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: timestamp + name: inserted_at + schema_name: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: schema_name + version_hash: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: version_hash + schema: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: schema + write_disposition: skip + description: Created by DLT. 
Tracks schema updates + schema_evolution_settings: {} + name: _dlt_version + resource: _dlt_version + blocks: + description: Ethereum blocks + x-annotation: this will be preserved on save + write_disposition: append + filters: + includes: [] + excludes: [] + columns: + _dlt_load_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + description: load id coming from the extractor + data_type: text + name: _dlt_load_id + _dlt_id: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: _dlt_id + number: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: true + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: number + parent_hash: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: parent_hash + hash: + nullable: false + partition: false + cluster: true + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: hash + base_fee_per_gas: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: base_fee_per_gas + difficulty: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: difficulty + extra_data: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: extra_data + gas_limit: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: gas_limit + gas_used: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: gas_used + logs_bloom: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: binary + name: logs_bloom + miner: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: miner + mix_hash: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: mix_hash + nonce: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: nonce + receipts_root: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: receipts_root + sha3_uncles: + nullable: true + partition: false + cluster: 
false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: sha3_uncles + size: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: size + state_root: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: state_root + timestamp: + nullable: false + partition: false + cluster: false + unique: true + sort: true + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: timestamp + name: timestamp + total_difficulty: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: total_difficulty + transactions_root: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: transactions_root + schema_evolution_settings: {} + name: blocks + resource: blocks + blocks__transactions: + parent: blocks + columns: + _dlt_id: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: _dlt_id + block_number: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: true + foreign_key: true + root_key: false + merge_key: false + data_type: bigint + name: block_number + transaction_index: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: true + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: transaction_index + hash: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: hash + block_hash: + nullable: false + partition: false + cluster: true + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: block_hash + block_timestamp: + nullable: false + partition: false + cluster: false + unique: false + sort: true + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: timestamp + name: block_timestamp + chain_id: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: chain_id + from: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: from + gas: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: gas + gas_price: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: gas_price + input: + nullable: true + partition: false + cluster: false + unique: false + 
sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: input + max_fee_per_gas: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: max_fee_per_gas + max_priority_fee_per_gas: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: max_priority_fee_per_gas + nonce: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: nonce + r: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: r + s: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: s + status: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: status + to: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: to + type: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: type + v: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: v + value: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: value + eth_value: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: decimal + name: eth_value + schema_evolution_settings: {} + name: blocks__transactions + blocks__transactions__logs: + parent: blocks__transactions + columns: + _dlt_id: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: _dlt_id + address: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: address + block_timestamp: + nullable: false + partition: false + cluster: false + unique: false + sort: true + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: timestamp + name: block_timestamp + block_hash: + nullable: false + partition: false + cluster: true + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: block_hash + block_number: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: true + foreign_key: true + root_key: false + 
merge_key: false + data_type: bigint + name: block_number + transaction_index: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: true + foreign_key: true + root_key: false + merge_key: false + data_type: bigint + name: transaction_index + log_index: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: true + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: log_index + data: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: data + removed: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bool + name: removed + transaction_hash: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: transaction_hash + schema_evolution_settings: {} + name: blocks__transactions__logs + blocks__transactions__logs__topics: + parent: blocks__transactions__logs + columns: + _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: true + root_key: false + merge_key: false + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: true + merge_key: false + data_type: text + name: _dlt_root_id + value: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: value + schema_evolution_settings: {} + name: blocks__transactions__logs__topics + blocks__transactions__access_list: + parent: blocks__transactions + columns: + _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: true + root_key: false + merge_key: false + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: true + merge_key: false + data_type: text + name: _dlt_root_id + address: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + 
data_type: text + name: address + schema_evolution_settings: {} + name: blocks__transactions__access_list + blocks__transactions__access_list__storage_keys: + parent: blocks__transactions__access_list + columns: + _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: true + root_key: false + merge_key: false + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: true + merge_key: false + data_type: text + name: _dlt_root_id + value: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: value + schema_evolution_settings: {} + name: blocks__transactions__access_list__storage_keys + blocks__uncles: + parent: blocks + columns: + _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: true + root_key: false + merge_key: false + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + partition: false + cluster: false + unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: true + merge_key: false + data_type: text + name: _dlt_root_id + value: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: value + schema_evolution_settings: {} + name: blocks__uncles +settings: + default_hints: + foreign_key: + - _dlt_parent_id + not_null: + - re:^_dlt_id$ + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + unique: + - _dlt_id + cluster: + - block_hash + partition: + - block_timestamp + root_key: + - _dlt_root_id + preferred_types: + timestamp: timestamp + block_timestamp: timestamp + schema_evolution_settings: evolve +normalizers: + names: dlt.common.normalizers.names.snake_case + json: + module: dlt.common.normalizers.json.relational + config: + generate_dlt_id: true + propagation: + root: + _dlt_id: _dlt_root_id + tables: + blocks: + timestamp: block_timestamp + hash: block_hash + diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 5d16b3f57f..2bcfdba3b9 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -132,7 +132,7 @@ def test_simple_regex_validator() -> None: def test_load_corrupted_schema() -> None: - eth_v4: TStoredSchema = 
load_yml_case("schemas/eth/ethereum_schema_v4") + eth_v4: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") del eth_v4["tables"]["blocks"] with pytest.raises(ParentTableNotFoundException): utils.validate_stored_schema(eth_v4) @@ -526,7 +526,7 @@ def assert_new_schema_values(schema: Schema) -> None: assert schema.stored_version == 1 assert schema.stored_version_hash is not None assert schema.version_hash is not None - assert schema.ENGINE_VERSION == 6 + assert schema.ENGINE_VERSION == 7 assert len(schema.settings["default_hints"]) > 0 # check settings assert utils.standard_type_detections() == schema.settings["detections"] == schema._type_detections @@ -585,8 +585,8 @@ def test_group_tables_by_resource(schema: Schema) -> None: result = utils.group_tables_by_resource(schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("products"))) # both tables with resource "products" must be here assert result == {'products': [ - {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append'}, - {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append'}, + {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append', 'schema_evolution_settings': None}, + {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append', 'schema_evolution_settings': None}, {'columns': {}, 'name': 'mc_products__sub', 'parent': 'mc_products'} ] } diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index b535634ef4..f1d75028ac 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -83,10 +83,10 @@ def test_infer_column_bumps_version() -> None: def test_preserve_version_on_load() -> None: - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v6") - version = eth_v6["version"] - version_hash = eth_v6["version_hash"] - schema = Schema.from_dict(eth_v6) + eth_v7: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") + version = eth_v7["version"] + version_hash = eth_v7["version_hash"] + schema = Schema.from_dict(eth_v7) # version should not be bumped assert version_hash == schema._stored_version_hash assert version_hash == schema.version_hash diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 078af856cb..985d1b4392 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -11,7 +11,7 @@ from dlt.common.storages import SchemaStorageConfiguration, SchemaStorage, LiveSchemaStorage, FileStorage from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT -from tests.common.utils import load_yml_case, yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V6 +from tests.common.utils import load_yml_case, yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V7 @pytest.fixture @@ -194,10 +194,10 @@ def test_save_store_schema_over_import(ie_storage: SchemaStorage) -> None: ie_storage.save_schema(schema) assert schema.version_hash == schema_hash # we linked schema to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 # load schema and make sure our new schema is here schema = ie_storage.load_schema("ethereum") - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == 
IMPORTED_VERSION_HASH_ETH_V7 assert schema._stored_version_hash == schema_hash assert schema.version_hash == schema_hash # we have simple schema in export folder @@ -213,7 +213,7 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No schema = Schema("ethereum") schema_hash = schema.version_hash synced_storage.save_schema(schema) - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 # import schema is overwritten fs = FileStorage(synced_storage.config.import_schema_path) exported_name = synced_storage._file_name_in_store("ethereum", "yaml") @@ -274,7 +274,7 @@ def prepare_import_folder(storage: SchemaStorage) -> None: def assert_schema_imported(synced_storage: SchemaStorage, storage: SchemaStorage) -> Schema: prepare_import_folder(synced_storage) - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v6") + eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") schema = synced_storage.load_schema("ethereum") # is linked to imported schema schema._imported_version_hash = eth_v6["version_hash"] diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index d4885ccd67..d278c1029c 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -83,7 +83,7 @@ def test_doc() -> TTestRecord: def test_validate_schema_cases() -> None: - with open("tests/common/cases/schemas/eth/ethereum_schema_v4.yml", mode="r", encoding="utf-8") as f: + with open("tests/common/cases/schemas/eth/ethereum_schema_v7.yml", mode="r", encoding="utf-8") as f: schema_dict: TStoredSchema = yaml.safe_load(f) validate_dict_ignoring_xkeys( diff --git a/tests/common/utils.py b/tests/common/utils.py index 7a49a80efb..5dce971b69 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -15,7 +15,7 @@ COMMON_TEST_CASES_PATH = "./tests/common/cases/" # for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V6 = "++bJOVuScYYoVUFtjmZMBV+cxsWs8irYHIMV8J1xD5g=" +IMPORTED_VERSION_HASH_ETH_V7 = "MuiE8s34Wub6EKLDCIIygZ3nfwXgIwtNDiyji7A6pwY=" # test sentry DSN TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" # preserve secrets path to be able to restore it diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index c89d378f9a..0b54a84d14 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -21,7 +21,7 @@ from dlt.extract.source import DltResource, DltSource from dlt.common.schema.exceptions import InvalidSchemaName -from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V6 +from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7 def test_none_returning_source() -> None: @@ -68,7 +68,7 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == s.name # the schema in the associated file has this hash - assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V7 def test_unbound_parametrized_transformer() -> None: diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index 6e377cc59f..d0e37fdb2b 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -17,7 +17,7 @@ from tests.utils import TEST_STORAGE_ROOT from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED -from 
tests.common.utils import IMPORTED_VERSION_HASH_ETH_V6, yml_case_path as common_yml_case_path +from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7, yml_case_path as common_yml_case_path from tests.common.configuration.utils import environment from tests.load.pipeline.utils import assert_query_data, drop_active_pipeline_data from tests.load.utils import destinations_configs, DestinationTestConfiguration, get_normalized_dataset_name @@ -401,7 +401,7 @@ def test_restore_schemas_while_import_schemas_exist(destination_config: Destinat assert normalized_annotations in schema.tables # check if attached to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 # extract some data with restored pipeline p.run(["C", "D", "E"], table_name="blacklist") assert normalized_labels in schema.tables diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index d82e2defe9..b2767466fb 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -3,6 +3,7 @@ from dlt.common.utils import uniq_id import duckdb from typing import Any +from dlt.extract.source import DltSource, DltResource from tests.load.pipeline.utils import load_table_counts from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @@ -87,27 +88,38 @@ def load_items(): SUBITEMS_TABLE = "items__sub_items" NEW_ITEMS_TABLE = "new_items" + +def wrap_in_source(resource_fun, settings) -> DltSource: + + @dlt.source(name="freeze_tests", schema_evolution_settings=settings) + def source() -> DltResource: + return resource_fun(None) + + return [source()] + + @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) -def test_freeze_new_tables(evolution_setting: str) -> None: +@pytest.mark.parametrize("setting_location", ["resource", "source", "global_override"]) +def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> None: full_settings = { "table": evolution_setting } pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) - pipeline.run([items(full_settings)]) + pipeline.run(wrap_in_source(items, full_settings)) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { - "table": evolution_setting - } + # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { + # "table": evolution_setting + # } - pipeline.run([items_with_new_column(full_settings)]) + pipeline.run(wrap_in_source(items_with_new_column, full_settings)) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - pipeline.run([items_with_variant(full_settings)]) + pipeline.run(wrap_in_source(items_with_variant, full_settings)) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -115,10 +127,10 @@ def test_freeze_new_tables(evolution_setting: str) -> None: # test adding new subtable if evolution_setting == 
"freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([items_with_subtable(full_settings)]) + pipeline.run(wrap_in_source(items_with_subtable, full_settings)) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run([items_with_subtable(full_settings)]) + pipeline.run(wrap_in_source(items_with_subtable, full_settings)) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 if evolution_setting in ["freeze-and-raise"] else 40 @@ -127,10 +139,10 @@ def test_freeze_new_tables(evolution_setting: str) -> None: # test adding new table if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([new_items(full_settings)]) + pipeline.run(wrap_in_source(new_items, full_settings)) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run([new_items(full_settings)]) + pipeline.run(wrap_in_source(new_items, full_settings)) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts.get("new_items", 0) == (10 if evolution_setting in ["evolve"] else 0) diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 8ae9c01026..0a035d1e00 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -54,7 +54,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_pipeline.py")) # hash hash in schema github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) - assert github_schema["engine_version"] == 6 + assert github_schema["engine_version"] == 7 assert "schema_version_hash" in github_schema["tables"][LOADS_TABLE_NAME]["columns"] with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: rows = client.execute_sql(f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at") @@ -80,7 +80,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: pipeline.sync_destination() # print(pipeline.working_dir) # we have updated schema - assert pipeline.default_schema.ENGINE_VERSION == 6 + assert pipeline.default_schema.ENGINE_VERSION == 7 # make sure that schema hash retrieved from the destination is exactly the same as the schema hash that was in storage before the schema was wiped assert pipeline.default_schema.stored_version_hash == github_schema["version_hash"] @@ -112,6 +112,6 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) pipeline = pipeline.drop() pipeline.sync_destination() - assert pipeline.default_schema.ENGINE_VERSION == 6 + assert pipeline.default_schema.ENGINE_VERSION == 7 # schema version does not match `dlt.attach` does not update to the right schema by itself assert pipeline.default_schema.stored_version_hash != github_schema["version_hash"] From cfd3f64e784b999a0979e662fd5626feead6e7a3 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 5 Sep 2023 16:59:57 +0200 Subject: [PATCH 12/73] add global override for schema evolution --- dlt/common/pipeline.py | 5 +++-- dlt/common/schema/schema.py | 30 +++++++++++++++--------------- dlt/extract/extract.py | 2 +- dlt/normalize/normalize.py | 21 ++++++++++++--------- dlt/pipeline/__init__.py | 5 +++-- 
dlt/pipeline/pipeline.py | 11 ++++++----- tests/load/test_freeze_schema.py | 21 +++++++++++---------- 7 files changed, 51 insertions(+), 44 deletions(-) diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 858f2a0957..92be31b720 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -16,7 +16,7 @@ from dlt.common.destination import DestinationReference, TDestinationReferenceArg from dlt.common.exceptions import DestinationHasFailedJobs, PipelineStateNotAvailable, ResourceNameNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaEvolutionSettings from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.typing import DictStrAny, REPattern @@ -209,7 +209,8 @@ def run( columns: Sequence[TColumnSchema] = None, primary_key: TColumnNames = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None + loader_file_format: TLoaderFileFormat = None, + schema_evolution_settings: TSchemaEvolutionSettings = None, ) -> LoadInfo: ... diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 0c8543064e..eefd7db288 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -184,27 +184,27 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def resolve_evolution_settings_for_table(self, parent_table: str, table_name: str) -> TSchemaEvolutionModes: + def resolve_evolution_settings_for_table(self, parent_table: str, table_name: str, schema_evolution_settings_override: TSchemaEvolutionSettings) -> TSchemaEvolutionModes: + + def resolve_single(settings: TSchemaEvolutionSettings) -> TSchemaEvolutionModes: + settings = settings or {} + if isinstance(settings, str): + return TSchemaEvolutionModes(table=settings, column=settings, column_variant=settings) + return settings + # find table settings table_with_settings = parent_table or table_name - table_evolution_settings = self.tables.get(table_with_settings, {}).get("schema_evolution_settings", {}) or {} - if isinstance(table_evolution_settings, str): - table_evolution_modes = TSchemaEvolutionModes(table=table_evolution_settings, column=table_evolution_settings, column_variant=table_evolution_settings) - else: - table_evolution_modes = table_evolution_settings - # find schema settings - schema_evolution_settings = self._settings.get("schema_evolution_settings", {}) or {} - if isinstance(schema_evolution_settings, str): - schema_evolution_modes = TSchemaEvolutionModes(table=schema_evolution_settings, column=schema_evolution_settings, column_variant=schema_evolution_settings) - else: - schema_evolution_modes = schema_evolution_settings + # modes + table_evolution_modes = resolve_single(self.tables.get(table_with_settings, {}).get("schema_evolution_settings", {})) + schema_evolution_modes = resolve_single(self._settings.get("schema_evolution_settings", {})) + overide_modes = resolve_single(schema_evolution_settings_override) # resolve to correct settings dict - return {**DEFAULT_SCHEMA_EVOLUTION_MODES, **schema_evolution_modes, **table_evolution_modes} # type: ignore + return {**DEFAULT_SCHEMA_EVOLUTION_MODES, **schema_evolution_modes, **table_evolution_modes, **overide_modes} # type: ignore - def check_schema_update(self, parent_table: str, 
table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: + def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, schema_evolution_settings_override: TSchemaEvolutionSettings) -> Tuple[DictStrAny, TPartialTableSchema]: """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" assert partial_table @@ -214,7 +214,7 @@ def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAn if not has_columns: return row, partial_table - evolution_modes = self.resolve_evolution_settings_for_table(parent_table, table_name) + evolution_modes = self.resolve_evolution_settings_for_table(parent_table, table_name, schema_evolution_settings_override) # default settings allow all evolutions, skipp all else if evolution_modes == DEFAULT_SCHEMA_EVOLUTION_MODES: diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index d84895b308..a3642726bd 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -176,7 +176,7 @@ def extract_with_schema( schema: Schema, collector: Collector, max_parallel_items: int, - workers: int + workers: int, ) -> str: # generate extract_id to be able to commit all the sources together later extract_id = storage.create_extract_id() diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 27a9f46123..f09b4c8ef6 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -11,7 +11,7 @@ from dlt.common.runners import TRunMetrics, Runnable from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns +from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaEvolutionSettings from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration @@ -34,7 +34,7 @@ class Normalize(Runnable[ProcessPool]): @with_config(spec=NormalizeConfiguration, sections=(known_sections.NORMALIZE,)) - def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: SchemaStorage = None, config: NormalizeConfiguration = config.value) -> None: + def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: SchemaStorage = None, config: NormalizeConfiguration = config.value, schema_evolution_settings: TSchemaEvolutionSettings = None) -> None: self.config = config self.collector = collector self.pool: ProcessPool = None @@ -42,6 +42,7 @@ def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: Schema self.load_storage: LoadStorage = None self.schema_storage: SchemaStorage = None self._row_counts: TRowCount = {} + self.schema_evolution_settings = schema_evolution_settings # setup storages self.create_storages() @@ -73,6 +74,7 @@ def w_normalize_files( stored_schema: TStoredSchema, load_id: str, extracted_items_files: Sequence[str], + schema_evolution_settings: TSchemaEvolutionSettings ) -> TWorkerRV: schema_updates: List[TSchemaUpdate] = [] total_items = 0 @@ -97,7 +99,7 @@ def w_normalize_files( items_count = 0 for line_no, line in enumerate(f): items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items) + 
partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items, schema_evolution_settings) schema_updates.append(partial_update) total_items += items_count merge_row_count(row_counts, r_counts) @@ -126,7 +128,7 @@ def w_normalize_files( return schema_updates, total_items, load_storage.closed_files(), row_counts @staticmethod - def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem], schema_evolution_settings: TSchemaEvolutionSettings) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas: Dict[str, TTableSchemaColumns] = {} # quick access to column schema for writers below schema_update: TSchemaUpdate = {} schema_name = schema.name @@ -147,7 +149,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, row, partial_table = schema.coerce_row(table_name, parent_table, row) # if we detect a migration, the check update if partial_table: - row, partial_table = schema.check_schema_update(parent_table, table_name, row, partial_table) + row, partial_table = schema.check_schema_update(parent_table, table_name, row, partial_table, schema_evolution_settings) if not row: continue @@ -198,12 +200,12 @@ def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[st l_idx = idx + 1 return chunk_files - def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV: + def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str], schema_evolution_settings: TSchemaEvolutionSettings) -> TMapFuncRV: workers = self.pool._processes # type: ignore chunk_files = self.group_worker_files(files, workers) schema_dict: TStoredSchema = schema.to_dict() config_tuple = (self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict) - param_chunk = [[*config_tuple, load_id, files] for files in chunk_files] + param_chunk = [[*config_tuple, load_id, files, schema_evolution_settings] for files in chunk_files] tasks: List[Tuple[AsyncResult[TWorkerRV], List[Any]]] = [] row_counts: TRowCount = {} @@ -253,7 +255,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM return schema_updates, row_counts - def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV: + def map_single(self, schema: Schema, load_id: str, files: Sequence[str], schema_evolution_settings: TSchemaEvolutionSettings) -> TMapFuncRV: result = Normalize.w_normalize_files( self.normalize_storage.config, self.load_storage.config, @@ -261,6 +263,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMap schema.to_dict(), load_id, files, + schema_evolution_settings ) self.update_schema(schema, result[0]) self.collector.update("Files", len(result[2])) @@ -271,7 +274,7 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files schema = Normalize.load_or_create_schema(self.schema_storage, schema_name) # process files in parallel or in single thread, depending on map_f - schema_updates, row_counts = map_f(schema, load_id, files) + schema_updates, row_counts = map_f(schema, load_id, files, self.schema_evolution_settings) # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) if 
len(schema_updates) > 0: logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 99bd53968a..dea7ea1b57 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -1,7 +1,7 @@ from typing import Sequence, cast, overload from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnSchema, TWriteDisposition +from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaEvolutionSettings from dlt.common.typing import TSecretValue, Any from dlt.common.configuration import with_config @@ -176,7 +176,8 @@ def run( write_disposition: TWriteDisposition = None, columns: Sequence[TColumnSchema] = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None + loader_file_format: TLoaderFileFormat = None, + schema_evolution_settings: TSchemaEvolutionSettings = None, ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 488a617eb3..1ae99efef7 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -18,7 +18,7 @@ MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException) from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaEvolutionSettings from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -295,7 +295,7 @@ def extract( @with_runtime_trace @with_schemas_sync @with_config_section((known_sections.NORMALIZE,)) - def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = None) -> NormalizeInfo: + def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = None, schema_evolution_settings: TSchemaEvolutionSettings = None) -> NormalizeInfo: """Normalizes the data prepared with `extract` method, infers the schema and creates load packages for the `load` method. Requires `destination` to be known.""" if is_interactive() and workers > 1: raise NotImplementedError("Do not use normalize workers in interactive mode ie. 
in notebook") @@ -318,7 +318,7 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No # run with destination context with self._maybe_destination_capabilities(loader_file_format=loader_file_format): # shares schema storage with the pipeline so we do not need to install - normalize = Normalize(collector=self.collector, config=normalize_config, schema_storage=self._schema_storage) + normalize = Normalize(collector=self.collector, config=normalize_config, schema_storage=self._schema_storage, schema_evolution_settings=schema_evolution_settings) try: with signals.delayed_signals(): runner.run_pool(normalize.config, normalize) @@ -391,7 +391,8 @@ def run( columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None + loader_file_format: TLoaderFileFormat = None, + schema_evolution_settings: TSchemaEvolutionSettings = None ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -468,7 +469,7 @@ def run( # extract from the source if data is not None: self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema) - self.normalize(loader_file_format=loader_file_format) + self.normalize(loader_file_format=loader_file_format, schema_evolution_settings=schema_evolution_settings) return self.load(destination, dataset_name, credentials=credentials) else: return None diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index b2767466fb..49fee34dd7 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -89,13 +89,13 @@ def load_items(): NEW_ITEMS_TABLE = "new_items" -def wrap_in_source(resource_fun, settings) -> DltSource: +def run_resource(pipeline, resource_fun, settings) -> DltSource: - @dlt.source(name="freeze_tests", schema_evolution_settings=settings) + @dlt.source(name="freeze_tests", schema_evolution_settings=None) def source() -> DltResource: return resource_fun(None) - return [source()] + pipeline.run(source(), schema_evolution_settings=settings) @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) @@ -106,7 +106,7 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non "table": evolution_setting } pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) - pipeline.run(wrap_in_source(items, full_settings)) + run_resource(pipeline, items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -114,12 +114,12 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # "table": evolution_setting # } - pipeline.run(wrap_in_source(items_with_new_column, full_settings)) + run_resource(pipeline, items_with_new_column, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - pipeline.run(wrap_in_source(items_with_variant, full_settings)) + run_resource(pipeline, items_with_variant, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in 
pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -127,10 +127,11 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # test adding new subtable if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run(wrap_in_source(items_with_subtable, full_settings)) + run_resource(pipeline, items_with_subtable, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run(wrap_in_source(items_with_subtable, full_settings)) + run_resource(pipeline, items_with_subtable, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 if evolution_setting in ["freeze-and-raise"] else 40 @@ -139,10 +140,10 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # test adding new table if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run(wrap_in_source(new_items, full_settings)) + run_resource(pipeline, new_items, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run(wrap_in_source(new_items, full_settings)) + run_resource(pipeline, new_items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts.get("new_items", 0) == (10 if evolution_setting in ["evolve"] else 0) From e05855c19ec522e487da6d4ac174ca3160e68434 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 5 Sep 2023 18:08:50 +0200 Subject: [PATCH 13/73] finish implemention of global override --- dlt/common/schema/schema.py | 5 ++- dlt/normalize/normalize.py | 2 +- dlt/pipeline/pipeline.py | 2 +- tests/load/test_freeze_schema.py | 69 ++++++++++++++++---------------- 4 files changed, 40 insertions(+), 38 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index eefd7db288..9e12c6e709 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -201,7 +201,9 @@ def resolve_single(settings: TSchemaEvolutionSettings) -> TSchemaEvolutionModes: overide_modes = resolve_single(schema_evolution_settings_override) # resolve to correct settings dict - return {**DEFAULT_SCHEMA_EVOLUTION_MODES, **schema_evolution_modes, **table_evolution_modes, **overide_modes} # type: ignore + settings = cast(TSchemaEvolutionModes, {**DEFAULT_SCHEMA_EVOLUTION_MODES, **schema_evolution_modes, **table_evolution_modes, **overide_modes}) + + return settings def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, schema_evolution_settings_override: TSchemaEvolutionSettings) -> Tuple[DictStrAny, TPartialTableSchema]: @@ -236,7 +238,6 @@ def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAn for item in list(row.keys()): # if this is a new column for an existing table... 
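# A standalone sketch of the precedence resolved in resolve_evolution_settings_for_table
# above (constant values assumed here; the defaults are taken to allow all evolutions, in
# line with the "default settings allow all evolutions" comment): schema-level settings
# overlay the defaults, table-level settings overlay those, and the explicit override
# passed down from normalize wins; a bare mode string expands to all three elements.

def _expand(settings):
    # a single mode string applies to tables, new columns and variant columns alike
    if isinstance(settings, str):
        return {"table": settings, "column": settings, "column_variant": settings}
    return settings or {}

def _resolve(schema_settings, table_settings, override):
    defaults = {"table": "evolve", "column": "evolve", "column_variant": "evolve"}
    return {**defaults, **_expand(schema_settings), **_expand(table_settings), **_expand(override)}

# _resolve("freeze-and-raise", None, {"column": "evolve"})
# -> {"table": "freeze-and-raise", "column": "evolve", "column_variant": "freeze-and-raise"}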
if table_exists and item not in self.tables[table_name]["columns"]: - print("in_here " + item) is_variant = item in partial_table["columns"] and partial_table["columns"][item].get("variant") if evolution_modes["column"] == "freeze-and-trim" or (is_variant and evolution_modes["column_variant"] == "freeze-and-trim"): row.pop(item) diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index f09b4c8ef6..9c3b33fbaa 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -26,7 +26,7 @@ # normalize worker wrapping function (map_parallel, map_single) return type TMapFuncRV = Tuple[Sequence[TSchemaUpdate], TRowCount] # normalize worker wrapping function signature -TMapFuncType = Callable[[Schema, str, Sequence[str]], TMapFuncRV] # input parameters: (schema name, load_id, list of files to process) +TMapFuncType = Callable[[Schema, str, Sequence[str], TSchemaEvolutionSettings], TMapFuncRV] # input parameters: (schema name, load_id, list of files to process) # tuple returned by the worker TWorkerRV = Tuple[List[TSchemaUpdate], int, List[str], TRowCount] diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 1ae99efef7..40f6e2f4fc 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -459,7 +459,7 @@ def run( # normalize and load pending data if self.list_extracted_resources(): - self.normalize(loader_file_format=loader_file_format) + self.normalize(loader_file_format=loader_file_format, schema_evolution_settings=schema_evolution_settings) if self.list_normalized_load_packages(): # if there were any pending loads, load them and **exit** if data is not None: diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 49fee34dd7..e998f399dc 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -11,7 +11,7 @@ from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema import utils -SCHEMA_EVOLUTION_SETTINGS = ["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] +SCHEMA_EVOLUTION_SETTINGS = ["evolve", "freeze-and-trim", "freeze-and-discard", "freeze-and-raise"] def items(settings: TSchemaEvolutionSettings) -> Any: @@ -89,13 +89,13 @@ def load_items(): NEW_ITEMS_TABLE = "new_items" -def run_resource(pipeline, resource_fun, settings) -> DltSource: +def run_resource(pipeline, resource_fun, settings, settings_location: str) -> DltSource: - @dlt.source(name="freeze_tests", schema_evolution_settings=None) + @dlt.source(name="freeze_tests", schema_evolution_settings=settings if settings_location == "source" else None) def source() -> DltResource: - return resource_fun(None) + return resource_fun(settings if settings_location == "resource" else None) - pipeline.run(source(), schema_evolution_settings=settings) + pipeline.run(source(), schema_evolution_settings=settings if settings_location == "global_override" else None) @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) @@ -106,7 +106,7 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non "table": evolution_setting } pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) - run_resource(pipeline, items, full_settings) + run_resource(pipeline, items, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in 
pipeline.default_schema.tables["items"]["columns"] @@ -114,12 +114,12 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # "table": evolution_setting # } - run_resource(pipeline, items_with_new_column, full_settings) + run_resource(pipeline, items_with_new_column, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - run_resource(pipeline, items_with_variant, full_settings) + run_resource(pipeline, items_with_variant, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -127,11 +127,10 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # test adding new subtable if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_subtable, full_settings) + run_resource(pipeline, items_with_subtable, full_settings, setting_location) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - run_resource(pipeline, items_with_subtable, full_settings) - + run_resource(pipeline, items_with_subtable, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 if evolution_setting in ["freeze-and-raise"] else 40 @@ -140,37 +139,38 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # test adding new table if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, new_items, full_settings) + run_resource(pipeline, new_items, full_settings, setting_location) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - run_resource(pipeline, new_items, full_settings) + run_resource(pipeline, new_items, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts.get("new_items", 0) == (10 if evolution_setting in ["evolve"] else 0) @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) -def test_freeze_new_columns(evolution_setting: str) -> None: +@pytest.mark.parametrize("setting_location", ["resource", "source", "global_override"]) +def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> None: full_settings = { "column": evolution_setting } pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - pipeline.run([items(full_settings)]) + run_resource(pipeline, items, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { - "column": evolution_setting - } + # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { + # "column": evolution_setting + # } # subtable should work - pipeline.run([items_with_subtable(full_settings)]) + run_resource(pipeline, items_with_subtable, 
full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert table_counts[SUBITEMS_TABLE] == 10 # new should work - pipeline.run([new_items(full_settings)]) + run_resource(pipeline, new_items, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert table_counts[NEW_ITEMS_TABLE] == 10 @@ -178,10 +178,10 @@ def test_freeze_new_columns(evolution_setting: str) -> None: # test adding new column if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([items_with_new_column(full_settings)]) + run_resource(pipeline, items_with_new_column, full_settings, setting_location) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run([items_with_new_column(full_settings)]) + run_resource(pipeline, items_with_new_column, full_settings, setting_location) if evolution_setting == "evolve": assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -194,10 +194,10 @@ def test_freeze_new_columns(evolution_setting: str) -> None: # test adding variant column if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([items_with_variant(full_settings)]) + run_resource(pipeline, items_with_variant, full_settings, setting_location) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run([items_with_variant(full_settings)]) + run_resource(pipeline, items_with_variant, full_settings, setting_location) if evolution_setting == "evolve": assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -208,34 +208,35 @@ def test_freeze_new_columns(evolution_setting: str) -> None: @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) -def test_freeze_variants(evolution_setting: str) -> None: +@pytest.mark.parametrize("setting_location", ["resource", "source", "global_override"]) +def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: full_settings = { "column_variant": evolution_setting } pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - pipeline.run([items(full_settings)]) + run_resource(pipeline, items, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { - "column_variant": evolution_setting - } + # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { + # "column_variant": evolution_setting + # } # subtable should work - pipeline.run([items_with_subtable(full_settings)]) + run_resource(pipeline, items_with_subtable, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert table_counts[SUBITEMS_TABLE] == 10 # new should work - pipeline.run([new_items(full_settings)]) + run_resource(pipeline, new_items, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert 
table_counts["items"] == 20 assert table_counts[NEW_ITEMS_TABLE] == 10 # test adding new column - pipeline.run([items_with_new_column(full_settings)]) + run_resource(pipeline, items_with_new_column, full_settings, setting_location) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -243,10 +244,10 @@ def test_freeze_variants(evolution_setting: str) -> None: # test adding variant column if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([items_with_variant(full_settings)]) + run_resource(pipeline, items_with_variant, full_settings, setting_location) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - pipeline.run([items_with_variant(full_settings)]) + run_resource(pipeline, items_with_variant, full_settings, setting_location) if evolution_setting == "evolve": assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] From 3f0712713bdee909c19378052cd75b8e5acf5d7e Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 6 Sep 2023 09:44:50 +0200 Subject: [PATCH 14/73] better tests --- dlt/common/schema/utils.py | 5 +- tests/load/test_freeze_schema.py | 172 ++++++++++++++++++++++++------- 2 files changed, 134 insertions(+), 43 deletions(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 21c0142453..81332976cd 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -412,9 +412,8 @@ def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPa if table.get('parent') is None and (resource := partial_table.get('resource')): table['resource'] = resource - partial_e_s = partial_table.get("schema_evolution_settings") - if partial_e_s: - table["schema_evolution_settings"] = partial_e_s + # always update evolution settings + table["schema_evolution_settings"] = partial_table.get("schema_evolution_settings") return diff_table diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index e998f399dc..9d917cdaf1 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -11,7 +11,13 @@ from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema import utils +from tests.utils import skip_if_not_active + +skip_if_not_active("duckdb") + SCHEMA_EVOLUTION_SETTINGS = ["evolve", "freeze-and-trim", "freeze-and-discard", "freeze-and-raise"] +LOCATIONS = ["source", "resource", "override"] +SCHEMA_ELEMENTS = ["table", "column", "column_variant"] def items(settings: TSchemaEvolutionSettings) -> Any: @@ -89,37 +95,50 @@ def load_items(): NEW_ITEMS_TABLE = "new_items" -def run_resource(pipeline, resource_fun, settings, settings_location: str) -> DltSource: +def run_resource(pipeline, resource_fun, settings) -> DltSource: + + for item in settings.keys(): + assert item in LOCATIONS + ev_settings = settings[item] + if ev_settings in SCHEMA_EVOLUTION_SETTINGS: + continue + for key, val in ev_settings.items(): + assert val in SCHEMA_EVOLUTION_SETTINGS + assert key in SCHEMA_ELEMENTS - @dlt.source(name="freeze_tests", schema_evolution_settings=settings if settings_location == "source" else None) + @dlt.source(name="freeze_tests", schema_evolution_settings=settings.get("source")) def source() -> DltResource: - return resource_fun(settings if settings_location == "resource" else None) + return 
resource_fun(settings.get("resource")) - pipeline.run(source(), schema_evolution_settings=settings if settings_location == "global_override" else None) + # run pipeline + pipeline.run(source(), schema_evolution_settings=settings.get("override")) + # check updated schema + # assert pipeline.default_schema._settings["schema_evolution_settings"] == (settings if settings_location == "source" else None) + + # check items table settings + # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == (settings if settings_location == "resource" else None) @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) -@pytest.mark.parametrize("setting_location", ["resource", "source", "global_override"]) +@pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> None: full_settings = { + setting_location: { "table": evolution_setting - } + }} pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) - run_resource(pipeline, items, full_settings, setting_location) + run_resource(pipeline, items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { - # "table": evolution_setting - # } - run_resource(pipeline, items_with_new_column, full_settings, setting_location) + run_resource(pipeline, items_with_new_column, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - run_resource(pipeline, items_with_variant, full_settings, setting_location) + run_resource(pipeline, items_with_variant, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -127,10 +146,10 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # test adding new subtable if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_subtable, full_settings, setting_location) + run_resource(pipeline, items_with_subtable, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - run_resource(pipeline, items_with_subtable, full_settings, setting_location) + run_resource(pipeline, items_with_subtable, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 if evolution_setting in ["freeze-and-raise"] else 40 @@ -139,38 +158,37 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non # test adding new table if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, new_items, full_settings, setting_location) + run_resource(pipeline, new_items, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - run_resource(pipeline, new_items, full_settings, setting_location) + run_resource(pipeline, new_items, full_settings) 
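# Condensed sketch of the "freeze-and-raise" branch this parametrization exercises, using
# the same in-memory duckdb setup as get_pipeline above (the two helper resources are
# stand-ins for the fixtures defined at the top of this module; only the assertions mirror
# the test):
import dlt, duckdb, pytest
from dlt.common.utils import uniq_id
from dlt.pipeline.exceptions import PipelineStepFailed
from dlt.common.schema.exceptions import SchemaFrozenException

@dlt.resource(name="items")
def seed_items():
    yield from ({"id": i, "name": f"item {i}"} for i in range(10))

@dlt.resource(name="new_items")
def extra_items():
    yield from ({"id": i} for i in range(10))

frozen = {"table": "freeze-and-raise"}
demo = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:"), full_refresh=True)
# the initial load passes; introducing a brand new table afterwards is rejected during normalize
demo.run(seed_items(), schema_evolution_settings=frozen)
with pytest.raises(PipelineStepFailed) as frozen_ex:
    demo.run(extra_items(), schema_evolution_settings=frozen)
assert isinstance(frozen_ex.value.__context__, SchemaFrozenException)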
table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts.get("new_items", 0) == (10 if evolution_setting in ["evolve"] else 0) @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) -@pytest.mark.parametrize("setting_location", ["resource", "source", "global_override"]) +@pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> None: full_settings = { + setting_location: { "column": evolution_setting - } + }} + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - run_resource(pipeline, items, full_settings, setting_location) + run_resource(pipeline, items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { - # "column": evolution_setting - # } # subtable should work - run_resource(pipeline, items_with_subtable, full_settings, setting_location) + run_resource(pipeline, items_with_subtable, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert table_counts[SUBITEMS_TABLE] == 10 # new should work - run_resource(pipeline, new_items, full_settings, setting_location) + run_resource(pipeline, new_items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert table_counts[NEW_ITEMS_TABLE] == 10 @@ -178,10 +196,10 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No # test adding new column if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_new_column, full_settings, setting_location) + run_resource(pipeline, items_with_new_column, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - run_resource(pipeline, items_with_new_column, full_settings, setting_location) + run_resource(pipeline, items_with_new_column, full_settings) if evolution_setting == "evolve": assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -190,14 +208,13 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == (30 if evolution_setting in ["evolve", "freeze-and-trim"] else 20) - # test adding variant column if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_variant, full_settings, setting_location) + run_resource(pipeline, items_with_variant, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - run_resource(pipeline, items_with_variant, full_settings, setting_location) + run_resource(pipeline, items_with_variant, full_settings) if evolution_setting == "evolve": assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -208,35 +225,33 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No @pytest.mark.parametrize("evolution_setting", 
SCHEMA_EVOLUTION_SETTINGS) -@pytest.mark.parametrize("setting_location", ["resource", "source", "global_override"]) +@pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: full_settings = { + setting_location: { "column_variant": evolution_setting - } + }} pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - run_resource(pipeline, items, full_settings, setting_location) + run_resource(pipeline, items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == { - # "column_variant": evolution_setting - # } # subtable should work - run_resource(pipeline, items_with_subtable, full_settings, setting_location) + run_resource(pipeline, items_with_subtable, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert table_counts[SUBITEMS_TABLE] == 10 # new should work - run_resource(pipeline, new_items, full_settings, setting_location) + run_resource(pipeline, new_items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert table_counts[NEW_ITEMS_TABLE] == 10 # test adding new column - run_resource(pipeline, items_with_new_column, full_settings, setting_location) + run_resource(pipeline, items_with_new_column, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -244,10 +259,10 @@ def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: # test adding variant column if evolution_setting == "freeze-and-raise": with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_variant, full_settings, setting_location) + run_resource(pipeline, items_with_variant, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: - run_resource(pipeline, items_with_variant, full_settings, setting_location) + run_resource(pipeline, items_with_variant, full_settings) if evolution_setting == "evolve": assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -256,3 +271,80 @@ def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == (40 if evolution_setting in ["evolve", "freeze-and-trim"] else 30) + +def test_settings_precedence() -> None: + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + + # load some data + run_resource(pipeline, items, {}) + + # trying to add new column when forbidden on resource will fail + run_resource(pipeline, items_with_new_column, {"resource": { + "column": "freeze-and-discard" + }}) + + # when allowed on override it will work + run_resource(pipeline, items_with_new_column, { + "resource": {"column": "freeze-and-raise"}, + "override": {"column": "evolve"} + }) + + +def test_settings_precedence_2() -> 
None: + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + + # load some data + run_resource(pipeline, items, {"source": { + "column_variant": "freeze-and-discard" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden on source will fail + run_resource(pipeline, items_with_variant, {"source": { + "column_variant": "freeze-and-discard" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # if allowed on resource it will pass + run_resource(pipeline, items_with_variant, { + "resource": {"column_variant": "evolve"}, + "source": {"column_variant": "freeze-and-discard"} + }) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + + # if allowed on override it will also pass + run_resource(pipeline, items_with_variant, { + "resource": {"column_variant": "freeze-and-discard"}, + "source": {"column_variant": "freeze-and-discard"}, + "override": {"column_variant": "evolve"}, + }) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_change_mode(setting_location: str) -> None: + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + + # load some data + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden on source will fail + run_resource(pipeline, items_with_variant, {setting_location: { + "column_variant": "freeze-and-discard" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # now allow + run_resource(pipeline, items_with_variant, {setting_location: { + "column_variant": "evolve" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + + From 6308369c202175c5b62f7ec9b628a9730d97e6a7 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 6 Sep 2023 10:04:56 +0200 Subject: [PATCH 15/73] carry over schema settings on update --- dlt/extract/decorators.py | 1 - dlt/pipeline/pipeline.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 37c3e229ad..d053b7c79d 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -119,7 +119,6 @@ def source( Returns: `DltSource` instance """ - if name and schema: raise ArgumentsOverloadException("'name' has no effect when `schema` argument is present", source.__name__) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 40f6e2f4fc..357c055f34 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -879,6 +879,7 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para # note we are not merging props like max nesting or column propagation for table in source_schema.data_tables(include_incomplete=True): 
pipeline_schema.update_schema(pipeline_schema.normalize_table_identifiers(table)) + pipeline_schema._settings["schema_evolution_settings"] = source_schema._settings.get("schema_evolution_settings") return extract_id From ab0b8d786a2f8f8c3cc96d7bc0a9f6d428332156 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 6 Sep 2023 10:31:39 +0200 Subject: [PATCH 16/73] add tests for single values --- tests/load/test_freeze_schema.py | 47 +++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_schema.py index 9d917cdaf1..9983a92731 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_schema.py @@ -1,7 +1,6 @@ import dlt, os, pytest from dlt.common.schema.typing import TSchemaEvolutionSettings from dlt.common.utils import uniq_id -import duckdb from typing import Any from dlt.extract.source import DltSource, DltResource @@ -114,20 +113,26 @@ def source() -> DltResource: pipeline.run(source(), schema_evolution_settings=settings.get("override")) # check updated schema - # assert pipeline.default_schema._settings["schema_evolution_settings"] == (settings if settings_location == "source" else None) + assert pipeline.default_schema._settings["schema_evolution_settings"] == settings.get("source") # check items table settings - # assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == (settings if settings_location == "resource" else None) + assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == settings.get("resource") + +def get_pipeline(): + import duckdb + return dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) + @pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> None: + pipeline = get_pipeline() + full_settings = { setting_location: { "table": evolution_setting }} - pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) run_resource(pipeline, items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 @@ -175,7 +180,7 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No "column": evolution_setting }} - pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + pipeline = get_pipeline() run_resource(pipeline, items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 @@ -232,7 +237,7 @@ def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: setting_location: { "column_variant": evolution_setting }} - pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + pipeline = get_pipeline() run_resource(pipeline, items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 @@ -273,7 +278,7 @@ def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: def test_settings_precedence() -> None: - pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', 
credentials=duckdb.connect(':memory:')) + pipeline = get_pipeline() # load some data run_resource(pipeline, items, {}) @@ -291,7 +296,7 @@ def test_settings_precedence() -> None: def test_settings_precedence_2() -> None: - pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + pipeline = get_pipeline() # load some data run_resource(pipeline, items, {"source": { @@ -326,14 +331,14 @@ def test_settings_precedence_2() -> None: @pytest.mark.parametrize("setting_location", LOCATIONS) def test_change_mode(setting_location: str) -> None: - pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + pipeline = get_pipeline() # load some data run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 - # trying to add variant when forbidden on source will fail + # trying to add variant when forbidden will fail run_resource(pipeline, items_with_variant, {setting_location: { "column_variant": "freeze-and-discard" }}) @@ -347,4 +352,26 @@ def test_change_mode(setting_location: str) -> None: table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_single_settings_value(setting_location: str) -> None: + pipeline = get_pipeline() + + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + # trying to add variant when forbidden will fail + run_resource(pipeline, items_with_variant, {setting_location: "freeze-and-discard"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add new column will fail + run_resource(pipeline, items_with_new_column, {setting_location: "freeze-and-discard"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add new table will fail + run_resource(pipeline, new_items, {setting_location: "freeze-and-discard"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert "new_items" not in table_counts \ No newline at end of file From 6e99ed9c1556c3e69acb9c561cd22073fe34e918 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 6 Sep 2023 14:12:52 +0200 Subject: [PATCH 17/73] small changes to tests and code --- dlt/common/schema/utils.py | 4 +- .../cases/schemas/eth/ethereum_schema_v7.yml | 20 +- tests/common/utils.py | 2 +- .../cases/eth_source/ethereum.schema.yaml | 922 ++++++++++++++++-- 4 files changed, 847 insertions(+), 101 deletions(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 81332976cd..4b5565a45b 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -286,10 +286,10 @@ def migrate_filters(group: str, filters: List[str]) -> None: if from_engine == 6 and to_engine > 6: # migrate from sealed properties to schema evolution settings schema_dict["settings"].pop("schema_sealed", None) - schema_dict["settings"]["schema_evolution_settings"] = "evolve" + schema_dict["settings"]["schema_evolution_settings"] = None for table in 
schema_dict["tables"].values(): table.pop("table_sealed", None) - table["schema_evolution_settings"] = {} + table["schema_evolution_settings"] = None schema_dict["tables"][LOADS_TABLE_NAME] = load_table() from_engine = 7 diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml index 02212de8d8..18cb63cf9c 100644 --- a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -1,5 +1,5 @@ version: 14 -version_hash: MuiE8s34Wub6EKLDCIIygZ3nfwXgIwtNDiyji7A6pwY= +version_hash: 0Z93TpKZOdz5UpHrSDMKdUMUpQld9aP/9ZFYPRAWOI4= engine_version: 7 name: ethereum tables: @@ -146,7 +146,7 @@ tables: name: schema write_disposition: skip description: Created by DLT. Tracks schema updates - schema_evolution_settings: {} + schema_evolution_settings: null name: _dlt_version resource: _dlt_version blocks: @@ -410,7 +410,7 @@ tables: merge_key: false data_type: text name: transactions_root - schema_evolution_settings: {} + schema_evolution_settings: null name: blocks resource: blocks blocks__transactions: @@ -680,7 +680,7 @@ tables: merge_key: false data_type: decimal name: eth_value - schema_evolution_settings: {} + schema_evolution_settings: null name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions @@ -805,7 +805,7 @@ tables: merge_key: false data_type: text name: transaction_hash - schema_evolution_settings: {} + schema_evolution_settings: null name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs @@ -870,7 +870,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: {} + schema_evolution_settings: null name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions @@ -935,7 +935,7 @@ tables: merge_key: false data_type: text name: address - schema_evolution_settings: {} + schema_evolution_settings: null name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list @@ -1000,7 +1000,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: {} + schema_evolution_settings: null name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks @@ -1065,7 +1065,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: {} + schema_evolution_settings: null name: blocks__uncles settings: default_hints: @@ -1087,7 +1087,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp - schema_evolution_settings: evolve + schema_evolution_settings: null normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/common/utils.py b/tests/common/utils.py index 5dce971b69..fe8ab47e89 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -15,7 +15,7 @@ COMMON_TEST_CASES_PATH = "./tests/common/cases/" # for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V7 = "MuiE8s34Wub6EKLDCIIygZ3nfwXgIwtNDiyji7A6pwY=" +IMPORTED_VERSION_HASH_ETH_V7 = "0Z93TpKZOdz5UpHrSDMKdUMUpQld9aP/9ZFYPRAWOI4=" # test sentry DSN TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" # preserve secrets path to be able to restore it diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index b5d54f9c49..18cb63cf9c 100644 --- 
a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -1,328 +1,1073 @@ -version: 11 -version_hash: GPHX4B+0xnRuGZM/w3UYVbldRyg8jSJp1G60K4RDcZg= -engine_version: 5 +version: 14 +version_hash: 0Z93TpKZOdz5UpHrSDMKdUMUpQld9aP/9ZFYPRAWOI4= +engine_version: 7 name: ethereum tables: _dlt_loads: + name: _dlt_loads columns: load_id: - data_type: text nullable: false - schema_name: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: load_id data_type: text + schema_name: nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: schema_name + data_type: text status: - data_type: bigint nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: status + data_type: bigint inserted_at: - data_type: timestamp nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: inserted_at + data_type: timestamp + schema_version_hash: + nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + name: schema_version_hash + data_type: text write_disposition: skip + resource: _dlt_loads + schema_evolution_settings: null description: Created by DLT. Tracks completed loads _dlt_version: columns: version: - data_type: bigint nullable: false - engine_version: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: bigint + name: version + engine_version: nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: engine_version inserted_at: - data_type: timestamp nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: timestamp + name: inserted_at schema_name: - data_type: text nullable: false - version_hash: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: schema_name + version_hash: nullable: false - schema: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: version_hash + schema: nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: schema write_disposition: skip description: Created by DLT. 
Tracks schema updates + schema_evolution_settings: null + name: _dlt_version + resource: _dlt_version blocks: description: Ethereum blocks x-annotation: this will be preserved on save write_disposition: append - table_sealed: true filters: includes: [] excludes: [] columns: _dlt_load_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false description: load id coming from the extractor data_type: text - nullable: false + name: _dlt_load_id _dlt_id: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_id number: + nullable: false + partition: false + cluster: false + unique: false + sort: false primary_key: true + foreign_key: false + root_key: false + merge_key: false data_type: bigint - nullable: false + name: number parent_hash: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: parent_hash hash: + nullable: false + partition: false cluster: true unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: hash base_fee_per_gas: - data_type: wei nullable: false - difficulty: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: wei + name: base_fee_per_gas + difficulty: nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: difficulty extra_data: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: extra_data gas_limit: - data_type: bigint nullable: false - gas_used: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: bigint + name: gas_limit + gas_used: nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: gas_used logs_bloom: - data_type: binary nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: binary + name: logs_bloom miner: - data_type: text nullable: true - mix_hash: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: miner + mix_hash: nullable: true - nonce: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: mix_hash + nonce: nullable: true - receipts_root: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: nonce + receipts_root: nullable: true - sha3_uncles: + partition: false + cluster: false + unique: 
false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: receipts_root + sha3_uncles: nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: sha3_uncles size: - data_type: bigint nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: size state_root: - data_type: text nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: state_root timestamp: + nullable: false + partition: false + cluster: false unique: true sort: true + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: timestamp - nullable: false + name: timestamp total_difficulty: - data_type: wei nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: total_difficulty transactions_root: - data_type: text nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: transactions_root + schema_evolution_settings: null + name: blocks + resource: blocks blocks__transactions: parent: blocks columns: _dlt_id: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_id block_number: + nullable: false + partition: false + cluster: false + unique: false + sort: false primary_key: true foreign_key: true + root_key: false + merge_key: false data_type: bigint - nullable: false + name: block_number transaction_index: + nullable: false + partition: false + cluster: false + unique: false + sort: false primary_key: true + foreign_key: false + root_key: false + merge_key: false data_type: bigint - nullable: false + name: transaction_index hash: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: hash block_hash: + nullable: false + partition: false cluster: true + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: block_hash block_timestamp: + nullable: false + partition: false + cluster: false + unique: false sort: true + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: timestamp - nullable: false + name: block_timestamp chain_id: - data_type: text nullable: true - from: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: chain_id + from: nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: from gas: - data_type: bigint nullable: true - gas_price: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + 
foreign_key: false + root_key: false + merge_key: false data_type: bigint + name: gas + gas_price: nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: gas_price input: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: input max_fee_per_gas: - data_type: wei nullable: true - max_priority_fee_per_gas: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: wei + name: max_fee_per_gas + max_priority_fee_per_gas: nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: max_priority_fee_per_gas nonce: - data_type: bigint nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: nonce r: - data_type: text nullable: true - s: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: r + s: nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: s status: - data_type: bigint nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: status to: - data_type: text nullable: true - type: + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text + name: to + type: nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: type v: - data_type: bigint nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: v value: - data_type: wei nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: wei + name: value eth_value: - data_type: decimal nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: decimal + name: eth_value + schema_evolution_settings: null + name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions columns: _dlt_id: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_id address: - data_type: text nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: address block_timestamp: + nullable: false + 
partition: false + cluster: false + unique: false sort: true + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: timestamp - nullable: false + name: block_timestamp block_hash: + nullable: false + partition: false cluster: true + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: block_hash block_number: + nullable: false + partition: false + cluster: false + unique: false + sort: false primary_key: true foreign_key: true + root_key: false + merge_key: false data_type: bigint - nullable: false + name: block_number transaction_index: + nullable: false + partition: false + cluster: false + unique: false + sort: false primary_key: true foreign_key: true + root_key: false + merge_key: false data_type: bigint - nullable: false + name: transaction_index log_index: + nullable: false + partition: false + cluster: false + unique: false + sort: false primary_key: true + foreign_key: false + root_key: false + merge_key: false data_type: bigint - nullable: false + name: log_index data: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: data removed: - data_type: bool nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bool + name: removed transaction_hash: - data_type: text nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: transaction_hash + schema_evolution_settings: null + name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs columns: _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false foreign_key: true + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false root_key: true + merge_key: false data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: value + schema_evolution_settings: null + name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions columns: _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false foreign_key: true + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + partition: false + cluster: false + 
unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false root_key: true + merge_key: false data_type: text - nullable: false + name: _dlt_root_id address: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: address + schema_evolution_settings: null + name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list columns: _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false foreign_key: true + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false root_key: true + merge_key: false data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: value + schema_evolution_settings: null + name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks columns: _dlt_parent_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false foreign_key: true + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false + partition: false + cluster: false unique: true + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false root_key: true + merge_key: false data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + partition: false + cluster: false + unique: false + sort: false + primary_key: false + foreign_key: false + root_key: false + merge_key: false + data_type: text + name: value + schema_evolution_settings: null + name: blocks__uncles settings: - schema_sealed: true default_hints: foreign_key: - _dlt_parent_id @@ -342,6 +1087,7 @@ 
settings: preferred_types: timestamp: timestamp block_timestamp: timestamp + schema_evolution_settings: null normalizers: names: dlt.common.normalizers.names.snake_case json: From 1a10ec47f3d99b78a30b14e6d90a83988499f6fc Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 6 Sep 2023 15:27:25 +0200 Subject: [PATCH 18/73] fix small error --- dlt/common/schema/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 4b5565a45b..27ae2be883 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -290,7 +290,6 @@ def migrate_filters(group: str, filters: List[str]) -> None: for table in schema_dict["tables"].values(): table.pop("table_sealed", None) table["schema_evolution_settings"] = None - schema_dict["tables"][LOADS_TABLE_NAME] = load_table() from_engine = 7 schema_dict["engine_version"] = from_engine From 175bee5d0ea3b8f0942d31d1a6ab5e22390e2abc Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 6 Sep 2023 15:49:13 +0200 Subject: [PATCH 19/73] add tests for data contract interaction --- ...ma.py => test_freeze_and_data_contract.py} | 65 ++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) rename tests/load/{test_freeze_schema.py => test_freeze_and_data_contract.py} (88%) diff --git a/tests/load/test_freeze_schema.py b/tests/load/test_freeze_and_data_contract.py similarity index 88% rename from tests/load/test_freeze_schema.py rename to tests/load/test_freeze_and_data_contract.py index 9983a92731..253b210a7f 100644 --- a/tests/load/test_freeze_schema.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -9,6 +9,7 @@ from dlt.pipeline.exceptions import PipelineStepFailed from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema import utils +from pydantic import BaseModel from tests.utils import skip_if_not_active @@ -374,4 +375,66 @@ def test_single_settings_value(setting_location: str) -> None: run_resource(pipeline, new_items, {setting_location: "freeze-and-discard"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 - assert "new_items" not in table_counts \ No newline at end of file + assert "new_items" not in table_counts + + +def test_data_contract_interaction() -> None: + """ + ensure data contracts with pydantic are enforced properly + """ + pipeline = get_pipeline() + + class Items(BaseModel): + id: int # noqa: A003 + name: str + amount: int + + @dlt.resource(name="items", columns=Items) + def get_items_variant(): + yield from [{ + "id": 5, + "name": "dave", + "amount": "HELLO" + }] + + @dlt.resource(name="items", columns=Items) + def get_items_new_col(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + "new_col": "hello" + }] + + @dlt.resource(name="items", columns=Items) + def get_items_subtable(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + "sub": [{"hello": "dave"}] + }] + + # disallow variants + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([get_items_variant()], schema_evolution_settings={"column_variant": "freeze-and-raise"}) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) + + # without settings it will pass + pipeline.run([get_items_variant()], schema_evolution_settings={"column_variant": "evolve"}) + + # disallow new col + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([get_items_new_col()], schema_evolution_settings={"column": "freeze-and-raise"}) + assert 
isinstance(py_ex.value.__context__, SchemaFrozenException) + + # without settings it will pass + pipeline.run([get_items_new_col()], schema_evolution_settings={"column": "evolve"}) + + # disallow new tables + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.run([get_items_subtable()], schema_evolution_settings={"table": "freeze-and-raise"}) + assert isinstance(py_ex.value.__context__, SchemaFrozenException) + + # without settings it will pass + pipeline.run([get_items_subtable()], schema_evolution_settings={"table": "evolve"}) \ No newline at end of file From 18b9341aaad5dd2ee9d7b5e6b79b618e03113353 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 7 Sep 2023 09:36:38 +0200 Subject: [PATCH 20/73] fix tests --- tests/load/test_freeze_and_data_contract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 253b210a7f..3d953ca885 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -9,7 +9,6 @@ from dlt.pipeline.exceptions import PipelineStepFailed from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema import utils -from pydantic import BaseModel from tests.utils import skip_if_not_active @@ -382,6 +381,7 @@ def test_data_contract_interaction() -> None: """ ensure data contracts with pydantic are enforced properly """ + from pydantic import BaseModel pipeline = get_pipeline() class Items(BaseModel): From 881d79aeccd2f7ae8afc4ff1dbc487279e44a4ef Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 12 Sep 2023 22:52:58 +0200 Subject: [PATCH 21/73] some PR work --- dlt/common/pipeline.py | 4 +- dlt/common/schema/exceptions.py | 6 +- dlt/common/schema/schema.py | 58 +++++---- dlt/common/schema/typing.py | 16 +-- dlt/common/schema/utils.py | 14 +-- dlt/common/typing.py | 17 ++- dlt/common/validation.py | 29 +++-- dlt/extract/decorators.py | 20 ++-- dlt/extract/schema.py | 8 +- dlt/normalize/normalize.py | 30 ++--- dlt/pipeline/__init__.py | 4 +- dlt/pipeline/pipeline.py | 14 +-- .../cases/schemas/eth/ethereum_schema_v7.yml | 20 ++-- tests/common/schema/test_schema.py | 4 +- tests/common/test_typing.py | 20 +++- tests/common/test_validation.py | 30 ++++- .../cases/eth_source/ethereum.schema.yaml | 20 ++-- tests/load/test_freeze_and_data_contract.py | 110 +++++++++--------- 18 files changed, 237 insertions(+), 187 deletions(-) diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 92be31b720..ae96f5c300 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -16,7 +16,7 @@ from dlt.common.destination import DestinationReference, TDestinationReferenceArg from dlt.common.exceptions import DestinationHasFailedJobs, PipelineStateNotAvailable, ResourceNameNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaEvolutionSettings +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContractSettings from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.typing import DictStrAny, REPattern @@ -210,7 +210,7 @@ def run( primary_key: TColumnNames = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, - schema_evolution_settings: TSchemaEvolutionSettings = None, + schema_contract_settings: TSchemaContractSettings = None, ) -> LoadInfo: ... 
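# ---------------------------------------------------------------------------
# Hedged usage sketch for the `schema_contract_settings` argument added to
# `run()` above. Pipeline/resource names and the sample row are illustrative
# assumptions, not code taken from this patch series.
import dlt

@dlt.resource(name="items")
def items():
    yield {"id": 1, "name": "first"}

pipeline = dlt.pipeline(pipeline_name="contracts_demo", destination="duckdb")

# the first load uses the default contract ("evolve" everywhere) and creates the table
pipeline.run(items())

# later loads can tighten the contract with a single mode for everything ...
pipeline.run(items(), schema_contract_settings="freeze")

# ... or with a per-entity mapping using the modes defined in this commit
pipeline.run(items(), schema_contract_settings={
    "table": "evolve",        # new tables may still be created
    "column": "discard-row",  # rows introducing new columns are dropped
    "data_type": "freeze",    # new variant columns raise SchemaFrozenException
})
# ---------------------------------------------------------------------------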
diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index d68dbc47a3..8fd0f1b772 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -72,5 +72,7 @@ def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engi class SchemaFrozenException(SchemaException): - def __init__(self, msg: str) -> None: - super().__init__(msg) \ No newline at end of file + def __init__(self, schema_name: str, table_name: str, msg: str) -> None: + super().__init__(msg) + self.schema_name = schema_name + self.table_name = table_name \ No newline at end of file diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 8bd3536966..3d1603b13a 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -11,17 +11,17 @@ from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaEvolutionModes, TSchemaEvolutionSettings) + TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractModes, TSchemaContractSettings) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict from dlt.common.schema.exceptions import SchemaFrozenException -DEFAULT_SCHEMA_EVOLUTION_MODES: TSchemaEvolutionModes = { +DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractModes = { "table": "evolve", "column": "evolve", - "column_variant": "evolve" + "data_type": "evolve" } class Schema: @@ -194,68 +194,64 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def resolve_evolution_settings_for_table(self, parent_table: str, table_name: str, schema_evolution_settings_override: TSchemaEvolutionSettings) -> TSchemaEvolutionModes: + def resolve_evolution_settings_for_table(self, parent_table: str, table_name: str, schema_contract_settings_override: TSchemaContractSettings) -> TSchemaContractModes: - def resolve_single(settings: TSchemaEvolutionSettings) -> TSchemaEvolutionModes: + def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: settings = settings or {} if isinstance(settings, str): - return TSchemaEvolutionModes(table=settings, column=settings, column_variant=settings) + return TSchemaContractModes(table=settings, column=settings, data_type=settings) return settings # find table settings table_with_settings = parent_table or table_name # modes - table_evolution_modes = resolve_single(self.tables.get(table_with_settings, {}).get("schema_evolution_settings", {})) - schema_evolution_modes = resolve_single(self._settings.get("schema_evolution_settings", {})) - overide_modes = resolve_single(schema_evolution_settings_override) + table_contract_modes = resolve_single(self.tables.get(table_with_settings, {}).get("schema_contract_settings", {})) + schema_contract_modes = resolve_single(self._settings.get("schema_contract_settings", {})) + overide_modes = resolve_single(schema_contract_settings_override) # resolve to correct settings dict - settings = 
cast(TSchemaEvolutionModes, {**DEFAULT_SCHEMA_EVOLUTION_MODES, **schema_evolution_modes, **table_evolution_modes, **overide_modes}) + settings = cast(TSchemaContractModes, {**DEFAULT_SCHEMA_CONTRACT_MODE, **schema_contract_modes, **table_contract_modes, **overide_modes}) return settings - def check_schema_update(self, parent_table: str, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, schema_evolution_settings_override: TSchemaEvolutionSettings) -> Tuple[DictStrAny, TPartialTableSchema]: + def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, schema_contract_settings_override: TSchemaContractSettings) -> Tuple[DictStrAny, TPartialTableSchema]: """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" assert partial_table - # for now we defined the schema as new if there are no data columns defined - has_columns = self.has_data_columns - if not has_columns: - return row, partial_table - - evolution_modes = self.resolve_evolution_settings_for_table(parent_table, table_name, schema_evolution_settings_override) - # default settings allow all evolutions, skipp all else - if evolution_modes == DEFAULT_SCHEMA_EVOLUTION_MODES: + if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table - table_exists = table_name in self.tables and len(self.tables[table_name].get("columns", {})) + table_exists = table_name in self.tables and self.get_table_columns(table_name, include_incomplete=False) # check case where we have a new table if not table_exists: - if evolution_modes == "freeze-and-trim": + if contract_modes == "discard-value": return None, None - if evolution_modes["table"] in ["freeze-and-discard", "freeze-and-trim"]: + if contract_modes["table"] in ["discard-row", "discard-value"]: return None, None - if evolution_modes["table"] == "freeze-and-raise": - raise SchemaFrozenException(f"Trying to add table {table_name} but new tables are frozen.") + if contract_modes["table"] == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to add table {table_name} but new tables are frozen.") # check columns for item in list(row.keys()): for item in list(row.keys()): # if this is a new column for an existing table... 
- if table_exists and item not in self.tables[table_name]["columns"]: + if table_exists and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): is_variant = item in partial_table["columns"] and partial_table["columns"][item].get("variant") - if evolution_modes["column"] == "freeze-and-trim" or (is_variant and evolution_modes["column_variant"] == "freeze-and-trim"): + if contract_modes["column"] == "discard-value" or (is_variant and contract_modes["data_type"] == "discard-value"): row.pop(item) partial_table["columns"].pop(item) - if evolution_modes["column"] == "freeze-and-discard" or (is_variant and evolution_modes["column_variant"] == "freeze-and-discard"): + elif contract_modes["column"] == "discard-row" or (is_variant and contract_modes["data_type"] == "discard-row"): return None, None - if evolution_modes["column"] == "freeze-and-raise" or (is_variant and evolution_modes["column_variant"] == "freeze-and-raise"): - raise SchemaFrozenException(f"Trying to add column {item} to table {table_name}  but columns are frozen.") + elif contract_modes["column"] == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name}  but columns are frozen.") + elif is_variant and contract_modes["data_type"] == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name}  data_types are frozen.") + return row, partial_table @@ -454,8 +450,8 @@ def update_normalizers(self) -> None: normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) - def set_schema_evolution_settings(self, settings: TSchemaEvolutionSettings) -> None: - self._settings["schema_evolution_settings"] = settings + def set_schema_contract_settings(self, settings: TSchemaContractSettings) -> None: + self._settings["schema_contract_settings"] = settings def _infer_column(self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False) -> TColumnSchema: column_schema = TColumnSchema( diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 450fd79035..08d89b1f13 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -65,15 +65,15 @@ class TColumnSchema(TColumnSchemaBase, total=False): TColumnName = NewType("TColumnName", str) SIMPLE_REGEX_PREFIX = "re:" -TSchemaEvolutionMode = Literal["evolve", "freeze-and-trim", "freeze-and-raise", "freeze-and-discard"] +TSchemaEvolutionMode = Literal["evolve", "discard-value", "freeze", "discard-row"] -class TSchemaEvolutionModes(TypedDict, total=False): +class TSchemaContractModes(TypedDict, total=False): """TypedDict defining the schema update settings""" - table: TSchemaEvolutionMode - column: TSchemaEvolutionMode - column_variant: TSchemaEvolutionMode + table: Optional[TSchemaEvolutionMode] + column: Optional[TSchemaEvolutionMode] + data_type: Optional[TSchemaEvolutionMode] -TSchemaEvolutionSettings = Union[TSchemaEvolutionMode, TSchemaEvolutionModes] +TSchemaContractSettings = Union[TSchemaEvolutionMode, TSchemaContractModes] class TRowFilters(TypedDict, total=True): excludes: Optional[List[TSimpleRegex]] @@ -85,7 +85,7 @@ class TTableSchema(TypedDict, total=False): name: Optional[str] description: Optional[str] write_disposition: Optional[TWriteDisposition] - schema_evolution_settings: Optional[TSchemaEvolutionSettings] + schema_contract_settings: Optional[TSchemaContractSettings] parent: 
Optional[str] filters: Optional[TRowFilters] columns: TTableSchemaColumns @@ -101,7 +101,7 @@ class TPartialTableSchema(TTableSchema): class TSchemaSettings(TypedDict, total=False): - schema_evolution_settings: Optional[TSchemaEvolutionSettings] + schema_contract_settings: Optional[TSchemaContractSettings] detections: Optional[List[TTypeDetections]] default_hints: Optional[Dict[TColumnHint, List[TSimpleRegex]]] preferred_types: Optional[Dict[TSimpleRegex, TDataType]] diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index bb0ba2e3f9..31cea1480d 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -16,7 +16,7 @@ from dlt.common.schema import detections from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, - TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaEvolutionSettings, TSchemaEvolutionModes) + TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaContractSettings, TSchemaContractModes) from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, TablePropertiesConflictException, InvalidSchemaName) @@ -343,10 +343,10 @@ def migrate_filters(group: str, filters: List[str]) -> None: if from_engine == 6 and to_engine > 6: # migrate from sealed properties to schema evolution settings schema_dict["settings"].pop("schema_sealed", None) - schema_dict["settings"]["schema_evolution_settings"] = None + schema_dict["settings"]["schema_contract_settings"] = None for table in schema_dict["tables"].values(): table.pop("table_sealed", None) - table["schema_evolution_settings"] = None + table["schema_contract_settings"] = None from_engine = 7 schema_dict["engine_version"] = from_engine @@ -476,7 +476,7 @@ def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPa table["columns"] = updated_columns # always update evolution settings - table["schema_evolution_settings"] = partial_table.get("schema_evolution_settings") + table["schema_contract_settings"] = partial_table.get("schema_contract_settings") return diff_table @@ -644,7 +644,7 @@ def new_table( columns: Sequence[TColumnSchema] = None, validate_schema: bool = False, resource: str = None, - schema_evolution_settings: TSchemaEvolutionSettings = None, + schema_contract_settings: TSchemaContractSettings = None, ) -> TTableSchema: table: TTableSchema = { @@ -655,12 +655,12 @@ def new_table( table["parent"] = parent_table_name assert write_disposition is None assert resource is None - assert schema_evolution_settings is None + assert schema_contract_settings is None else: # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name - table["schema_evolution_settings"] = schema_evolution_settings + table["schema_contract_settings"] = schema_contract_settings if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 86fa1635df..2f1cecc093 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -66,41 +66,38 @@ def asstr(self, verbosity: int = 0) -> str: ... 
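# ---------------------------------------------------------------------------
# Hedged illustration of the union helpers added just below, assuming they
# behave as the added code suggests:
#
#     from typing import Optional, Union
#     is_union_type(Union[int, str])                     # True
#     is_optional_type(Optional[int])                    # True (Union[int, None])
#     extract_union_types(Optional[int], no_none=True)   # [int]
#     extract_union_types(Union[int, str])               # [int, str]
# ---------------------------------------------------------------------------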
+def is_union_type(t: Type[Any]) -> bool: + return get_origin(t) is Union + def is_optional_type(t: Type[Any]) -> bool: return get_origin(t) is Union and type(None) in get_args(t) - def is_final_type(t: Type[Any]) -> bool: return get_origin(t) is Final - -def extract_optional_type(t: Type[Any]) -> Any: - return get_args(t)[0] - +def extract_union_types(t: Type[Any], no_none: bool = False) -> List[Any]: + if no_none: + return [arg for arg in get_args(t) if arg is not type(None)] + return list(get_args(t)) def is_literal_type(hint: Type[Any]) -> bool: return get_origin(hint) is Literal - def is_union(hint: Type[Any]) -> bool: return get_origin(hint) is Union - def is_newtype_type(t: Type[Any]) -> bool: return hasattr(t, "__supertype__") - def is_typeddict(t: Type[Any]) -> bool: return isinstance(t, _TypedDict) - def is_list_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Sequence) except TypeError: return False - def is_dict_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Mapping) diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 6e6ac62e39..01e1f9f5b9 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -2,7 +2,7 @@ from typing import Callable, Any, Type, get_type_hints, get_args from dlt.common.exceptions import DictValidationException -from dlt.common.typing import StrAny, extract_optional_type, is_literal_type, is_optional_type, is_typeddict, is_list_generic_type, is_dict_generic_type, _TypedDict +from dlt.common.typing import StrAny, is_literal_type, is_optional_type, extract_union_types, is_union_type, is_typeddict, is_list_generic_type, is_dict_generic_type, _TypedDict, is_union TFilterFunc = Callable[[str], bool] @@ -49,15 +49,26 @@ def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFil raise DictValidationException(f"In {path}: following fields are unexpected {unexpected}", path) def verify_prop(pk: str, pv: Any, t: Any) -> None: - if is_optional_type(t): - # pass if value actually is none - if pv is None: - return - t = extract_optional_type(t) - - # TODO: support for union types? - if pk == "schema_evolution_settings": + # covers none in optional and union types + if is_optional_type(t) and pv is None: pass + elif is_union_type(t): + # pass if value actually is none + union_types = extract_union_types(t, no_none=True) + # this is the case for optional fields + if len(union_types) == 1: + verify_prop(pk, pv, union_types[0]) + else: + has_passed = False + for ut in union_types: + try: + verify_prop(pk, pv, ut) + has_passed = True + except DictValidationException: + pass + if not has_passed: + type_names = [ut.__name__ for ut in union_types] + raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__}. 
One of these types expected: {', '.join(type_names)}.", path, pk, pv) elif is_literal_type(t): a_l = get_args(t) if pv not in a_l: diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index d053b7c79d..1134dd9cc3 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -14,7 +14,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaEvolutionSettings +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContractSettings from dlt.extract.utils import ensure_table_schema_columns_hint from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage @@ -52,7 +52,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, - schema_evolution_settings: TSchemaEvolutionSettings = None, + schema_contract_settings: TSchemaContractSettings = None, spec: Type[BaseConfiguration] = None ) -> Callable[TSourceFunParams, DltSource]: ... @@ -66,7 +66,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, - schema_evolution_settings: TSchemaEvolutionSettings = None, + schema_contract_settings: TSchemaContractSettings = None, spec: Type[BaseConfiguration] = None ) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, DltSource]]: ... @@ -79,7 +79,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, - schema_evolution_settings: TSchemaEvolutionSettings = None, + schema_contract_settings: TSchemaContractSettings = None, spec: Type[BaseConfiguration] = None ) -> Any: """A decorator that transforms a function returning one or more `dlt resources` into a `dlt source` in order to load it with `dlt`. 
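# ---------------------------------------------------------------------------
# Hedged sketch of the `schema_contract_settings` parameter these decorator
# signatures gain. Names and data are illustrative assumptions; per the tests
# in this series, a resource-level setting takes precedence over the
# source-level one.
import dlt

@dlt.resource(name="items", schema_contract_settings={"column": "evolve"})
def items():
    yield {"id": 1, "name": "first"}

@dlt.source(schema_contract_settings="freeze")
def demo_source():
    return items()
# ---------------------------------------------------------------------------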
@@ -170,7 +170,7 @@ def _wrap(*args: Any, **kwargs: Any) -> DltSource: # prepare schema schema = schema.clone(update_normalizers=True) - schema.set_schema_evolution_settings(schema_evolution_settings) + schema.set_schema_contract_settings(schema_contract_settings) # convert to source s = DltSource.from_data(name, source_section, schema.clone(update_normalizers=True), rv) @@ -208,7 +208,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, + schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> Callable[TResourceFunParams, DltResource]: @@ -224,7 +224,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, + schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: @@ -240,7 +240,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, + schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> DltResource: @@ -256,7 +256,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, + schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, depends_on: TUnboundDltResource = None, @@ -324,7 +324,7 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa columns=schema_columns, primary_key=primary_key, merge_key=merge_key, - schema_evolution_settings=schema_evolution_settings + schema_contract_settings=schema_contract_settings ) return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, depends_on), incremental=incremental) diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index 2731a7c228..e07fdfb40f 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -3,7 +3,7 @@ from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaEvolutionSettings +from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContractSettings from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -24,7 +24,7 @@ class TTableSchemaTemplate(TypedDict, total=False): primary_key: 
TTableHintTemplate[TColumnNames] merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] - schema_evolution_settings: TSchemaEvolutionSettings + schema_contract_settings: TSchemaContractSettings class DltResourceSchema: @@ -208,7 +208,7 @@ def new_table_template( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_evolution_settings: TTableHintTemplate[TSchemaEvolutionSettings] = None, + schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, ) -> TTableSchemaTemplate: if not table_name: raise TableNameMissing() @@ -218,7 +218,7 @@ def new_table_template( if not callable(columns): columns = columns.values() # type: ignore # create a table schema template where hints can be functions taking TDataItem - new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_evolution_settings=schema_evolution_settings) # type: ignore + new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract_settings=schema_contract_settings) # type: ignore if primary_key: new_template["primary_key"] = primary_key if merge_key: diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 9c3b33fbaa..ae3e2e658c 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -11,7 +11,7 @@ from dlt.common.runners import TRunMetrics, Runnable from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaEvolutionSettings +from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaContractSettings, TSchemaContractModes from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration @@ -26,7 +26,7 @@ # normalize worker wrapping function (map_parallel, map_single) return type TMapFuncRV = Tuple[Sequence[TSchemaUpdate], TRowCount] # normalize worker wrapping function signature -TMapFuncType = Callable[[Schema, str, Sequence[str], TSchemaEvolutionSettings], TMapFuncRV] # input parameters: (schema name, load_id, list of files to process) +TMapFuncType = Callable[[Schema, str, Sequence[str], TSchemaContractSettings], TMapFuncRV] # input parameters: (schema name, load_id, list of files to process) # tuple returned by the worker TWorkerRV = Tuple[List[TSchemaUpdate], int, List[str], TRowCount] @@ -34,7 +34,7 @@ class Normalize(Runnable[ProcessPool]): @with_config(spec=NormalizeConfiguration, sections=(known_sections.NORMALIZE,)) - def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: SchemaStorage = None, config: NormalizeConfiguration = config.value, schema_evolution_settings: TSchemaEvolutionSettings = None) -> None: + def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: SchemaStorage = None, config: NormalizeConfiguration = config.value, schema_contract_settings: TSchemaContractSettings = None) -> None: self.config = config self.collector = collector self.pool: ProcessPool = None @@ -42,7 +42,7 @@ def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: Schema self.load_storage: LoadStorage = 
None self.schema_storage: SchemaStorage = None self._row_counts: TRowCount = {} - self.schema_evolution_settings = schema_evolution_settings + self.schema_contract_settings = schema_contract_settings # setup storages self.create_storages() @@ -74,7 +74,7 @@ def w_normalize_files( stored_schema: TStoredSchema, load_id: str, extracted_items_files: Sequence[str], - schema_evolution_settings: TSchemaEvolutionSettings + schema_contract_settings: TSchemaContractSettings ) -> TWorkerRV: schema_updates: List[TSchemaUpdate] = [] total_items = 0 @@ -99,7 +99,7 @@ def w_normalize_files( items_count = 0 for line_no, line in enumerate(f): items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items, schema_evolution_settings) + partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items, schema_contract_settings) schema_updates.append(partial_update) total_items += items_count merge_row_count(row_counts, r_counts) @@ -128,15 +128,19 @@ def w_normalize_files( return schema_updates, total_items, load_storage.closed_files(), row_counts @staticmethod - def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem], schema_evolution_settings: TSchemaEvolutionSettings) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem], schema_contract_settings: TSchemaContractSettings) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas: Dict[str, TTableSchemaColumns] = {} # quick access to column schema for writers below schema_update: TSchemaUpdate = {} schema_name = schema.name items_count = 0 row_counts: TRowCount = {} + schema_contract_modes: TSchemaContractModes = None for item in items: for (table_name, parent_table), row in schema.normalize_data_item(item, load_id, root_table_name): + if not schema_contract_modes: + schema_contract_modes = schema.resolve_evolution_settings_for_table(parent_table, table_name, schema_contract_settings) + # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) # do not process empty rows @@ -149,7 +153,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, row, partial_table = schema.coerce_row(table_name, parent_table, row) # if we detect a migration, the check update if partial_table: - row, partial_table = schema.check_schema_update(parent_table, table_name, row, partial_table, schema_evolution_settings) + row, partial_table = schema.check_schema_update(schema_contract_modes, table_name, row, partial_table, schema_contract_settings) if not row: continue @@ -200,12 +204,12 @@ def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[st l_idx = idx + 1 return chunk_files - def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str], schema_evolution_settings: TSchemaEvolutionSettings) -> TMapFuncRV: + def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str], schema_contract_settings: TSchemaContractSettings) -> TMapFuncRV: workers = self.pool._processes # type: ignore chunk_files = self.group_worker_files(files, workers) schema_dict: TStoredSchema = schema.to_dict() config_tuple = (self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict) - param_chunk = [[*config_tuple, load_id, 
files, schema_evolution_settings] for files in chunk_files] + param_chunk = [[*config_tuple, load_id, files, schema_contract_settings] for files in chunk_files] tasks: List[Tuple[AsyncResult[TWorkerRV], List[Any]]] = [] row_counts: TRowCount = {} @@ -255,7 +259,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str], schem return schema_updates, row_counts - def map_single(self, schema: Schema, load_id: str, files: Sequence[str], schema_evolution_settings: TSchemaEvolutionSettings) -> TMapFuncRV: + def map_single(self, schema: Schema, load_id: str, files: Sequence[str], schema_contract_settings: TSchemaContractSettings) -> TMapFuncRV: result = Normalize.w_normalize_files( self.normalize_storage.config, self.load_storage.config, @@ -263,7 +267,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str], schema_ schema.to_dict(), load_id, files, - schema_evolution_settings + schema_contract_settings ) self.update_schema(schema, result[0]) self.collector.update("Files", len(result[2])) @@ -274,7 +278,7 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files schema = Normalize.load_or_create_schema(self.schema_storage, schema_name) # process files in parallel or in single thread, depending on map_f - schema_updates, row_counts = map_f(schema, load_id, files, self.schema_evolution_settings) + schema_updates, row_counts = map_f(schema, load_id, files, self.schema_contract_settings) # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) if len(schema_updates) > 0: logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index dea7ea1b57..3dd7bd8310 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -1,7 +1,7 @@ from typing import Sequence, cast, overload from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaEvolutionSettings +from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaContractSettings from dlt.common.typing import TSecretValue, Any from dlt.common.configuration import with_config @@ -177,7 +177,7 @@ def run( columns: Sequence[TColumnSchema] = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, - schema_evolution_settings: TSchemaEvolutionSettings = None, + schema_contract_settings: TSchemaContractSettings = None, ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. 
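# --- Illustrative sketch, not part of the patches above: how the renamed
# `schema_contract_settings` argument is wired through dlt.resource, dlt.source and
# Pipeline.run in this series. The resource/source/pipeline names below are
# assumptions; the keys ("table"/"column"/"data_type") and modes ("evolve",
# "discard-value", "discard-row", "freeze") follow tests/load/test_freeze_and_data_contract.py.
import dlt

@dlt.resource(name="items", write_disposition="append", schema_contract_settings={"column": "freeze"})
def items():
    for index in range(10):
        yield {"id": index, "name": f"item {index}"}

@dlt.source(name="demo_source", schema_contract_settings={"table": "evolve"})
def demo_source():
    # the source returns the resource, as the test fixtures do
    return items

pipeline = dlt.pipeline(pipeline_name="contract_demo", destination="duckdb", full_refresh=True)
pipeline.run(demo_source())
# a per-run override takes precedence over the source- and resource-level settings
pipeline.run(demo_source(), schema_contract_settings={"data_type": "discard-row"})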
diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 8cbd074ecc..567f286ba1 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -18,7 +18,7 @@ MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException) from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaEvolutionSettings +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContractSettings from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -294,7 +294,7 @@ def extract( @with_runtime_trace @with_schemas_sync @with_config_section((known_sections.NORMALIZE,)) - def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = None, schema_evolution_settings: TSchemaEvolutionSettings = None) -> NormalizeInfo: + def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = None, schema_contract_settings: TSchemaContractSettings = None) -> NormalizeInfo: """Normalizes the data prepared with `extract` method, infers the schema and creates load packages for the `load` method. Requires `destination` to be known.""" if is_interactive(): workers = 1 @@ -316,7 +316,7 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No # run with destination context with self._maybe_destination_capabilities(loader_file_format=loader_file_format): # shares schema storage with the pipeline so we do not need to install - normalize = Normalize(collector=self.collector, config=normalize_config, schema_storage=self._schema_storage, schema_evolution_settings=schema_evolution_settings) + normalize = Normalize(collector=self.collector, config=normalize_config, schema_storage=self._schema_storage, schema_contract_settings=schema_contract_settings) try: with signals.delayed_signals(): runner.run_pool(normalize.config, normalize) @@ -390,7 +390,7 @@ def run( primary_key: TColumnNames = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, - schema_evolution_settings: TSchemaEvolutionSettings = None + schema_contract_settings: TSchemaContractSettings = None ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. 
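# --- Illustrative sketch, not part of the patches above: applying the contract at the
# normalize step via the new `Pipeline.normalize(..., schema_contract_settings=...)`
# signature shown in this hunk. The `events` resource and pipeline name are
# assumptions; the first run establishes the schema before any "freeze" mode is
# applied, mirroring the order used in the tests.
import dlt

@dlt.resource(name="events", write_disposition="append")
def events():
    yield {"id": 1, "value": "ok"}

pipeline = dlt.pipeline(pipeline_name="normalize_demo", destination="duckdb", full_refresh=True)
pipeline.run(events())
# subsequent loads can be frozen: new tables, new columns or variant columns are
# rejected (the tests expect SchemaFrozenException wrapped in PipelineStepFailed)
pipeline.extract(events())
pipeline.normalize(schema_contract_settings={"table": "freeze", "column": "freeze", "data_type": "freeze"})
pipeline.load()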
@@ -457,7 +457,7 @@ def run( # normalize and load pending data if self.list_extracted_resources(): - self.normalize(loader_file_format=loader_file_format, schema_evolution_settings=schema_evolution_settings) + self.normalize(loader_file_format=loader_file_format, schema_contract_settings=schema_contract_settings) if self.list_normalized_load_packages(): # if there were any pending loads, load them and **exit** if data is not None: @@ -467,7 +467,7 @@ def run( # extract from the source if data is not None: self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema) - self.normalize(loader_file_format=loader_file_format, schema_evolution_settings=schema_evolution_settings) + self.normalize(loader_file_format=loader_file_format, schema_contract_settings=schema_contract_settings) return self.load(destination, dataset_name, credentials=credentials) else: return None @@ -878,7 +878,7 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para pipeline_schema.update_schema( pipeline_schema.normalize_table_identifiers(table) ) - pipeline_schema._settings["schema_evolution_settings"] = source_schema._settings.get("schema_evolution_settings") + pipeline_schema._settings["schema_contract_settings"] = source_schema._settings.get("schema_contract_settings") return extract_id diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml index 18cb63cf9c..a83dd82aa4 100644 --- a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -68,7 +68,7 @@ tables: data_type: text write_disposition: skip resource: _dlt_loads - schema_evolution_settings: null + schema_contract_settings: null description: Created by DLT. Tracks completed loads _dlt_version: columns: @@ -146,7 +146,7 @@ tables: name: schema write_disposition: skip description: Created by DLT. 
Tracks schema updates - schema_evolution_settings: null + schema_contract_settings: null name: _dlt_version resource: _dlt_version blocks: @@ -410,7 +410,7 @@ tables: merge_key: false data_type: text name: transactions_root - schema_evolution_settings: null + schema_contract_settings: null name: blocks resource: blocks blocks__transactions: @@ -680,7 +680,7 @@ tables: merge_key: false data_type: decimal name: eth_value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions @@ -805,7 +805,7 @@ tables: merge_key: false data_type: text name: transaction_hash - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs @@ -870,7 +870,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions @@ -935,7 +935,7 @@ tables: merge_key: false data_type: text name: address - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list @@ -1000,7 +1000,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks @@ -1065,7 +1065,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__uncles settings: default_hints: @@ -1087,7 +1087,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp - schema_evolution_settings: null + schema_contract_settings: null normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 0f6f28ffea..a0a2f4df9d 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -632,8 +632,8 @@ def test_group_tables_by_resource(schema: Schema) -> None: result = utils.group_tables_by_resource(schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("products"))) # both tables with resource "products" must be here assert result == {'products': [ - {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append', 'schema_evolution_settings': None}, - {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append', 'schema_evolution_settings': None}, + {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': None}, + {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': None}, {'columns': {}, 'name': 'mc_products__sub', 'parent': 'mc_products'} ] } diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index 278eb38973..0ee4e9268a 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -3,7 +3,7 @@ from dlt.common.configuration.specs.base_configuration import BaseConfiguration, get_config_if_union_hint from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults -from dlt.common.typing import StrAny, extract_inner_type, extract_optional_type, 
is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, is_typeddict +from dlt.common.typing import StrAny, extract_inner_type, extract_union_types, is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, is_typeddict, is_union_type @@ -15,6 +15,8 @@ class TTestTyDi(TypedDict): TOptionalLi = Optional[TTestLi] TOptionalTyDi = Optional[TTestTyDi] +TOptionalUnionLiTyDi = Optional[Union[TTestTyDi, TTestLi]] + def test_is_typeddict() -> None: assert is_typeddict(TTestTyDi) is True @@ -28,6 +30,7 @@ def test_is_list_generic_type() -> None: assert is_list_generic_type(List[str]) is True assert is_list_generic_type(Sequence[str]) is True assert is_list_generic_type(MutableSequence[str]) is True + assert is_list_generic_type(TOptionalUnionLiTyDi) is False def test_is_dict_generic_type() -> None: @@ -46,8 +49,19 @@ def test_optional() -> None: assert is_optional_type(TOptionalLi) is True assert is_optional_type(TOptionalTyDi) is True assert is_optional_type(TTestTyDi) is False - assert extract_optional_type(TOptionalLi) is TTestLi - assert extract_optional_type(TOptionalTyDi) is TTestTyDi + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] + + +def test_union_types() -> None: + assert is_optional_type(TOptionalLi) is True + assert is_optional_type(TOptionalTyDi) is True + assert is_optional_type(TTestTyDi) is False + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] + assert is_optional_type(TOptionalUnionLiTyDi) is True + assert extract_union_types(TOptionalUnionLiTyDi) == [TTestTyDi, TTestLi, type(None)] + assert is_union_type(MutableSequence[str]) is False def test_is_newtype() -> None: diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index d278c1029c..2de6b6fc15 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -1,7 +1,7 @@ from copy import deepcopy import pytest import yaml -from typing import Dict, List, Literal, Mapping, Sequence, TypedDict, Optional +from typing import Dict, List, Literal, Mapping, Sequence, TypedDict, Optional, Union from dlt.common import json from dlt.common.exceptions import DictValidationException @@ -10,8 +10,12 @@ from dlt.common.typing import DictStrStr, StrStr from dlt.common.validation import validate_dict, validate_dict_ignoring_xkeys + + TLiteral = Literal["uno", "dos", "tres"] +class TDict(TypedDict): + field: TLiteral class TTestRecord(TypedDict): f_bool: bool @@ -31,6 +35,7 @@ class TTestRecord(TypedDict): f_literal: TLiteral f_literal_optional: Optional[TLiteral] f_seq_literal: Sequence[Optional[TLiteral]] + f_optional_union: Optional[Union[TLiteral, TDict]] TEST_COL = { @@ -74,7 +79,8 @@ class TTestRecord(TypedDict): "f_column": deepcopy(TEST_COL), "f_literal": "uno", "f_literal_optional": "dos", - "f_seq_literal": ["uno", "dos", "tres"] + "f_seq_literal": ["uno", "dos", "tres"], + "f_optional_union": {"field": "uno"} } @pytest.fixture @@ -227,3 +233,23 @@ def test_filter(test_doc: TTestRecord) -> None: test_doc["x-extra"] = "x-annotation" # remove x-extra with a filter validate_dict(TTestRecord, test_doc, ".", filter_f=lambda k: k != "x-extra") + + +def test_nested_union(test_doc: TTestRecord) -> None: + test_doc["f_optional_union"] = {"field": "uno"} + validate_dict(TTestRecord, TEST_DOC, ".") + + test_doc["f_optional_union"] = {"field": "not 
valid"} + with pytest.raises(DictValidationException) as e: + validate_dict(TTestRecord, test_doc, ".") + assert e.value.field == "f_optional_union" + assert e.value.value == {'field': 'not valid'} + + test_doc["f_optional_union"] = "dos" + validate_dict(TTestRecord, test_doc, ".") + + test_doc["f_optional_union"] = "blah" + with pytest.raises(DictValidationException) as e: + validate_dict(TTestRecord, test_doc, ".") + assert e.value.field == "f_optional_union" + assert e.value.value == "blah" \ No newline at end of file diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index 18cb63cf9c..a83dd82aa4 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -68,7 +68,7 @@ tables: data_type: text write_disposition: skip resource: _dlt_loads - schema_evolution_settings: null + schema_contract_settings: null description: Created by DLT. Tracks completed loads _dlt_version: columns: @@ -146,7 +146,7 @@ tables: name: schema write_disposition: skip description: Created by DLT. Tracks schema updates - schema_evolution_settings: null + schema_contract_settings: null name: _dlt_version resource: _dlt_version blocks: @@ -410,7 +410,7 @@ tables: merge_key: false data_type: text name: transactions_root - schema_evolution_settings: null + schema_contract_settings: null name: blocks resource: blocks blocks__transactions: @@ -680,7 +680,7 @@ tables: merge_key: false data_type: decimal name: eth_value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions @@ -805,7 +805,7 @@ tables: merge_key: false data_type: text name: transaction_hash - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs @@ -870,7 +870,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions @@ -935,7 +935,7 @@ tables: merge_key: false data_type: text name: address - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list @@ -1000,7 +1000,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks @@ -1065,7 +1065,7 @@ tables: merge_key: false data_type: text name: value - schema_evolution_settings: null + schema_contract_settings: null name: blocks__uncles settings: default_hints: @@ -1087,7 +1087,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp - schema_evolution_settings: null + schema_contract_settings: null normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 3d953ca885..5ef936bd66 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -1,5 +1,5 @@ import dlt, os, pytest -from dlt.common.schema.typing import TSchemaEvolutionSettings +from dlt.common.schema.typing import TSchemaContractSettings from dlt.common.utils import uniq_id 
from typing import Any from dlt.extract.source import DltSource, DltResource @@ -14,13 +14,13 @@ skip_if_not_active("duckdb") -SCHEMA_EVOLUTION_SETTINGS = ["evolve", "freeze-and-trim", "freeze-and-discard", "freeze-and-raise"] +schema_contract_settings = ["evolve", "discard-value", "discard-row", "freeze"] LOCATIONS = ["source", "resource", "override"] -SCHEMA_ELEMENTS = ["table", "column", "column_variant"] +SCHEMA_ELEMENTS = ["table", "column", "data_type"] -def items(settings: TSchemaEvolutionSettings) -> Any: +def items(settings: TSchemaContractSettings) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -31,9 +31,9 @@ def load_items(): return load_items -def items_with_variant(settings: TSchemaEvolutionSettings) -> Any: +def items_with_variant(settings: TSchemaContractSettings) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -44,9 +44,9 @@ def load_items(): return load_items -def items_with_new_column(settings: TSchemaEvolutionSettings) -> Any: +def items_with_new_column(settings: TSchemaContractSettings) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -58,9 +58,9 @@ def load_items(): return load_items -def items_with_subtable(settings: TSchemaEvolutionSettings) -> Any: +def items_with_subtable(settings: TSchemaContractSettings) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_evolution_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -74,9 +74,9 @@ def load_items(): return load_items -def new_items(settings: TSchemaEvolutionSettings) -> Any: +def new_items(settings: TSchemaContractSettings) -> Any: - @dlt.resource(name="new_items", write_disposition="append", schema_evolution_settings=settings) + @dlt.resource(name="new_items", write_disposition="append", schema_contract_settings=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -99,31 +99,31 @@ def run_resource(pipeline, resource_fun, settings) -> DltSource: for item in settings.keys(): assert item in LOCATIONS ev_settings = settings[item] - if ev_settings in SCHEMA_EVOLUTION_SETTINGS: + if ev_settings in schema_contract_settings: continue for key, val in ev_settings.items(): - assert val in SCHEMA_EVOLUTION_SETTINGS + assert val in schema_contract_settings assert key in SCHEMA_ELEMENTS - @dlt.source(name="freeze_tests", schema_evolution_settings=settings.get("source")) + @dlt.source(name="freeze_tests", schema_contract_settings=settings.get("source")) def source() -> DltResource: return resource_fun(settings.get("resource")) # run pipeline - pipeline.run(source(), schema_evolution_settings=settings.get("override")) + pipeline.run(source(), schema_contract_settings=settings.get("override")) # check updated schema - assert pipeline.default_schema._settings["schema_evolution_settings"] == 
settings.get("source") + assert pipeline.default_schema._settings["schema_contract_settings"] == settings.get("source") # check items table settings - assert pipeline.default_schema.tables["items"]["schema_evolution_settings"] == settings.get("resource") + assert pipeline.default_schema.tables["items"]["schema_contract_settings"] == settings.get("resource") def get_pipeline(): import duckdb return dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) -@pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) +@pytest.mark.parametrize("evolution_setting", schema_contract_settings) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> None: @@ -133,7 +133,7 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non setting_location: { "table": evolution_setting }} - run_resource(pipeline, items, full_settings) + run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -149,7 +149,7 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # test adding new subtable - if evolution_setting == "freeze-and-raise": + if evolution_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_subtable, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) @@ -157,11 +157,11 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non run_resource(pipeline, items_with_subtable, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 30 if evolution_setting in ["freeze-and-raise"] else 40 + assert table_counts["items"] == 30 if evolution_setting in ["freeze"] else 40 assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if evolution_setting in ["evolve"] else 0) # test adding new table - if evolution_setting == "freeze-and-raise": + if evolution_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, new_items, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) @@ -171,7 +171,7 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non assert table_counts.get("new_items", 0) == (10 if evolution_setting in ["evolve"] else 0) -@pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) +@pytest.mark.parametrize("evolution_setting", schema_contract_settings) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> None: @@ -181,7 +181,7 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No }} pipeline = get_pipeline() - run_resource(pipeline, items, full_settings) + run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -199,7 +199,7 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No assert 
table_counts[NEW_ITEMS_TABLE] == 10 # test adding new column - if evolution_setting == "freeze-and-raise": + if evolution_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_new_column, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) @@ -211,10 +211,10 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No else: assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (30 if evolution_setting in ["evolve", "freeze-and-trim"] else 20) + assert table_counts["items"] == (30 if evolution_setting in ["evolve", "discard-value"] else 20) # test adding variant column - if evolution_setting == "freeze-and-raise": + if evolution_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_variant, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) @@ -226,19 +226,19 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No else: assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if evolution_setting in ["evolve", "freeze-and-trim"] else 20) + assert table_counts["items"] == (40 if evolution_setting in ["evolve", "discard-value"] else 20) -@pytest.mark.parametrize("evolution_setting", SCHEMA_EVOLUTION_SETTINGS) +@pytest.mark.parametrize("evolution_setting", schema_contract_settings) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: full_settings = { setting_location: { - "column_variant": evolution_setting + "data_type": evolution_setting }} pipeline = get_pipeline() - run_resource(pipeline, items, full_settings) + run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -262,7 +262,7 @@ def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # test adding variant column - if evolution_setting == "freeze-and-raise": + if evolution_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_variant, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) @@ -274,7 +274,7 @@ def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: else: assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if evolution_setting in ["evolve", "freeze-and-trim"] else 30) + assert table_counts["items"] == (40 if evolution_setting in ["evolve", "discard-value"] else 30) def test_settings_precedence() -> None: @@ -285,12 +285,12 @@ def test_settings_precedence() -> None: # trying to add new column when forbidden on resource will fail run_resource(pipeline, items_with_new_column, {"resource": { - "column": "freeze-and-discard" + "column": 
"discard-row" }}) # when allowed on override it will work run_resource(pipeline, items_with_new_column, { - "resource": {"column": "freeze-and-raise"}, + "resource": {"column": "freeze"}, "override": {"column": "evolve"} }) @@ -300,31 +300,31 @@ def test_settings_precedence_2() -> None: # load some data run_resource(pipeline, items, {"source": { - "column_variant": "freeze-and-discard" + "data_type": "discard-row" }}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # trying to add variant when forbidden on source will fail run_resource(pipeline, items_with_variant, {"source": { - "column_variant": "freeze-and-discard" + "data_type": "discard-row" }}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # if allowed on resource it will pass run_resource(pipeline, items_with_variant, { - "resource": {"column_variant": "evolve"}, - "source": {"column_variant": "freeze-and-discard"} + "resource": {"data_type": "evolve"}, + "source": {"data_type": "discard-row"} }) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 # if allowed on override it will also pass run_resource(pipeline, items_with_variant, { - "resource": {"column_variant": "freeze-and-discard"}, - "source": {"column_variant": "freeze-and-discard"}, - "override": {"column_variant": "evolve"}, + "resource": {"data_type": "discard-row"}, + "source": {"data_type": "discard-row"}, + "override": {"data_type": "evolve"}, }) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 @@ -340,14 +340,14 @@ def test_change_mode(setting_location: str) -> None: # trying to add variant when forbidden will fail run_resource(pipeline, items_with_variant, {setting_location: { - "column_variant": "freeze-and-discard" + "data_type": "discard-row" }}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # now allow run_resource(pipeline, items_with_variant, {setting_location: { - "column_variant": "evolve" + "data_type": "evolve" }}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 @@ -361,17 +361,17 @@ def test_single_settings_value(setting_location: str) -> None: assert table_counts["items"] == 10 # trying to add variant when forbidden will fail - run_resource(pipeline, items_with_variant, {setting_location: "freeze-and-discard"}) + run_resource(pipeline, items_with_variant, {setting_location: "discard-row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # trying to add new column will fail - run_resource(pipeline, items_with_new_column, {setting_location: "freeze-and-discard"}) + run_resource(pipeline, items_with_new_column, {setting_location: "discard-row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # trying to add new table will fail - run_resource(pipeline, new_items, {setting_location: "freeze-and-discard"}) + run_resource(pipeline, new_items, {setting_location: "discard-row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in 
pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert "new_items" not in table_counts @@ -417,24 +417,24 @@ def get_items_subtable(): # disallow variants with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([get_items_variant()], schema_evolution_settings={"column_variant": "freeze-and-raise"}) + pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "freeze"}) assert isinstance(py_ex.value.__context__, SchemaFrozenException) # without settings it will pass - pipeline.run([get_items_variant()], schema_evolution_settings={"column_variant": "evolve"}) + pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "evolve"}) # disallow new col with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([get_items_new_col()], schema_evolution_settings={"column": "freeze-and-raise"}) + pipeline.run([get_items_new_col()], schema_contract_settings={"column": "freeze"}) assert isinstance(py_ex.value.__context__, SchemaFrozenException) # without settings it will pass - pipeline.run([get_items_new_col()], schema_evolution_settings={"column": "evolve"}) + pipeline.run([get_items_new_col()], schema_contract_settings={"column": "evolve"}) # disallow new tables with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([get_items_subtable()], schema_evolution_settings={"table": "freeze-and-raise"}) + pipeline.run([get_items_subtable()], schema_contract_settings={"table": "freeze"}) assert isinstance(py_ex.value.__context__, SchemaFrozenException) # without settings it will pass - pipeline.run([get_items_subtable()], schema_evolution_settings={"table": "evolve"}) \ No newline at end of file + pipeline.run([get_items_subtable()], schema_contract_settings={"table": "evolve"}) \ No newline at end of file From e707580597c37c525bf833eec12950473ad8d591 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 13 Sep 2023 13:28:14 +0200 Subject: [PATCH 22/73] update schema management --- dlt/common/schema/schema.py | 12 ++--- dlt/common/typing.py | 2 +- dlt/extract/decorators.py | 2 +- dlt/extract/source.py | 10 ++++- dlt/normalize/normalize.py | 29 ++++++------ dlt/pipeline/pipeline.py | 6 ++- tests/load/test_freeze_and_data_contract.py | 50 ++++++++++----------- 7 files changed, 61 insertions(+), 50 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 3d1603b13a..8a225eb875 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -194,7 +194,7 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def resolve_evolution_settings_for_table(self, parent_table: str, table_name: str, schema_contract_settings_override: TSchemaContractSettings) -> TSchemaContractModes: + def resolve_contract_settings_for_table(self, parent_table: str, table_name: str) -> TSchemaContractModes: def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: settings = settings or {} @@ -208,15 +208,14 @@ def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: # modes table_contract_modes = resolve_single(self.tables.get(table_with_settings, {}).get("schema_contract_settings", {})) schema_contract_modes = resolve_single(self._settings.get("schema_contract_settings", {})) - overide_modes = resolve_single(schema_contract_settings_override) # resolve to correct settings dict - settings = cast(TSchemaContractModes, {**DEFAULT_SCHEMA_CONTRACT_MODE, **schema_contract_modes, **table_contract_modes, 
**overide_modes}) + settings = cast(TSchemaContractModes, {**DEFAULT_SCHEMA_CONTRACT_MODE, **schema_contract_modes, **table_contract_modes}) return settings - def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, schema_contract_settings_override: TSchemaContractSettings) -> Tuple[DictStrAny, TPartialTableSchema]: + def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" assert partial_table @@ -450,8 +449,11 @@ def update_normalizers(self) -> None: normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) - def set_schema_contract_settings(self, settings: TSchemaContractSettings) -> None: + def set_schema_contract_settings(self, settings: TSchemaContractSettings, update_table_settings: bool = False) -> None: self._settings["schema_contract_settings"] = settings + if update_table_settings: + for table in self.tables.values(): + table["schema_contract_settings"] = settings def _infer_column(self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False) -> TColumnSchema: column_schema = TColumnSchema( diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 2f1cecc093..607ee15d68 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -77,7 +77,7 @@ def is_final_type(t: Type[Any]) -> bool: def extract_union_types(t: Type[Any], no_none: bool = False) -> List[Any]: if no_none: - return [arg for arg in get_args(t) if arg is not type(None)] + return [arg for arg in get_args(t) if arg is not type(None)] # noqa: E721 return list(get_args(t)) def is_literal_type(hint: Type[Any]) -> bool: diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 1134dd9cc3..653d57d13f 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -170,13 +170,13 @@ def _wrap(*args: Any, **kwargs: Any) -> DltSource: # prepare schema schema = schema.clone(update_normalizers=True) - schema.set_schema_contract_settings(schema_contract_settings) # convert to source s = DltSource.from_data(name, source_section, schema.clone(update_normalizers=True), rv) # apply hints if max_table_nesting is not None: s.max_table_nesting = max_table_nesting + s.schema_contract_settings = schema_contract_settings # enable root propagation s.root_key = root_key return s diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 52a0381dfe..da7119d3f4 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -11,7 +11,7 @@ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer, RelationalNormalizerConfigPropagation from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnName +from dlt.common.schema.typing import TColumnName, TSchemaContractSettings from dlt.common.typing import AnyFun, StrAny, TDataItem, TDataItems, NoneType from dlt.common.configuration.container import Container from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, resource_state, source_state, pipeline_state @@ -605,6 +605,14 @@ def max_table_nesting(self) -> int: def max_table_nesting(self, value: int) -> None: 
RelationalNormalizer.update_normalizer_config(self._schema, {"max_nesting": value}) + @property + def schema_contract_settings(self) -> TSchemaContractSettings: + return self.schema.settings["schema_contract_settings"] + + @schema_contract_settings.setter + def schema_contract_settings(self, settings: TSchemaContractSettings) -> None: + self.schema.set_schema_contract_settings(settings) + @property def exhausted(self) -> bool: """check all selected pipes wether one of them has started. if so, the source is exhausted.""" diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index ae3e2e658c..73a192e6e1 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -11,7 +11,7 @@ from dlt.common.runners import TRunMetrics, Runnable from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaContractSettings, TSchemaContractModes +from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaContractModes from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration @@ -26,7 +26,7 @@ # normalize worker wrapping function (map_parallel, map_single) return type TMapFuncRV = Tuple[Sequence[TSchemaUpdate], TRowCount] # normalize worker wrapping function signature -TMapFuncType = Callable[[Schema, str, Sequence[str], TSchemaContractSettings], TMapFuncRV] # input parameters: (schema name, load_id, list of files to process) +TMapFuncType = Callable[[Schema, str, Sequence[str]], TMapFuncRV] # input parameters: (schema name, load_id, list of files to process) # tuple returned by the worker TWorkerRV = Tuple[List[TSchemaUpdate], int, List[str], TRowCount] @@ -34,7 +34,7 @@ class Normalize(Runnable[ProcessPool]): @with_config(spec=NormalizeConfiguration, sections=(known_sections.NORMALIZE,)) - def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: SchemaStorage = None, config: NormalizeConfiguration = config.value, schema_contract_settings: TSchemaContractSettings = None) -> None: + def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: SchemaStorage = None, config: NormalizeConfiguration = config.value) -> None: self.config = config self.collector = collector self.pool: ProcessPool = None @@ -42,7 +42,6 @@ def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: Schema self.load_storage: LoadStorage = None self.schema_storage: SchemaStorage = None self._row_counts: TRowCount = {} - self.schema_contract_settings = schema_contract_settings # setup storages self.create_storages() @@ -73,8 +72,7 @@ def w_normalize_files( destination_caps: DestinationCapabilitiesContext, stored_schema: TStoredSchema, load_id: str, - extracted_items_files: Sequence[str], - schema_contract_settings: TSchemaContractSettings + extracted_items_files: Sequence[str] ) -> TWorkerRV: schema_updates: List[TSchemaUpdate] = [] total_items = 0 @@ -99,7 +97,7 @@ def w_normalize_files( items_count = 0 for line_no, line in enumerate(f): items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, root_table_name, items, schema_contract_settings) + partial_update, items_count, r_counts = Normalize._w_normalize_chunk(load_storage, schema, load_id, 
root_table_name, items) schema_updates.append(partial_update) total_items += items_count merge_row_count(row_counts, r_counts) @@ -128,7 +126,7 @@ def w_normalize_files( return schema_updates, total_items, load_storage.closed_files(), row_counts @staticmethod - def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem], schema_contract_settings: TSchemaContractSettings) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas: Dict[str, TTableSchemaColumns] = {} # quick access to column schema for writers below schema_update: TSchemaUpdate = {} schema_name = schema.name @@ -139,7 +137,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, for item in items: for (table_name, parent_table), row in schema.normalize_data_item(item, load_id, root_table_name): if not schema_contract_modes: - schema_contract_modes = schema.resolve_evolution_settings_for_table(parent_table, table_name, schema_contract_settings) + schema_contract_modes = schema.resolve_contract_settings_for_table(parent_table, table_name) # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) @@ -153,7 +151,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, row, partial_table = schema.coerce_row(table_name, parent_table, row) # if we detect a migration, the check update if partial_table: - row, partial_table = schema.check_schema_update(schema_contract_modes, table_name, row, partial_table, schema_contract_settings) + row, partial_table = schema.check_schema_update(schema_contract_modes, table_name, row, partial_table) if not row: continue @@ -204,12 +202,12 @@ def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[st l_idx = idx + 1 return chunk_files - def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str], schema_contract_settings: TSchemaContractSettings) -> TMapFuncRV: + def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV: workers = self.pool._processes # type: ignore chunk_files = self.group_worker_files(files, workers) schema_dict: TStoredSchema = schema.to_dict() config_tuple = (self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema_dict) - param_chunk = [[*config_tuple, load_id, files, schema_contract_settings] for files in chunk_files] + param_chunk = [[*config_tuple, load_id, files] for files in chunk_files] tasks: List[Tuple[AsyncResult[TWorkerRV], List[Any]]] = [] row_counts: TRowCount = {} @@ -259,15 +257,14 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str], schem return schema_updates, row_counts - def map_single(self, schema: Schema, load_id: str, files: Sequence[str], schema_contract_settings: TSchemaContractSettings) -> TMapFuncRV: + def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV: result = Normalize.w_normalize_files( self.normalize_storage.config, self.load_storage.config, self.config.destination_capabilities, schema.to_dict(), load_id, - files, - schema_contract_settings + files ) self.update_schema(schema, result[0]) self.collector.update("Files", len(result[2])) @@ -278,7 +275,7 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files schema = 
Normalize.load_or_create_schema(self.schema_storage, schema_name) # process files in parallel or in single thread, depending on map_f - schema_updates, row_counts = map_f(schema, load_id, files, self.schema_contract_settings) + schema_updates, row_counts = map_f(schema, load_id, files) # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) if len(schema_updates) > 0: logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 567f286ba1..1789a02e0f 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -304,6 +304,10 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No if not self.default_schema_name: return None + # update global schema contract settings, could be moved into def normalize() + if schema_contract_settings is not None: + self.default_schema.set_schema_contract_settings(schema_contract_settings, True) + # make sure destination capabilities are available self._get_destination_capabilities() # create default normalize config @@ -316,7 +320,7 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No # run with destination context with self._maybe_destination_capabilities(loader_file_format=loader_file_format): # shares schema storage with the pipeline so we do not need to install - normalize = Normalize(collector=self.collector, config=normalize_config, schema_storage=self._schema_storage, schema_contract_settings=schema_contract_settings) + normalize = Normalize(collector=self.collector, config=normalize_config, schema_storage=self._schema_storage) try: with signals.delayed_signals(): runner.run_pool(normalize.config, normalize) diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 5ef936bd66..2460679737 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -113,25 +113,25 @@ def source() -> DltResource: pipeline.run(source(), schema_contract_settings=settings.get("override")) # check updated schema - assert pipeline.default_schema._settings["schema_contract_settings"] == settings.get("source") + assert pipeline.default_schema._settings["schema_contract_settings"] == (settings.get("override") or settings.get("source")) # check items table settings - assert pipeline.default_schema.tables["items"]["schema_contract_settings"] == settings.get("resource") + assert pipeline.default_schema.tables["items"]["schema_contract_settings"] == (settings.get("override") or settings.get("resource")) def get_pipeline(): import duckdb return dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) -@pytest.mark.parametrize("evolution_setting", schema_contract_settings) +@pytest.mark.parametrize("contract_setting", schema_contract_settings) @pytest.mark.parametrize("setting_location", LOCATIONS) -def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> None: +def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None: pipeline = get_pipeline() full_settings = { setting_location: { - "table": evolution_setting + "table": contract_setting }} run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) @@ -149,7 +149,7 @@ def test_freeze_new_tables(evolution_setting: str, 
setting_location: str) -> Non assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # test adding new subtable - if evolution_setting == "freeze": + if contract_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_subtable, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) @@ -157,27 +157,27 @@ def test_freeze_new_tables(evolution_setting: str, setting_location: str) -> Non run_resource(pipeline, items_with_subtable, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 30 if evolution_setting in ["freeze"] else 40 - assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if evolution_setting in ["evolve"] else 0) + assert table_counts["items"] == 30 if contract_setting in ["freeze"] else 40 + assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if contract_setting in ["evolve"] else 0) # test adding new table - if evolution_setting == "freeze": + if contract_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, new_items, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: run_resource(pipeline, new_items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts.get("new_items", 0) == (10 if evolution_setting in ["evolve"] else 0) + assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) -@pytest.mark.parametrize("evolution_setting", schema_contract_settings) +@pytest.mark.parametrize("contract_setting", schema_contract_settings) @pytest.mark.parametrize("setting_location", LOCATIONS) -def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> None: +def test_freeze_new_columns(contract_setting: str, setting_location: str) -> None: full_settings = { setting_location: { - "column": evolution_setting + "column": contract_setting }} pipeline = get_pipeline() @@ -199,43 +199,43 @@ def test_freeze_new_columns(evolution_setting: str, setting_location: str) -> No assert table_counts[NEW_ITEMS_TABLE] == 10 # test adding new column - if evolution_setting == "freeze": + if contract_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_new_column, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: run_resource(pipeline, items_with_new_column, full_settings) - if evolution_setting == "evolve": + if contract_setting == "evolve": assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] else: assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (30 if evolution_setting in ["evolve", "discard-value"] else 20) + assert table_counts["items"] == (30 if contract_setting in ["evolve", "discard-value"] else 20) # test adding variant column - if evolution_setting == "freeze": + if contract_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_variant, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: run_resource(pipeline, items_with_variant, full_settings) - if evolution_setting == "evolve": + if contract_setting == "evolve": assert VARIANT_COLUMN_NAME in 
pipeline.default_schema.tables["items"]["columns"] else: assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if evolution_setting in ["evolve", "discard-value"] else 20) + assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard-value"] else 20) -@pytest.mark.parametrize("evolution_setting", schema_contract_settings) +@pytest.mark.parametrize("contract_setting", schema_contract_settings) @pytest.mark.parametrize("setting_location", LOCATIONS) -def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: +def test_freeze_variants(contract_setting: str, setting_location: str) -> None: full_settings = { setting_location: { - "data_type": evolution_setting + "data_type": contract_setting }} pipeline = get_pipeline() run_resource(pipeline, items, {}) @@ -262,19 +262,19 @@ def test_freeze_variants(evolution_setting: str, setting_location: str) -> None: assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # test adding variant column - if evolution_setting == "freeze": + if contract_setting == "freeze": with pytest.raises(PipelineStepFailed) as py_ex: run_resource(pipeline, items_with_variant, full_settings) assert isinstance(py_ex.value.__context__, SchemaFrozenException) else: run_resource(pipeline, items_with_variant, full_settings) - if evolution_setting == "evolve": + if contract_setting == "evolve": assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] else: assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if evolution_setting in ["evolve", "discard-value"] else 30) + assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard-value"] else 30) def test_settings_precedence() -> None: From b3dc41dea3ee8762d2067368666e71b1ccb91d42 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 13 Sep 2023 14:07:45 +0200 Subject: [PATCH 23/73] fix schema related tests --- dlt/common/schema/schema.py | 4 +- dlt/common/schema/utils.py | 7 +- dlt/pipeline/pipeline.py | 2 +- .../cases/schemas/eth/ethereum_schema_v7.yml | 669 +----------------- tests/common/schema/test_schema.py | 4 +- tests/common/schema/test_versioning.py | 2 +- tests/common/storages/test_schema_storage.py | 2 +- tests/common/utils.py | 2 +- .../cases/eth_source/ethereum.schema.yaml | 669 +----------------- tests/extract/test_decorators.py | 1 + tests/load/test_freeze_and_data_contract.py | 4 +- tests/load/weaviate/test_naming.py | 4 +- 12 files changed, 42 insertions(+), 1328 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 8a225eb875..95dd51fd76 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -68,6 +68,7 @@ def __init__(self, name: str, normalizers: TNormalizersConfig = None) -> None: @classmethod def from_dict(cls, d: DictStrAny) -> "Schema": + # upgrade engine if needed stored_schema = utils.migrate_schema(d, d["engine_version"], cls.ENGINE_VERSION) # verify schema @@ -453,7 +454,8 @@ def set_schema_contract_settings(self, settings: TSchemaContractSettings, update self._settings["schema_contract_settings"] = settings if update_table_settings: for table in self.tables.values(): - table["schema_contract_settings"] = 
settings + if not table.get("parent"): + table["schema_contract_settings"] = settings def _infer_column(self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False) -> TColumnSchema: column_schema = TColumnSchema( diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 31cea1480d..16230fc016 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -343,10 +343,11 @@ def migrate_filters(group: str, filters: List[str]) -> None: if from_engine == 6 and to_engine > 6: # migrate from sealed properties to schema evolution settings schema_dict["settings"].pop("schema_sealed", None) - schema_dict["settings"]["schema_contract_settings"] = None + schema_dict["settings"]["schema_contract_settings"] = {} for table in schema_dict["tables"].values(): table.pop("table_sealed", None) - table["schema_contract_settings"] = None + if not table.get("parent"): + table["schema_contract_settings"] = {} from_engine = 7 schema_dict["engine_version"] = from_engine @@ -660,7 +661,7 @@ def new_table( # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name - table["schema_contract_settings"] = schema_contract_settings + table["schema_contract_settings"] = schema_contract_settings or {} if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 1789a02e0f..33787daa17 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -882,7 +882,7 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para pipeline_schema.update_schema( pipeline_schema.normalize_table_identifiers(table) ) - pipeline_schema._settings["schema_contract_settings"] = source_schema._settings.get("schema_contract_settings") + pipeline_schema.set_schema_contract_settings(source_schema._settings.get("schema_contract_settings", {})) return extract_id diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml index a83dd82aa4..c9afb6be76 100644 --- a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -1,152 +1,64 @@ version: 14 -version_hash: 0Z93TpKZOdz5UpHrSDMKdUMUpQld9aP/9ZFYPRAWOI4= +version_hash: VuzNqiLOk7XuPxYLndMFMPHTDVItKU5ayiy70nQLdus= engine_version: 7 name: ethereum tables: _dlt_loads: - name: _dlt_loads columns: load_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: load_id data_type: text + name: load_id schema_name: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: schema_name data_type: text + name: schema_name status: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: status data_type: bigint + name: status inserted_at: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: inserted_at data_type: timestamp + name: inserted_at schema_version_hash: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - 
root_key: false - merge_key: false - name: schema_version_hash data_type: text + name: schema_version_hash write_disposition: skip - resource: _dlt_loads - schema_contract_settings: null description: Created by DLT. Tracks completed loads + schema_contract_settings: {} + name: _dlt_loads + resource: _dlt_loads _dlt_version: columns: version: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: version engine_version: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: engine_version inserted_at: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: inserted_at schema_name: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: schema_name version_hash: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: version_hash schema: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: schema write_disposition: skip description: Created by DLT. Tracks schema updates - schema_contract_settings: null + schema_contract_settings: {} name: _dlt_version resource: _dlt_version blocks: @@ -159,258 +71,96 @@ tables: columns: _dlt_load_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false description: load id coming from the extractor data_type: text name: _dlt_load_id _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id number: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: number parent_hash: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: parent_hash hash: nullable: false - partition: false cluster: true unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: hash base_fee_per_gas: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: base_fee_per_gas difficulty: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: difficulty extra_data: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: extra_data gas_limit: nullable: false - partition: false - cluster: false - 
unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas_limit gas_used: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas_used logs_bloom: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: binary name: logs_bloom miner: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: miner mix_hash: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: mix_hash nonce: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: nonce receipts_root: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: receipts_root sha3_uncles: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: sha3_uncles size: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: size state_root: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: state_root timestamp: nullable: false - partition: false - cluster: false unique: true sort: true - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: timestamp total_difficulty: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: total_difficulty transactions_root: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: transactions_root - schema_contract_settings: null + schema_contract_settings: {} name: blocks resource: blocks blocks__transactions: @@ -418,654 +168,259 @@ tables: columns: _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id block_number: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true foreign_key: true - root_key: false - merge_key: false data_type: bigint name: block_number transaction_index: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: transaction_index hash: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: 
text name: hash block_hash: nullable: false - partition: false cluster: true - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: block_hash block_timestamp: nullable: false - partition: false - cluster: false - unique: false sort: true - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: block_timestamp chain_id: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: chain_id from: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: from gas: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas gas_price: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas_price input: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: input max_fee_per_gas: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: max_fee_per_gas max_priority_fee_per_gas: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: max_priority_fee_per_gas nonce: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: nonce r: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: r s: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: s status: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: status to: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: to type: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: type v: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: v value: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: value eth_value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: 
false - merge_key: false data_type: decimal name: eth_value - schema_contract_settings: null name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions columns: _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id address: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: address block_timestamp: nullable: false - partition: false - cluster: false - unique: false sort: true - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: block_timestamp block_hash: nullable: false - partition: false cluster: true - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: block_hash block_number: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true foreign_key: true - root_key: false - merge_key: false data_type: bigint name: block_number transaction_index: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true foreign_key: true - root_key: false - merge_key: false data_type: bigint name: transaction_index log_index: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: log_index data: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: data removed: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bool name: removed transaction_hash: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: transaction_hash - schema_contract_settings: null name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: value - schema_contract_settings: null name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions 
columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id address: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: address - schema_contract_settings: null name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: value - schema_contract_settings: null name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: value - schema_contract_settings: null name: blocks__uncles settings: default_hints: @@ -1087,7 +442,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: 
timestamp - schema_contract_settings: null + schema_contract_settings: {} normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index a0a2f4df9d..695b65de71 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -632,8 +632,8 @@ def test_group_tables_by_resource(schema: Schema) -> None: result = utils.group_tables_by_resource(schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("products"))) # both tables with resource "products" must be here assert result == {'products': [ - {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': None}, - {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': None}, + {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': {}}, + {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': {}}, {'columns': {}, 'name': 'mc_products__sub', 'parent': 'mc_products'} ] } diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index f1d75028ac..8e34370180 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -95,7 +95,7 @@ def test_preserve_version_on_load() -> None: @pytest.mark.parametrize("remove_defaults", [True, False]) def test_version_preserve_on_reload(remove_defaults: bool) -> None: - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v6") + eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") schema = Schema.from_dict(eth_v6) to_save_dict = schema.to_dict(remove_defaults=remove_defaults) diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 985d1b4392..88d3acfc0f 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -269,7 +269,7 @@ def test_schema_from_file() -> None: def prepare_import_folder(storage: SchemaStorage) -> None: - shutil.copy(yml_case_path("schemas/eth/ethereum_schema_v6"), os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml")) + shutil.copy(yml_case_path("schemas/eth/ethereum_schema_v7"), os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml")) def assert_schema_imported(synced_storage: SchemaStorage, storage: SchemaStorage) -> Schema: diff --git a/tests/common/utils.py b/tests/common/utils.py index fe46a03923..4c68e32bf3 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -15,7 +15,7 @@ COMMON_TEST_CASES_PATH = "./tests/common/cases/" # for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V7 = "0Z93TpKZOdz5UpHrSDMKdUMUpQld9aP/9ZFYPRAWOI4=" +IMPORTED_VERSION_HASH_ETH_V7 = "VuzNqiLOk7XuPxYLndMFMPHTDVItKU5ayiy70nQLdus=" # test sentry DSN TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" # preserve secrets path to be able to restore it diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index a83dd82aa4..c9afb6be76 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -1,152 +1,64 @@ version: 14 -version_hash: 
0Z93TpKZOdz5UpHrSDMKdUMUpQld9aP/9ZFYPRAWOI4= +version_hash: VuzNqiLOk7XuPxYLndMFMPHTDVItKU5ayiy70nQLdus= engine_version: 7 name: ethereum tables: _dlt_loads: - name: _dlt_loads columns: load_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: load_id data_type: text + name: load_id schema_name: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: schema_name data_type: text + name: schema_name status: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: status data_type: bigint + name: status inserted_at: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: inserted_at data_type: timestamp + name: inserted_at schema_version_hash: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false - name: schema_version_hash data_type: text + name: schema_version_hash write_disposition: skip - resource: _dlt_loads - schema_contract_settings: null description: Created by DLT. Tracks completed loads + schema_contract_settings: {} + name: _dlt_loads + resource: _dlt_loads _dlt_version: columns: version: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: version engine_version: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: engine_version inserted_at: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: inserted_at schema_name: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: schema_name version_hash: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: version_hash schema: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: schema write_disposition: skip description: Created by DLT. 
Tracks schema updates - schema_contract_settings: null + schema_contract_settings: {} name: _dlt_version resource: _dlt_version blocks: @@ -159,258 +71,96 @@ tables: columns: _dlt_load_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false description: load id coming from the extractor data_type: text name: _dlt_load_id _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id number: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: number parent_hash: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: parent_hash hash: nullable: false - partition: false cluster: true unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: hash base_fee_per_gas: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: base_fee_per_gas difficulty: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: difficulty extra_data: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: extra_data gas_limit: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas_limit gas_used: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas_used logs_bloom: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: binary name: logs_bloom miner: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: miner mix_hash: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: mix_hash nonce: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: nonce receipts_root: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: receipts_root sha3_uncles: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: sha3_uncles size: nullable: true - partition: false - cluster: false - unique: false - sort: 
false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: size state_root: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: state_root timestamp: nullable: false - partition: false - cluster: false unique: true sort: true - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: timestamp total_difficulty: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: total_difficulty transactions_root: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: transactions_root - schema_contract_settings: null + schema_contract_settings: {} name: blocks resource: blocks blocks__transactions: @@ -418,654 +168,259 @@ tables: columns: _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id block_number: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true foreign_key: true - root_key: false - merge_key: false data_type: bigint name: block_number transaction_index: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: transaction_index hash: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: hash block_hash: nullable: false - partition: false cluster: true - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: block_hash block_timestamp: nullable: false - partition: false - cluster: false - unique: false sort: true - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: block_timestamp chain_id: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: chain_id from: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: from gas: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas gas_price: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: gas_price input: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: input max_fee_per_gas: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: max_fee_per_gas 
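# (v7 fixture note: hint keys that merely restate the default value false, such as partition,
#  cluster, unique, sort, primary_key, foreign_key, root_key and merge_key, are dropped from
#  the stored schema, which is why the bulk of this diff consists of deletions)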
max_priority_fee_per_gas: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: max_priority_fee_per_gas nonce: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: nonce r: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: r s: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: s status: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: status to: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: to type: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: type v: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: v value: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: wei name: value eth_value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: decimal name: eth_value - schema_contract_settings: null name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions columns: _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id address: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: address block_timestamp: nullable: false - partition: false - cluster: false - unique: false sort: true - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: timestamp name: block_timestamp block_hash: nullable: false - partition: false cluster: true - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: block_hash block_number: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true foreign_key: true - root_key: false - merge_key: false data_type: bigint name: block_number transaction_index: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true foreign_key: true - root_key: false - merge_key: false data_type: bigint name: transaction_index log_index: nullable: false - partition: false - cluster: false - unique: false - sort: false primary_key: true - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: log_index data: 
nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: data removed: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bool name: removed transaction_hash: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: transaction_hash - schema_contract_settings: null name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: value - schema_contract_settings: null name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id address: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: address - schema_contract_settings: null name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx 
_dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: value - schema_contract_settings: null name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks columns: _dlt_parent_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false foreign_key: true - root_key: false - merge_key: false data_type: text name: _dlt_parent_id _dlt_list_idx: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: bigint name: _dlt_list_idx _dlt_id: nullable: false - partition: false - cluster: false unique: true - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: _dlt_id _dlt_root_id: nullable: false - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false root_key: true - merge_key: false data_type: text name: _dlt_root_id value: nullable: true - partition: false - cluster: false - unique: false - sort: false - primary_key: false - foreign_key: false - root_key: false - merge_key: false data_type: text name: value - schema_contract_settings: null name: blocks__uncles settings: default_hints: @@ -1087,7 +442,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp - schema_contract_settings: null + schema_contract_settings: {} normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 2965bd8866..457155f048 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -68,6 +68,7 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == s.name # the schema in the associated file has this hash + import json assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V7 diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 2460679737..68802bd9f9 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -113,10 +113,10 @@ def source() -> DltResource: pipeline.run(source(), schema_contract_settings=settings.get("override")) # check updated schema - assert pipeline.default_schema._settings["schema_contract_settings"] == (settings.get("override") or settings.get("source")) + assert pipeline.default_schema._settings.get("schema_contract_settings", {}) == (settings.get("override") or settings.get("source")) # check items table settings - assert pipeline.default_schema.tables["items"]["schema_contract_settings"] == (settings.get("override") or settings.get("resource")) + assert pipeline.default_schema.tables["items"]["schema_contract_settings"] == (settings.get("override") or settings.get("resource") or {}) def get_pipeline(): import duckdb diff --git a/tests/load/weaviate/test_naming.py 
b/tests/load/weaviate/test_naming.py index a965201425..488d66b725 100644 --- a/tests/load/weaviate/test_naming.py +++ b/tests/load/weaviate/test_naming.py @@ -87,7 +87,7 @@ def test_reserved_property_names() -> None: # print(schema_2.name) # print(schema_2.naming) -# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v6") +# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v7") # eth_v6_schema = dlt.Schema.from_dict(eth_v6) # pipeline.extract(s, schema=eth_v6_schema) @@ -101,7 +101,7 @@ def test_reserved_property_names() -> None: # print(pipeline.dataset_name) # s = small() -# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v6") +# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v7") # eth_v6_schema = dlt.Schema.from_dict(eth_v6) # pipeline.extract(s, schema=eth_v6_schema) From ac8f766bd10df18ec69e774866b8021290bbeccc Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 13 Sep 2023 15:57:19 +0200 Subject: [PATCH 24/73] add nice schema tests --- dlt/common/schema/__init__.py | 2 +- dlt/common/schema/schema.py | 6 +- .../schema/test_contract_mode_functions.py | 296 ++++++++++++++++++ tests/extract/test_decorators.py | 1 - 4 files changed, 300 insertions(+), 5 deletions(-) create mode 100644 tests/common/schema/test_contract_mode_functions.py diff --git a/dlt/common/schema/__init__.py b/dlt/common/schema/__init__.py index 6db3f21cef..b48e0a223d 100644 --- a/dlt/common/schema/__init__.py +++ b/dlt/common/schema/__init__.py @@ -1,4 +1,4 @@ from dlt.common.schema.typing import TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase # noqa: F401 from dlt.common.schema.typing import COLUMN_HINTS # noqa: F401 -from dlt.common.schema.schema import Schema # noqa: F401 +from dlt.common.schema.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE # noqa: F401 from dlt.common.schema.utils import verify_schema_hash # noqa: F401 diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 95dd51fd76..cf8bf1c001 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -229,8 +229,6 @@ def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: # check case where we have a new table if not table_exists: - if contract_modes == "discard-value": - return None, None if contract_modes["table"] in ["discard-row", "discard-value"]: return None, None if contract_modes["table"] == "freeze": @@ -240,10 +238,12 @@ def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: for item in list(row.keys()): for item in list(row.keys()): # if this is a new column for an existing table... 
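# At this point every key of the incoming row is checked against the stored table schema.
# When a key maps to a new (or still incomplete) column, the resolved contract modes decide
# what happens: "discard-value" on "column", or on "data_type" when the column is a variant,
# pops the key from the row and removes it from a copy of the partial table update;
# "discard-row" drops the whole row by returning (None, None); "freeze" raises
# SchemaFrozenException; "evolve" lets the update through unchanged. A variant column counts
# as a new column, so the "column" mode applies to it on top of "data_type".
# For example, with modes {"table": "evolve", "column": "discard-value", "data_type": "evolve"},
# a row {"id": 1, "surprise": "x"} arriving at a table that only defines "id" is written
# as {"id": 1} and the "surprise" column never reaches the load package.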
+ if table_exists and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): - is_variant = item in partial_table["columns"] and partial_table["columns"][item].get("variant") + is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") if contract_modes["column"] == "discard-value" or (is_variant and contract_modes["data_type"] == "discard-value"): row.pop(item) + partial_table = deepcopy(partial_table) partial_table["columns"].pop(item) elif contract_modes["column"] == "discard-row" or (is_variant and contract_modes["data_type"] == "discard-row"): return None, None diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py new file mode 100644 index 0000000000..dfdf88da42 --- /dev/null +++ b/tests/common/schema/test_contract_mode_functions.py @@ -0,0 +1,296 @@ +import pytest +import copy + +from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE +from dlt.common.schema.exceptions import SchemaFrozenException + + +def get_schema() -> Schema: + s = Schema("event") + + columns = { + "column_1": { + "name": "column_1", + "data_type": "string" + }, + "column_2": { + "name": "column_2", + "data_type": "number", + "is_variant": True + } + } + + incomplete_columns = { + "incomplete_column_1": { + "name": "incomplete_column_1", + }, + "incomplete_column_2": { + "name": "incomplete_column_2", + } + } + + + # add some tables + s.update_schema({ + "name": "table", + "columns": columns + }) + + s.update_schema({ + "name": "child_table", + "parent": "table", + "columns": columns + }) + + s.update_schema({ + "name": "incomplete_table", + "columns": incomplete_columns + }) + + s.update_schema({ + "name": "mixed_table", + "columns": {**incomplete_columns, **columns} + }) + + return s + + +def test_resolve_contract_settings() -> None: + + # defaults + schema = get_schema() + assert schema.resolve_contract_settings_for_table(None, "table") == DEFAULT_SCHEMA_CONTRACT_MODE + assert schema.resolve_contract_settings_for_table("table", "child_table") == DEFAULT_SCHEMA_CONTRACT_MODE + + # table specific full setting + schema = get_schema() + schema.tables["table"]["schema_contract_settings"] = "freeze" + assert schema.resolve_contract_settings_for_table(None, "table") == { + "table": "freeze", + "column": "freeze", + "data_type": "freeze" + } + assert schema.resolve_contract_settings_for_table("table", "child_table") == { + "table": "freeze", + "column": "freeze", + "data_type": "freeze" + } + + # table specific single setting + schema = get_schema() + schema.tables["table"]["schema_contract_settings"] = { + "table": "freeze", + "column": "discard-value", + } + assert schema.resolve_contract_settings_for_table(None, "table") == { + "table": "freeze", + "column": "discard-value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("table", "child_table") == { + "table": "freeze", + "column": "discard-value", + "data_type": "evolve" + } + + # schema specific full setting + schema = get_schema() + schema._settings["schema_contract_settings"] = "freeze" + assert schema.resolve_contract_settings_for_table(None, "table") == { + "table": "freeze", + "column": "freeze", + "data_type": "freeze" + } + assert schema.resolve_contract_settings_for_table("table", "child_table") == { + "table": "freeze", + "column": "freeze", + "data_type": "freeze" + } + + # schema specific single setting + schema = get_schema() + 
schema._settings["schema_contract_settings"] = { + "table": "freeze", + "column": "discard-value", + } + assert schema.resolve_contract_settings_for_table(None, "table") == { + "table": "freeze", + "column": "discard-value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("table", "child_table") == { + "table": "freeze", + "column": "discard-value", + "data_type": "evolve" + } + + # mixed settings + schema = get_schema() + schema._settings["schema_contract_settings"] = "freeze" + schema.tables["table"]["schema_contract_settings"] = { + "table": "evolve", + "column": "discard-value", + } + assert schema.resolve_contract_settings_for_table(None, "table") == { + "table": "evolve", + "column": "discard-value", + "data_type": "freeze" + } + assert schema.resolve_contract_settings_for_table("table", "child_table") == { + "table": "evolve", + "column": "discard-value", + "data_type": "freeze" + } + + +# ensure other settings do not interfere with the main setting we are testing +base_settings = [{ + "table": "evolve", + "column": "evolve", + "data_type": "evolve" + },{ + "table": "discard-row", + "column": "discard-row", + "data_type": "discard-row" + }, { + "table": "discard-value", + "column": "discard-value", + "data_type": "discard-value" + }, { + "table": "freeze", + "column": "freeze", + "data_type": "freeze" + } +] + + +@pytest.mark.parametrize("base_settings", base_settings) +def test_check_adding_table(base_settings) -> None: + + schema = get_schema() + data = { + "column_1": "some string", + "column_2": 123 + } + new_table = copy.deepcopy(schema.tables["table"]) + new_table["name"] = "new_table" + + # + # check adding new table + # + assert schema.check_schema_update({**base_settings, **{"table": "evolve"}}, "new_table", data, new_table) == (data, new_table) + assert schema.check_schema_update({**base_settings, **{"table": "discard-row"}}, "new_table", data, new_table) == (None, None) + assert schema.check_schema_update({**base_settings, **{"table": "discard-value"}}, "new_table", data, new_table) == (None, None) + + with pytest.raises(SchemaFrozenException): + schema.check_schema_update({**base_settings, **{"table": "freeze"}}, "new_table", data, new_table) + + +@pytest.mark.parametrize("base_settings", base_settings) +def test_check_adding_new_columns(base_settings) -> None: + schema = get_schema() + + # + # check adding new column + # + data = { + "column_1": "some string", + "column_2": 123 + } + data_with_new_row = { + **data, + "new_column": "some string" + } + table_update = { + "name": "table", + "columns": { + "new_column": { + "name": "new_column", + "data_type": "string" + } + } + } + popped_table_update = copy.deepcopy(table_update) + popped_table_update["columns"].pop("new_column") + + assert schema.check_schema_update({**base_settings, **{"column": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.check_schema_update({**base_settings, **{"column": "discard-row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**base_settings, **{"column": "discard-value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + + with pytest.raises(SchemaFrozenException): + schema.check_schema_update({**base_settings, **{"column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) + + + # + # check adding new column if target column is not complete + # + data = { + 
"column_1": "some string", + "column_2": 123, + } + data_with_new_row = { + **data, + "incomplete_column_1": "some other string", + } + table_update = { + "name": "mixed_table", + "columns": { + "incomplete_column_1": { + "name": "incomplete_column_1", + "data_type": "string" + } + } + } + popped_table_update = copy.deepcopy(table_update) + popped_table_update["columns"].pop("incomplete_column_1") + + # incomplete columns should be treated like new columns + assert schema.check_schema_update({**base_settings, **{"column": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.check_schema_update({**base_settings, **{"column": "discard-row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**base_settings, **{"column": "discard-value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + + with pytest.raises(SchemaFrozenException): + schema.check_schema_update({**base_settings, **{"column": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) + + + +def test_check_adding_new_variant() -> None: + schema = get_schema() + + # + # check adding new variant column + # + data = { + "column_1": "some string", + "column_2": 123 + } + data_with_new_row = { + **data, + "column_2_variant": 345345 + } + table_update = { + "name": "table", + "columns": { + "column_2_variant": { + "name": "column_2_variant", + "data_type": "number", + "variant": True + } + } + } + popped_table_update = copy.deepcopy(table_update) + popped_table_update["columns"].pop("column_2_variant") + + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard-row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard-value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + + with pytest.raises(SchemaFrozenException): + schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) + + # check interaction with new columns settings, variants are new columns.. 
+ with pytest.raises(SchemaFrozenException): + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard-row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard-value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) \ No newline at end of file diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 457155f048..2965bd8866 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -68,7 +68,6 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == s.name # the schema in the associated file has this hash - import json assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V7 From 869f2786ea5eba5f5818c75ec7be54b137f759f9 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 13 Sep 2023 16:45:33 +0200 Subject: [PATCH 25/73] add docs page --- dlt/common/schema/schema.py | 6 +- dlt/common/schema/typing.py | 2 +- docs/website/docs/general-usage/schema.md | 73 +++++++++++++++++++ docs/website/docs/reference/performance.md | 2 +- .../schema/test_contract_mode_functions.py | 50 ++++++------- tests/load/test_freeze_and_data_contract.py | 28 +++---- 6 files changed, 117 insertions(+), 44 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index cf8bf1c001..05a0a897cb 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -229,7 +229,7 @@ def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: # check case where we have a new table if not table_exists: - if contract_modes["table"] in ["discard-row", "discard-value"]: + if contract_modes["table"] in ["discard_row", "discard_value"]: return None, None if contract_modes["table"] == "freeze": raise SchemaFrozenException(self.name, table_name, f"Trying to add table {table_name} but new tables are frozen.") @@ -241,11 +241,11 @@ def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: if table_exists and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") - if contract_modes["column"] == "discard-value" or (is_variant and contract_modes["data_type"] == "discard-value"): + if contract_modes["column"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): row.pop(item) partial_table = deepcopy(partial_table) partial_table["columns"].pop(item) - elif contract_modes["column"] == "discard-row" or (is_variant and contract_modes["data_type"] == "discard-row"): + elif contract_modes["column"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): return None, None elif contract_modes["column"] == "freeze": raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name}  but columns are frozen.") diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 08d89b1f13..d19a730b46 100644 --- a/dlt/common/schema/typing.py +++ 
b/dlt/common/schema/typing.py @@ -65,7 +65,7 @@ class TColumnSchema(TColumnSchemaBase, total=False): TColumnName = NewType("TColumnName", str) SIMPLE_REGEX_PREFIX = "re:" -TSchemaEvolutionMode = Literal["evolve", "discard-value", "freeze", "discard-row"] +TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] class TSchemaContractModes(TypedDict, total=False): """TypedDict defining the schema update settings""" diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index fd2b1cf801..d04af370a7 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -152,6 +152,79 @@ or create a child table out of it. `time` data type is saved in destination without timezone info, if timezone is included it is stripped. E.g. `'14:01:02+02:00` -> `'14:01:02'`. +## Data contracts and controlling schema evolution + +`dlt` will evolve the schema of the destination to accommodate the structure and data types of the extracted data. There are several settings +that you can use to control this automatic schema evolution, from the default settings where all changes to the schema are accepted to +a frozen schema that does not change at all. + +Consider this example: + +```py +@dlt.resource(schema_contract_settings={"table": "evolve", "columns": "freeze"}) +def items(): + ... +``` + +This resource will allow new subtables to be created, but will throw an exception if data is extracted for an existing table that +contains a new column. + +The `schema_contract_settings` exists on the `source` decorator as a directive for all resources of that source and on the +`resource` decorator as a directive for the individual resource. Additionally, it exists on the `pipeline.run()` method, which will override all existing settings. +The `schema_contract_settings` is a dictionary with keys that control the following: + +* `table` controls the creation of new tables and subtables +* `columns` controls the creation of new columns on an existing table +* `data_type` controls the creation of new variant columns, which happens if a datatype is discovered in the extracted data that differs from the one in the schema + +Each property can be set to one of three values: +* `freeze`: This will raise an exception if data is encountered that does not fit the existing schema, so no data will be loaded to the destination +* `discard_row`: This will discard any extracted row if it does not adhere to the existing schema, and this row will not be loaded to the destination. All other rows will be. +* `discard_value`: This will discard data in an extracted row that does not adhere to the existing schema; the row will be loaded without this data. + +### Code Examples + +The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. + +```py +@dlt.resource(schema_contract_settings={"table": "discard_row", "columns": "evolve", "data_type": "freeze"}) +def items(): + ... +``` + +The below code will raise on any encountered schema change. Note: you can always set a single string, which will be interpreted as though all keys are set to that value. + +```py +pipeline.run(my_source(), schema_contract_settings="freeze") +```
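+ +For illustration, the dictionary form can also be passed as the global override on `run`. In this sketch, reusing `my_source()` from the example above, rows that would create a new table are discarded, new columns are still allowed, and a new variant column raises an error and fails the load: + +```py +pipeline.run(my_source(), schema_contract_settings={"table": "discard_row", "columns": "evolve", "data_type": "freeze"}) +```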
+ +The below code defines some settings on the source, which can be overridden on the resource, which in turn can be overridden by the global override on the `run` method. +Here, variant columns are frozen for all resources and raise an error if encountered; on `items` new columns are allowed, but `other_items` inherits the `freeze` setting from +the source, so new columns are frozen there. New tables are allowed. + +```py +@dlt.resource(schema_contract_settings={"columns": "evolve"}) +def items(): + ... + +@dlt.resource() +def other_items(): + ... + +@dlt.source(schema_contract_settings={"columns": "freeze", "data_type": "freeze"}) +def source(): + return [items(), other_items()] + + +# this will use the settings defined by the decorators +pipeline.run(source()) + +# this will freeze the whole schema, regardless of the decorator settings +pipeline.run(source(), schema_contract_settings="freeze") + +``` + + ## Schema settings The `settings` section of schema file lets you define various global rules that impact how tables diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 7a089ce7a6..e299a6e138 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -286,7 +286,7 @@ def read_table(limit): yield [{"row": _id, "description": "this is row with id {_id}", "timestamp": now} for _id in item_slice] -# this prevents the process pool to run the initialization code again +# this prevents the process pool from running the initialization code again if __name__ == "__main__" or "PYTEST_CURRENT_TEST" in os.environ: pipeline = dlt.pipeline("parallel_load", destination="duckdb", full_refresh=True) pipeline.extract(read_table(1000000)) diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py index dfdf88da42..44534c2501 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_contract_mode_functions.py @@ -80,16 +80,16 @@ def test_resolve_contract_settings() -> None: schema = get_schema() schema.tables["table"]["schema_contract_settings"] = { "table": "freeze", - "column": "discard-value", + "column": "discard_value", } assert schema.resolve_contract_settings_for_table(None, "table") == { "table": "freeze", - "column": "discard-value", + "column": "discard_value", "data_type": "evolve" } assert schema.resolve_contract_settings_for_table("table", "child_table") == { "table": "freeze", - "column": "discard-value", + "column": "discard_value", "data_type": "evolve" } @@ -111,16 +111,16 @@ def test_resolve_contract_settings() -> None: schema = get_schema() schema._settings["schema_contract_settings"] = { "table": "freeze", - "column": "discard-value", + "column": "discard_value", } assert schema.resolve_contract_settings_for_table(None, "table") == { "table": "freeze", - "column": "discard-value", + "column": "discard_value", "data_type": "evolve" } assert schema.resolve_contract_settings_for_table("table", "child_table") == { "table": "freeze", - "column": "discard-value", + "column": "discard_value", "data_type": "evolve" } @@ -129,16 +129,16 @@ def test_resolve_contract_settings() -> None: schema._settings["schema_contract_settings"] = "freeze" schema.tables["table"]["schema_contract_settings"] = { "table": "evolve", - "column": "discard-value", + "column": "discard_value", } assert schema.resolve_contract_settings_for_table(None, "table") == { "table": "evolve", - "column": "discard-value", + "column": "discard_value", "data_type": "freeze" } assert schema.resolve_contract_settings_for_table("table", "child_table") == { "table": "evolve", - "column": "discard-value", + "column": 
"discard_value", "data_type": "freeze" } @@ -149,13 +149,13 @@ def test_resolve_contract_settings() -> None: "column": "evolve", "data_type": "evolve" },{ - "table": "discard-row", - "column": "discard-row", - "data_type": "discard-row" + "table": "discard_row", + "column": "discard_row", + "data_type": "discard_row" }, { - "table": "discard-value", - "column": "discard-value", - "data_type": "discard-value" + "table": "discard_value", + "column": "discard_value", + "data_type": "discard_value" }, { "table": "freeze", "column": "freeze", @@ -179,8 +179,8 @@ def test_check_adding_table(base_settings) -> None: # check adding new table # assert schema.check_schema_update({**base_settings, **{"table": "evolve"}}, "new_table", data, new_table) == (data, new_table) - assert schema.check_schema_update({**base_settings, **{"table": "discard-row"}}, "new_table", data, new_table) == (None, None) - assert schema.check_schema_update({**base_settings, **{"table": "discard-value"}}, "new_table", data, new_table) == (None, None) + assert schema.check_schema_update({**base_settings, **{"table": "discard_row"}}, "new_table", data, new_table) == (None, None) + assert schema.check_schema_update({**base_settings, **{"table": "discard_value"}}, "new_table", data, new_table) == (None, None) with pytest.raises(SchemaFrozenException): schema.check_schema_update({**base_settings, **{"table": "freeze"}}, "new_table", data, new_table) @@ -214,8 +214,8 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("new_column") assert schema.check_schema_update({**base_settings, **{"column": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**base_settings, **{"column": "discard-row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**base_settings, **{"column": "discard-value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.check_schema_update({**base_settings, **{"column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**base_settings, **{"column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): schema.check_schema_update({**base_settings, **{"column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) @@ -246,8 +246,8 @@ def test_check_adding_new_columns(base_settings) -> None: # incomplete columns should be treated like new columns assert schema.check_schema_update({**base_settings, **{"column": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**base_settings, **{"column": "discard-row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**base_settings, **{"column": "discard-value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.check_schema_update({**base_settings, **{"column": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**base_settings, **{"column": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, 
popped_table_update) with pytest.raises(SchemaFrozenException): schema.check_schema_update({**base_settings, **{"column": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) @@ -282,8 +282,8 @@ def test_check_adding_new_variant() -> None: popped_table_update["columns"].pop("column_2_variant") assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard-row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard-value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) @@ -292,5 +292,5 @@ def test_check_adding_new_variant() -> None: with pytest.raises(SchemaFrozenException): assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard-row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard-value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) \ No newline at end of file + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) \ No newline at end of file diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 68802bd9f9..24a2a9de91 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -14,7 +14,7 @@ skip_if_not_active("duckdb") -schema_contract_settings = ["evolve", "discard-value", "discard-row", "freeze"] +schema_contract_settings = ["evolve", "discard_value", "discard_row", "freeze"] LOCATIONS = ["source", "resource", "override"] SCHEMA_ELEMENTS = ["table", "column", "data_type"] @@ -211,7 +211,7 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non else: assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (30 if contract_setting in ["evolve", "discard-value"] else 20) + 
assert table_counts["items"] == (30 if contract_setting in ["evolve", "discard_value"] else 20) # test adding variant column if contract_setting == "freeze": @@ -226,7 +226,7 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non else: assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard-value"] else 20) + assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 20) @pytest.mark.parametrize("contract_setting", schema_contract_settings) @@ -274,7 +274,7 @@ def test_freeze_variants(contract_setting: str, setting_location: str) -> None: else: assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard-value"] else 30) + assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 30) def test_settings_precedence() -> None: @@ -285,7 +285,7 @@ def test_settings_precedence() -> None: # trying to add new column when forbidden on resource will fail run_resource(pipeline, items_with_new_column, {"resource": { - "column": "discard-row" + "column": "discard_row" }}) # when allowed on override it will work @@ -300,14 +300,14 @@ def test_settings_precedence_2() -> None: # load some data run_resource(pipeline, items, {"source": { - "data_type": "discard-row" + "data_type": "discard_row" }}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # trying to add variant when forbidden on source will fail run_resource(pipeline, items_with_variant, {"source": { - "data_type": "discard-row" + "data_type": "discard_row" }}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 @@ -315,15 +315,15 @@ def test_settings_precedence_2() -> None: # if allowed on resource it will pass run_resource(pipeline, items_with_variant, { "resource": {"data_type": "evolve"}, - "source": {"data_type": "discard-row"} + "source": {"data_type": "discard_row"} }) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 # if allowed on override it will also pass run_resource(pipeline, items_with_variant, { - "resource": {"data_type": "discard-row"}, - "source": {"data_type": "discard-row"}, + "resource": {"data_type": "discard_row"}, + "source": {"data_type": "discard_row"}, "override": {"data_type": "evolve"}, }) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) @@ -340,7 +340,7 @@ def test_change_mode(setting_location: str) -> None: # trying to add variant when forbidden will fail run_resource(pipeline, items_with_variant, {setting_location: { - "data_type": "discard-row" + "data_type": "discard_row" }}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 @@ -361,17 +361,17 @@ def test_single_settings_value(setting_location: str) -> None: assert table_counts["items"] == 10 # trying to add variant when forbidden will fail - 
run_resource(pipeline, items_with_variant, {setting_location: "discard-row"}) + run_resource(pipeline, items_with_variant, {setting_location: "discard_row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # trying to add new column will fail - run_resource(pipeline, items_with_new_column, {setting_location: "discard-row"}) + run_resource(pipeline, items_with_new_column, {setting_location: "discard_row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 # trying to add new table will fail - run_resource(pipeline, new_items, {setting_location: "discard-row"}) + run_resource(pipeline, new_items, {setting_location: "discard_row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert "new_items" not in table_counts From 0f22ba0e8997588670947d53278b975c60a525e4 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 13 Sep 2023 17:21:33 +0200 Subject: [PATCH 26/73] small test fix --- .github/workflows/test_local_destinations.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 50d973bad4..60b48e2118 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -84,7 +84,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E pydantic - run: poetry run pytest tests/load tests/cli name: Run tests Linux From 463c447e629a44026ff70d6220ea1cebf59c70af Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 14 Sep 2023 15:18:08 +0200 Subject: [PATCH 27/73] smaller PR fixes --- dlt/common/schema/schema.py | 51 +++++++++++++------ dlt/common/schema/utils.py | 4 -- dlt/extract/decorators.py | 8 +-- dlt/extract/extract.py | 2 +- dlt/extract/schema.py | 7 ++- dlt/extract/source.py | 2 +- dlt/normalize/normalize.py | 12 ++--- dlt/pipeline/__init__.py | 3 +- dlt/pipeline/pipeline.py | 32 ++++++------ .../normalizers/test_json_relational.py | 8 +-- .../schema/test_contract_mode_functions.py | 46 ++++++++--------- tests/common/schema/test_filtering.py | 8 +-- tests/common/schema/test_inference.py | 48 ++++++++--------- tests/common/schema/test_merges.py | 3 ++ tests/common/schema/test_schema.py | 28 +++++----- tests/common/schema/test_versioning.py | 4 +- tests/common/storages/test_schema_storage.py | 2 +- tests/extract/test_decorators.py | 8 +-- tests/load/pipeline/test_restore_state.py | 2 +- tests/load/test_freeze_and_data_contract.py | 37 +++++++++----- tests/load/test_job_client.py | 24 ++++----- tests/load/utils.py | 2 +- tests/load/weaviate/test_weaviate_client.py | 14 ++--- 23 files changed, 196 insertions(+), 159 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 05a0a897cb..3bbdbd51e3 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -196,6 +196,7 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial def 
resolve_contract_settings_for_table(self, parent_table: str, table_name: str) -> TSchemaContractModes: + """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: settings = settings or {} @@ -216,12 +217,27 @@ def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: return settings - def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: - """Checks if schema update mode allows for the requested changes, filter row or reject update, depending on the mode""" + def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: + """ + Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row, filter out + columns for both the data and the schema update, or reject the update completely, depending on the mode. Example settings could be: + + { + "table": "freeze", + "column": "evolve", + "data_type": "discard_row" + } + + Settings for table affect new tables, settings for column affect new columns and settings for data_type affect new variant columns. Each setting can be set to one of: + * evolve: allow all changes + * freeze: allow no change and fail the load + * discard_row: allow no schema change and filter out the row + * discard_value: allow no schema change and filter out the value but load the rest of the row + """ assert partial_table - # default settings allow all evolutions, skipp all else + # default settings allow all evolutions, skip all else if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table @@ -243,19 +259,30 @@ def check_schema_update(self, contract_modes: TSchemaContractModes, table_name: is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") if contract_modes["column"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): row.pop(item) - partial_table = deepcopy(partial_table) partial_table["columns"].pop(item) elif contract_modes["column"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): return None, None - elif contract_modes["column"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name}  but columns are frozen.") elif is_variant and contract_modes["data_type"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name}  data_types are frozen.") - + raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name} but data_types are frozen.") + elif contract_modes["column"] == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") return row, partial_table - def update_schema(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: + def update_schema(self, schema: "Schema") -> None: + """ + Update this schema from another schema. + Note: we are not merging props like max nesting or column propagation. + """ + + for table in schema.data_tables(include_incomplete=True): + 
self.update_table( + self.normalize_table_identifiers(table) + ) + self.set_schema_contract_settings(schema._settings.get("schema_contract_settings", {})) + + def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: table_name = partial_table["name"] parent_table_name = partial_table.get("parent") # check if parent table present @@ -419,12 +446,6 @@ def tables(self) -> TSchemaTables: def settings(self) -> TSchemaSettings: return self._settings - @property - def has_data_columns(self) -> bool: - for table in self.data_tables(): - return bool(table.get("columns", None)) - return False - def to_pretty_json(self, remove_defaults: bool = True) -> str: d = self.to_dict(remove_defaults=remove_defaults) return json.dumps(d, pretty=True) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 16230fc016..6afc34f6d4 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -435,7 +435,6 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl continue existing_v = tab_a.get(k) if existing_v != v: - print(f"{k} ==? {v} ==? {existing_v}") partial_table[k] = v # type: ignore # this should not really happen @@ -476,9 +475,6 @@ def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPa table.update(diff_table) table["columns"] = updated_columns - # always update evolution settings - table["schema_contract_settings"] = partial_table.get("schema_contract_settings") - return diff_table diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 653d57d13f..b160f6b5e3 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -114,6 +114,8 @@ def source( schema (Schema, optional): An explicit `Schema` instance to be associated with the source. If not present, `dlt` creates a new `Schema` object with provided `name`. If such `Schema` already exists in the same folder as the module containing the decorated function, such schema will be loaded from file. + schema_contract_settings (TSchemaContractSettings, optional): Schema contract settings that will be applied to this resource. + spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. Returns: @@ -153,7 +155,6 @@ def decorator(f: Callable[TSourceFunParams, Any]) -> Callable[TSourceFunParams, @wraps(conf_f) def _wrap(*args: Any, **kwargs: Any) -> DltSource: - nonlocal schema # make schema available to the source with Container().injectable_context(SourceSchemaInjectableContext(schema)): @@ -168,9 +169,6 @@ def _wrap(*args: Any, **kwargs: Any) -> DltSource: if inspect.isgenerator(rv): rv = list(rv) - # prepare schema - schema = schema.clone(update_normalizers=True) - # convert to source s = DltSource.from_data(name, source_section, schema.clone(update_normalizers=True), rv) # apply hints @@ -303,6 +301,8 @@ def resource( merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. 
+ schema_contract_settings (TSchemaContractSettings, optional): Schema contract settings that will be applied to all resources of this source (if not overriden in the resource itself) + selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index e2240e5f74..ae60c25dc4 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -193,7 +193,7 @@ def extract_with_schema( # iterate over all items in the pipeline and update the schema if dynamic table hints were present for _, partials in extractor.items(): for partial in partials: - schema.update_schema(schema.normalize_table_identifiers(partial)) + schema.update_table(schema.normalize_table_identifiers(partial)) return extract_id diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index e07fdfb40f..bcbd17707f 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -96,7 +96,8 @@ def apply_hints( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - incremental: Incremental[Any] = None + incremental: Incremental[Any] = None, + schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. @@ -113,7 +114,7 @@ def apply_hints( t = None if not self._table_schema_template: # if there's no template yet, create and set new one - t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key) + t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract_settings) else: # set single hints t = deepcopy(self._table_schema_template) @@ -129,6 +130,8 @@ def apply_hints( t.pop("parent", None) if write_disposition: t["write_disposition"] = write_disposition + if schema_contract_settings: + t["schema_contract_settings"] = schema_contract_settings # type: ignore if columns is not None: # if callable then override existing if callable(columns) or callable(t["columns"]): diff --git a/dlt/extract/source.py b/dlt/extract/source.py index da7119d3f4..fa71003fc7 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -672,7 +672,7 @@ def discover_schema(self, item: TDataItem = None) -> Schema: partial_table = self._schema.normalize_table_identifiers( r.compute_table_schema(item) ) - schema.update_schema(partial_table) + schema.update_table(partial_table) return schema def with_resources(self, *resource_names: str) -> "DltSource": diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 73a192e6e1..7525fd588e 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -151,14 +151,14 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, row, partial_table = schema.coerce_row(table_name, parent_table, row) # if we detect a migration, the check update if partial_table: - row, partial_table = schema.check_schema_update(schema_contract_modes, table_name, row, partial_table) + row, partial_table = schema.apply_schema_contract(schema_contract_modes, table_name, row, partial_table) if not row: continue # theres a new table or new columns in existing table if 
partial_table: # update schema and save the change - schema.update_schema(partial_table) + schema.update_table(partial_table) table_updates = schema_update.setdefault(table_name, []) table_updates.append(partial_table) # update our columns @@ -177,13 +177,13 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, signals.raise_if_signalled() return schema_update, items_count, row_counts - def update_schema(self, schema: Schema, schema_updates: List[TSchemaUpdate]) -> None: + def update_table(self, schema: Schema, schema_updates: List[TSchemaUpdate]) -> None: for schema_update in schema_updates: for table_name, table_updates in schema_update.items(): logger.info(f"Updating schema for table {table_name} with {len(table_updates)} deltas") for partial_table in table_updates: # merge columns - schema.update_schema(partial_table) + schema.update_table(partial_table) @staticmethod def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[str]]: @@ -229,7 +229,7 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM result: TWorkerRV = pending.get() try: # gather schema from all manifests, validate consistency and combine - self.update_schema(schema, result[0]) + self.update_table(schema, result[0]) schema_updates.extend(result[0]) # update metrics self.collector.update("Files", len(result[2])) @@ -266,7 +266,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMap load_id, files ) - self.update_schema(schema, result[0]) + self.update_table(schema, result[0]) self.collector.update("Files", len(result[2])) self.collector.update("Items", result[1]) return result[0], result[3] diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 3dd7bd8310..9987d5b50c 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -238,7 +238,8 @@ def run( write_disposition=write_disposition, columns=columns, schema=schema, - loader_file_format=loader_file_format + loader_file_format=loader_file_format, + schema_contract_settings=schema_contract_settings ) # plug default tracking module diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 33787daa17..2e96224e0d 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -266,8 +266,10 @@ def extract( primary_key: TColumnNames = None, schema: Schema = None, max_parallel_items: int = None, - workers: int = None + workers: int = None, + schema_contract_settings: TSchemaContractSettings = None ) -> ExtractInfo: + print(schema_contract_settings) """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. 
See `run` method for the arguments' description.""" # create extract storage to which all the sources will be extracted storage = ExtractorStorage(self._normalize_storage_config) @@ -286,6 +288,11 @@ def extract( # TODO: if we fail here we should probably wipe out the whole extract folder for extract_id in extract_ids: storage.commit_extract_files(extract_id) + + # update global schema contract settings + if schema_contract_settings is not None: + self.default_schema.set_schema_contract_settings(schema_contract_settings, True) + return ExtractInfo(describe_extract_data(data)) except Exception as exc: # TODO: provide metrics from extractor @@ -294,7 +301,7 @@ def extract( @with_runtime_trace @with_schemas_sync @with_config_section((known_sections.NORMALIZE,)) - def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = None, schema_contract_settings: TSchemaContractSettings = None) -> NormalizeInfo: + def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = None) -> NormalizeInfo: """Normalizes the data prepared with `extract` method, infers the schema and creates load packages for the `load` method. Requires `destination` to be known.""" if is_interactive(): workers = 1 @@ -304,10 +311,6 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No if not self.default_schema_name: return None - # update global schema contract settings, could be moved into def normalize() - if schema_contract_settings is not None: - self.default_schema.set_schema_contract_settings(schema_contract_settings, True) - # make sure destination capabilities are available self._get_destination_capabilities() # create default normalize config @@ -444,6 +447,8 @@ def run( loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional). The file format the loader will use to create the load package. Not all file_formats are compatible with all destinations. Defaults to the preferred file format of the selected destination. + schema_contract_settings (TSchemaContractSettings, optional): On override for the schema contract settings, this will replace the schema contract settings for all tables in the schema. Defaults to None. + ### Raises: PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. ### Returns: @@ -452,7 +457,6 @@ def run( signals.raise_if_signalled() self._set_destinations(destination, staging) self._set_dataset_name(dataset_name) - # sync state with destination if self.config.restore_from_destination and not self.full_refresh and not self._state_restored and (self.destination or destination): self.sync_destination(destination, staging, dataset_name) @@ -461,17 +465,18 @@ def run( # normalize and load pending data if self.list_extracted_resources(): - self.normalize(loader_file_format=loader_file_format, schema_contract_settings=schema_contract_settings) + self.normalize(loader_file_format=loader_file_format) if self.list_normalized_load_packages(): # if there were any pending loads, load them and **exit** if data is not None: logger.warn("The pipeline `run` method will now load the pending load packages. The data you passed to the run function will not be loaded. 
In order to do that you must run the pipeline again") return self.load(destination, dataset_name, credentials=credentials) + # extract from the source if data is not None: - self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema) - self.normalize(loader_file_format=loader_file_format, schema_contract_settings=schema_contract_settings) + self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema, schema_contract_settings=schema_contract_settings) + self.normalize(loader_file_format=loader_file_format) return self.load(destination, dataset_name, credentials=credentials) else: return None @@ -877,12 +882,7 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para self._schema_storage.save_import_schema_if_not_exists(source_schema) # get the current schema and merge tables from source_schema - # note we are not merging props like max nesting or column propagation - for table in source_schema.data_tables(include_incomplete=True): - pipeline_schema.update_schema( - pipeline_schema.normalize_table_identifiers(table) - ) - pipeline_schema.set_schema_contract_settings(source_schema._settings.get("schema_contract_settings", {})) + pipeline_schema.update_schema(source_schema) return extract_id diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index e344ca28d2..59519154c6 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -46,7 +46,7 @@ def test_flatten_fix_field_name(norm: RelationalNormalizer) -> None: def test_preserve_complex_value(norm: RelationalNormalizer) -> None: # add table with complex column - norm.schema.update_schema( + norm.schema.update_table( new_table("with_complex", columns = [{ "name": "value", @@ -371,10 +371,10 @@ def test_list_in_list() -> None: # test the same setting webpath__list to complex zen_table = new_table("zen") - schema.update_schema(zen_table) + schema.update_table(zen_table) path_table = new_table("zen__webpath", parent_table_name="zen", columns=[{"name": "list", "data_type": "complex"}]) - schema.update_schema(path_table) + schema.update_table(path_table) rows = list(schema.normalize_data_item(chats, "1762162.1212", "zen")) # both lists are complex types now assert len(rows) == 3 @@ -543,7 +543,7 @@ def test_preserves_complex_types_list(norm: RelationalNormalizer) -> None: # the exception to test_removes_normalized_list # complex types should be left as they are # add table with complex column - norm.schema.update_schema(new_table("event_slot", + norm.schema.update_table(new_table("event_slot", columns = [{ "name": "value", "data_type": "complex", diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py index 44534c2501..d33fdedeb5 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_contract_mode_functions.py @@ -31,23 +31,23 @@ def get_schema() -> Schema: # add some tables - s.update_schema({ + s.update_table({ "name": "table", "columns": columns }) - s.update_schema({ + s.update_table({ "name": "child_table", "parent": "table", "columns": columns }) - s.update_schema({ + s.update_table({ "name": "incomplete_table", "columns": incomplete_columns }) - s.update_schema({ + s.update_table({ "name": "mixed_table", "columns": {**incomplete_columns, 
**columns} }) @@ -178,12 +178,12 @@ def test_check_adding_table(base_settings) -> None: # # check adding new table # - assert schema.check_schema_update({**base_settings, **{"table": "evolve"}}, "new_table", data, new_table) == (data, new_table) - assert schema.check_schema_update({**base_settings, **{"table": "discard_row"}}, "new_table", data, new_table) == (None, None) - assert schema.check_schema_update({**base_settings, **{"table": "discard_value"}}, "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"table": "evolve"}}, "new_table", data, new_table) == (data, new_table) + assert schema.apply_schema_contract({**base_settings, **{"table": "discard_row"}}, "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"table": "discard_value"}}, "new_table", data, new_table) == (None, None) with pytest.raises(SchemaFrozenException): - schema.check_schema_update({**base_settings, **{"table": "freeze"}}, "new_table", data, new_table) + schema.apply_schema_contract({**base_settings, **{"table": "freeze"}}, "new_table", data, new_table) @pytest.mark.parametrize("base_settings", base_settings) @@ -213,12 +213,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("new_column") - assert schema.check_schema_update({**base_settings, **{"column": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**base_settings, **{"column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**base_settings, **{"column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.check_schema_update({**base_settings, **{"column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) # @@ -245,12 +245,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("incomplete_column_1") # incomplete columns should be treated like new columns - assert schema.check_schema_update({**base_settings, **{"column": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**base_settings, **{"column": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**base_settings, **{"column": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), 
table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.check_schema_update({**base_settings, **{"column": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) @@ -281,16 +281,16 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) # check interaction with new columns settings, variants are new columns.. 
with pytest.raises(SchemaFrozenException): - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.check_schema_update({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) \ No newline at end of file + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) \ No newline at end of file diff --git a/tests/common/schema/test_filtering.py b/tests/common/schema/test_filtering.py index 8ab9df877d..3c0f02ed30 100644 --- a/tests/common/schema/test_filtering.py +++ b/tests/common/schema/test_filtering.py @@ -84,10 +84,10 @@ def test_filter_parent_table_schema_update(schema: Schema) -> None: # try to apply updates assert len(updates) == 2 # event bot table - schema.update_schema(updates[0]) + schema.update_table(updates[0]) # event_bot__metadata__elvl1__elvl2 with pytest.raises(ParentTableNotFoundException) as e: - schema.update_schema(updates[1]) + schema.update_table(updates[1]) assert e.value.table_name == "event_bot__metadata__elvl1__elvl2" assert e.value.parent_table_name == "event_bot__metadata__elvl1" @@ -106,7 +106,7 @@ def test_filter_parent_table_schema_update(schema: Schema) -> None: assert set(row.keys()).issuperset(["_dlt_id", "_dlt_parent_id", "_dlt_list_idx"]) row, partial_table = schema.coerce_row(t, p, row) updates.append(partial_table) - schema.update_schema(partial_table) + schema.update_table(partial_table) assert len(updates) == 4 # we must have leaf table @@ -117,5 +117,5 @@ def _add_excludes(schema: Schema) -> None: bot_table = new_table("event_bot") bot_table.setdefault("filters", {})["excludes"] = ["re:^metadata", "re:^is_flagged$", "re:^data", "re:^custom_data"] bot_table["filters"]["includes"] = ["re:^data__custom$", "re:^custom_data__included_object__", "re:^metadata__elvl1__elvl2__"] - schema.update_schema(bot_table) + schema.update_table(bot_table) schema._compile_settings() diff --git a/tests/common/schema/test_inference.py b/tests/common/schema/test_inference.py index 8f17863c1b..19119b366e 100644 --- a/tests/common/schema/test_inference.py +++ b/tests/common/schema/test_inference.py @@ -96,7 +96,7 @@ def test_coerce_row(schema: Schema) -> None: assert new_row_1 == {"timestamp": pendulum.parse(timestamp_str), "confidence": 0.1, "value": 255, "number": Decimal("128.67")} # update schema - schema.update_schema(new_table) + schema.update_table(new_table) # no coercion on confidence row_2 = {"timestamp": timestamp_float, "confidence": 0.18721} @@ -119,7 +119,7 @@ def test_coerce_row(schema: Schema) -> None: assert new_columns[0]["name"] 
== "confidence__v_text" assert new_columns[0]["variant"] is True assert new_row_4 == {"timestamp": pendulum.parse(timestamp_str), "confidence__v_text": "STR"} - schema.update_schema(new_table) + schema.update_table(new_table) # add against variant new_row_4, new_table = schema.coerce_row("event_user", None, row_4) @@ -133,11 +133,11 @@ def test_coerce_row(schema: Schema) -> None: assert new_columns[0]["name"] == "confidence__v_bool" assert new_columns[0]["variant"] is True assert new_row_5 == {"confidence__v_bool": False} - schema.update_schema(new_table) + schema.update_table(new_table) # variant column clashes with existing column - create new_colbool_v_binary column that would be created for binary variant, but give it a type datetime _, new_table = schema.coerce_row("event_user", None, {"new_colbool": False, "new_colbool__v_timestamp": b"not fit"}) - schema.update_schema(new_table) + schema.update_table(new_table) with pytest.raises(CannotCoerceColumnException) as exc_val: # now pass the binary that would create binary variant - but the column is occupied by text type schema.coerce_row("event_user", None, {"new_colbool": pendulum.now()}) @@ -158,7 +158,7 @@ def test_coerce_row_iso_timestamp(schema: Schema) -> None: new_columns = list(new_table["columns"].values()) assert new_columns[0]["data_type"] == "timestamp" assert new_columns[0]["name"] == "timestamp" - schema.update_schema(new_table) + schema.update_table(new_table) # will coerce float row_2 = {"timestamp": 78172.128} @@ -181,7 +181,7 @@ def test_shorten_variant_column(schema: Schema) -> None: row_1 = {"timestamp": timestamp_float, "confidence": "0.1", "value": "0xFF", "number": Decimal("128.67")} _, new_table = schema.coerce_row("event_user", None, row_1) # schema assumes that identifiers are already normalized so confidence even if it is longer than 9 chars - schema.update_schema(new_table) + schema.update_table(new_table) assert "confidence" in schema.tables["event_user"]["columns"] # confidence_123456 # now variant is created and this will be normalized @@ -198,7 +198,7 @@ def test_coerce_complex_variant(schema: Schema) -> None: row = {"floatX": 78172.128, "confidenceX": 1.2, "strX": "STR"} new_row, new_table = schema.coerce_row("event_user", None, row) assert new_row == row - schema.update_schema(new_table) + schema.update_table(new_table) # add two more complex columns that should be coerced to text v_list = [1, 2, "3", {"complex": True}] @@ -213,7 +213,7 @@ def test_coerce_complex_variant(schema: Schema) -> None: assert c_new_columns[1]["data_type"] == "complex" assert "variant" not in c_new_columns[1] assert c_new_row["c_list"] == v_list - schema.update_schema(c_new_table) + schema.update_table(c_new_table) # add same row again c_new_row, c_new_table = schema.coerce_row("event_user", None, c_row) @@ -234,7 +234,7 @@ def test_coerce_complex_variant(schema: Schema) -> None: assert c_new_row_v["floatX__v_complex"] == v_list assert c_new_row_v["confidenceX__v_complex"] == v_dict assert c_new_row_v["strX"] == json.dumps(v_dict) - schema.update_schema(c_new_table_v) + schema.update_table(c_new_table_v) # add that row again c_row_v = {"floatX": v_list, "confidenceX": v_dict, "strX": v_dict} @@ -276,7 +276,7 @@ def test_supports_variant(schema: Schema) -> None: assert c_row["evm"] == Wei("21.37") assert new_table["columns"]["evm"]["data_type"] == "wei" assert "variant" not in new_table["columns"]["evm"] - schema.update_schema(new_table) + schema.update_table(new_table) # coerce row that should expand to variant c_row, new_table 
= schema.coerce_row("eth", None, normalized_rows[1][1]) assert isinstance(c_row["evm__v_str"], str) @@ -327,11 +327,11 @@ def __call__(self) -> Any: assert normalized_rows[1][1]["pv"]() == ("text", 21.37) # first normalized row fits into schema (pv is int) _, new_table = schema.coerce_row("pure_variant", None, normalized_rows[0][1]) - schema.update_schema(new_table) + schema.update_table(new_table) assert new_table["columns"]["pv"]["data_type"] == "bigint" _, new_table = schema.coerce_row("pure_variant", None, normalized_rows[1][1]) # we trick the normalizer to create text variant but actually provide double value - schema.update_schema(new_table) + schema.update_table(new_table) assert new_table["columns"]["pv__v_text"]["data_type"] == "double" # second row does not coerce: there's `pv__v_bool` field in it of type double but we already have a column that is text @@ -354,7 +354,7 @@ def test_corece_new_null_value(schema: Schema) -> None: def test_coerce_null_value_over_existing(schema: Schema) -> None: row = {"timestamp": 82178.1298812} new_row, new_table = schema.coerce_row("event_user", None, row) - schema.update_schema(new_table) + schema.update_table(new_table) row = {"timestamp": None} new_row, _ = schema.coerce_row("event_user", None, row) assert "timestamp" not in new_row @@ -363,7 +363,7 @@ def test_coerce_null_value_over_existing(schema: Schema) -> None: def test_corece_null_value_over_not_null(schema: Schema) -> None: row = {"timestamp": 82178.1298812} _, new_table = schema.coerce_row("event_user", None, row) - schema.update_schema(new_table) + schema.update_table(new_table) schema.get_table_columns("event_user", include_incomplete=True)["timestamp"]["nullable"] = False row = {"timestamp": None} with pytest.raises(CannotCoerceNullException): @@ -389,7 +389,7 @@ def test_update_schema_parent_missing(schema: Schema) -> None: tab1 = utils.new_table("tab1", parent_table_name="tab_parent") # tab_parent is missing in schema with pytest.raises(ParentTableNotFoundException) as exc_val: - schema.update_schema(tab1) + schema.update_table(tab1) assert exc_val.value.parent_table_name == "tab_parent" assert exc_val.value.table_name == "tab1" @@ -398,12 +398,12 @@ def test_update_schema_table_prop_conflict(schema: Schema) -> None: # parent table conflict tab1 = utils.new_table("tab1", write_disposition="append") tab_parent = utils.new_table("tab_parent", write_disposition="replace") - schema.update_schema(tab1) - schema.update_schema(tab_parent) + schema.update_table(tab1) + schema.update_table(tab_parent) tab1_u1 = deepcopy(tab1) tab1_u1["parent"] = "tab_parent" with pytest.raises(TablePropertiesConflictException) as exc_val: - schema.update_schema(tab1_u1) + schema.update_table(tab1_u1) assert exc_val.value.table_name == "tab1" assert exc_val.value.prop_name == "parent" assert exc_val.value.val1 is None @@ -414,12 +414,12 @@ def test_update_schema_column_conflict(schema: Schema) -> None: tab1 = utils.new_table("tab1", write_disposition="append", columns=[ {"name": "col1", "data_type": "text", "nullable": False}, ]) - schema.update_schema(tab1) + schema.update_table(tab1) tab1_u1 = deepcopy(tab1) # simulate column that had other datatype inferred tab1_u1["columns"]["col1"]["data_type"] = "bool" with pytest.raises(CannotCoerceColumnException) as exc_val: - schema.update_schema(tab1_u1) + schema.update_table(tab1_u1) assert exc_val.value.column_name == "col1" assert exc_val.value.from_type == "bool" assert exc_val.value.to_type == "text" @@ -448,7 +448,7 @@ def 
test_autodetect_convert_type(schema: Schema) -> None: assert c_row["evm"] == 1.0 assert isinstance(c_row["evm"], float) assert new_table["columns"]["evm"]["data_type"] == "double" - schema.update_schema(new_table) + schema.update_table(new_table) # add another row row = {"evm": Wei("21.37")} c_row, new_table = schema.coerce_row("eth", None, row) @@ -477,7 +477,7 @@ def __call__(self) -> Any: assert c_row["evm2__v_up"] == 22.0 assert isinstance(c_row["evm2__v_up"], float) assert new_table["columns"]["evm2__v_up"]["data_type"] == "double" - schema.update_schema(new_table) + schema.update_table(new_table) # add again row = {"evm2": AlwaysWei(22.2)} c_row, new_table = schema.coerce_row("eth", None, row) @@ -488,7 +488,7 @@ def __call__(self) -> Any: row = {"evm2": 22.1} _, new_table = schema.coerce_row("eth", None, row) assert new_table["columns"]["evm2"]["data_type"] == "double" - schema.update_schema(new_table) + schema.update_table(new_table) # and add variant again row = {"evm2": AlwaysWei(22.2)} # and this time variant will not be expanded @@ -505,7 +505,7 @@ def test_infer_on_incomplete_column(schema: Schema) -> None: incomplete_col["primary_key"] = True incomplete_col["x-special"] = "spec" table = utils.new_table("table", columns=[incomplete_col]) - schema.update_schema(table) + schema.update_table(table) # make sure that column is still incomplete and has no default hints assert schema.get_table("table")["columns"]["I"] == { 'name': 'I', diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 2856923319..222f7d679e 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -202,14 +202,17 @@ def test_diff_tables() -> None: # ignore identical table props existing = deepcopy(table) changed["write_disposition"] = "append" + changed["schema_contract_settings"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", "description": "new description", "write_disposition": "append", + "schema_contract_settings": "freeze", "columns": {} } existing["write_disposition"] = "append" + existing["schema_contract_settings"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 695b65de71..741861bf8e 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -253,7 +253,7 @@ def test_save_load_incomplete_column(schema: Schema, schema_storage_no_import: S incomplete_col["primary_key"] = True incomplete_col["x-special"] = "spec" table = utils.new_table("table", columns=[incomplete_col]) - schema.update_schema(table) + schema.update_table(table) schema_storage_no_import.save_schema(schema) schema_copy = schema_storage_no_import.load_schema("event") assert schema_copy.get_table("table")["columns"]["I"] == { @@ -307,7 +307,7 @@ def test_unknown_engine_upgrade() -> None: def test_preserve_column_order(schema: Schema, schema_storage: SchemaStorage) -> None: # python dicts are ordered from v3.6, add 50 column with random names update: List[TColumnSchema] = [schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50)] - schema.update_schema(utils.new_table("event_test_order", columns=update)) + schema.update_table(utils.new_table("event_test_order", columns=update)) def verify_items(table, update) -> None: assert [i[0] for i in table.items()] == list(table.keys()) == [u["name"] for u in update] @@ 
-322,7 +322,7 @@ def verify_items(table, update) -> None: verify_items(table, update) # add more columns update2: List[TColumnSchema] = [schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50)] - loaded_schema.update_schema(utils.new_table("event_test_order", columns=update2)) + loaded_schema.update_table(utils.new_table("event_test_order", columns=update2)) table = loaded_schema.get_table_columns("event_test_order") verify_items(table, update + update2) # save and load @@ -400,7 +400,7 @@ def test_filter_hints_no_table(schema_storage: SchemaStorage) -> None: # infer table, update schema for the empty bot table coerced_row, update = schema.coerce_row("event_bot", None, bot_case) - schema.update_schema(update) + schema.update_table(update) # not empty anymore assert schema.get_table_columns("event_bot") is not None @@ -471,7 +471,7 @@ def test_write_disposition(schema_storage: SchemaStorage) -> None: # child tables schema.get_table("event_user")["write_disposition"] = "replace" - schema.update_schema(utils.new_table("event_user__intents", "event_user")) + schema.update_table(utils.new_table("event_user__intents", "event_user")) assert schema.get_table("event_user__intents").get("write_disposition") is None assert utils.get_write_disposition(schema.tables, "event_user__intents") == "replace" schema.get_table("event_user__intents")["write_disposition"] = "append" @@ -479,7 +479,7 @@ def test_write_disposition(schema_storage: SchemaStorage) -> None: # same but with merge schema.get_table("event_bot")["write_disposition"] = "merge" - schema.update_schema(utils.new_table("event_bot__message", "event_bot")) + schema.update_table(utils.new_table("event_bot__message", "event_bot")) assert utils.get_write_disposition(schema.tables, "event_bot__message") == "merge" schema.get_table("event_bot")["write_disposition"] = "skip" assert utils.get_write_disposition(schema.tables, "event_bot__message") == "skip" @@ -601,12 +601,12 @@ def assert_new_schema_values(schema: Schema) -> None: def test_group_tables_by_resource(schema: Schema) -> None: - schema.update_schema(utils.new_table("a_events", columns=[])) - schema.update_schema(utils.new_table("b_events", columns=[])) - schema.update_schema(utils.new_table("c_products", columns=[], resource="products")) - schema.update_schema(utils.new_table("a_events__1", columns=[], parent_table_name="a_events")) - schema.update_schema(utils.new_table("a_events__1__2", columns=[], parent_table_name="a_events__1")) - schema.update_schema(utils.new_table("b_events__1", columns=[], parent_table_name="b_events")) + schema.update_table(utils.new_table("a_events", columns=[])) + schema.update_table(utils.new_table("b_events", columns=[])) + schema.update_table(utils.new_table("c_products", columns=[], resource="products")) + schema.update_table(utils.new_table("a_events__1", columns=[], parent_table_name="a_events")) + schema.update_table(utils.new_table("a_events__1__2", columns=[], parent_table_name="a_events__1")) + schema.update_table(utils.new_table("b_events__1", columns=[], parent_table_name="b_events")) # All resources without filter expected_tables = { @@ -627,8 +627,8 @@ def test_group_tables_by_resource(schema: Schema) -> None: } # With resources that has many top level tables - schema.update_schema(utils.new_table("mc_products", columns=[], resource="products")) - schema.update_schema(utils.new_table("mc_products__sub", columns=[], parent_table_name="mc_products")) + schema.update_table(utils.new_table("mc_products", columns=[], 
resource="products")) + schema.update_table(utils.new_table("mc_products__sub", columns=[], parent_table_name="mc_products")) result = utils.group_tables_by_resource(schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("products"))) # both tables with resource "products" must be here assert result == {'products': [ diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 8e34370180..299d144d74 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -62,7 +62,7 @@ def test_infer_column_bumps_version() -> None: schema = Schema("event") row = {"floatX": 78172.128, "confidenceX": 1.2, "strX": "STR"} _, new_table = schema.coerce_row("event_user", None, row) - schema.update_schema(new_table) + schema.update_table(new_table) # schema version will be recomputed assert schema.version == 2 assert schema.version_hash is not None @@ -70,7 +70,7 @@ def test_infer_column_bumps_version() -> None: # another table _, new_table = schema.coerce_row("event_bot", None, row) - schema.update_schema(new_table) + schema.update_table(new_table) # version is still 2 (increment of 1) assert schema.version == 2 # but the hash changed diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 88d3acfc0f..a4b6c5c89f 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -78,7 +78,7 @@ def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: Sch # evolve schema row = {"floatX": 78172.128, "confidenceX": 1.2, "strX": "STR"} _, new_table = storage_schema.coerce_row("event_user", None, row) - storage_schema.update_schema(new_table) + storage_schema.update_table(new_table) storage.save_schema(storage_schema) # now use synced storage to load schema again reloaded_schema = synced_storage.load_schema("ethereum") diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 2965bd8866..8f4184731b 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -508,7 +508,7 @@ def created_ad_hoc(): schema = dlt.current.source_schema() assert schema.name == "created_ad_hoc" # modify schema in place - schema.update_schema(new_table("source_table")) + schema.update_table(new_table("source_table")) return dlt.resource([1, 2, 3], name="res") _assert_source_schema(created_ad_hoc(), "created_ad_hoc") @@ -519,7 +519,7 @@ def created_explicit(): schema = dlt.current.source_schema() assert schema.name == "explicit" # modify schema in place - schema.update_schema(new_table("source_table")) + schema.update_table(new_table("source_table")) return dlt.resource([1, 2, 3], name="res") _assert_source_schema(created_explicit(), "explicit") @@ -530,7 +530,7 @@ def created_global(): schema = dlt.current.source_schema() assert schema.name == "global" # modify schema in place - schema.update_schema(new_table("source_table")) + schema.update_table(new_table("source_table")) return dlt.resource([1, 2, 3], name="res") _assert_source_schema(created_global(), "global") @@ -575,7 +575,7 @@ def schema_test(): s = schema_test() schema = s.discover_schema() - schema.update_schema(new_table("table")) + schema.update_table(new_table("table")) s = schema_test() assert "table" not in s.discover_schema().tables diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index 9848105f1a..a41deb2157 100644 --- a/tests/load/pipeline/test_restore_state.py +++ 
b/tests/load/pipeline/test_restore_state.py @@ -59,7 +59,7 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text", "nullable": False}, **STATE_TABLE_COLUMNS }) - schema.update_schema(schema.normalize_table_identifiers(resource.compute_table_schema())) + schema.update_table(schema.normalize_table_identifiers(resource.compute_table_schema())) # do not bump version here or in sync_schema, dlt won't recognize that schema changed and it won't update it in storage # so dlt in normalize stage infers _state_version table again but with different column order and the column order in schema is different # then in database. parquet is created in schema order and in Redshift it must exactly match the order. diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 24a2a9de91..24c872ec41 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -389,6 +389,16 @@ class Items(BaseModel): name: str amount: int + + @dlt.resource(name="items", columns=Items) + def get_items(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 50 + }] + + @dlt.resource(name="items", columns=Items) def get_items_variant(): yield from [{ @@ -415,26 +425,29 @@ def get_items_subtable(): "sub": [{"hello": "dave"}] }] + pipeline.run([get_items()]) + pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + # disallow variants - with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "freeze"}) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) + pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "discard_row"}) + pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 # without settings it will pass - pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "evolve"}) + pipeline.run([get_items_variant()], schema_contract_settings="evolve") + pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 # disallow new col - with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([get_items_new_col()], schema_contract_settings={"column": "freeze"}) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) + pipeline.run([get_items_new_col()], schema_contract_settings={"column": "discard_row"}) + pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 # without settings it will pass - pipeline.run([get_items_new_col()], schema_contract_settings={"column": "evolve"}) + pipeline.run([get_items_new_col()], schema_contract_settings="evolve") + pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 # disallow new tables - with pytest.raises(PipelineStepFailed) as py_ex: - pipeline.run([get_items_subtable()], schema_contract_settings={"table": "freeze"}) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) + pipeline.run([get_items_subtable()], schema_contract_settings={"table": "discard_row"}) + pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 # without settings it will pass - pipeline.run([get_items_subtable()], schema_contract_settings={"table": "evolve"}) \ No newline at end of file + pipeline.run([get_items_subtable()], schema_contract_settings="evolve") + pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 133921d503..f77f8577b6 
100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -164,7 +164,7 @@ def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: # this will be not null record_hash = schema._infer_column("_dlt_id", "m,i0392903jdlkasjdlk") assert record_hash["unique"] is True - schema.update_schema(new_table(table_name, columns=[timestamp, sender_id, record_hash])) + schema.update_table(new_table(table_name, columns=[timestamp, sender_id, record_hash])) schema.bump_version() schema_update = client.update_stored_schema() # check hints in schema update @@ -186,7 +186,7 @@ def test_schema_update_create_table_bigquery(client: SqlJobClientBase) -> None: sender_id = schema._infer_column("sender_id", "982398490809324") # this will be not null record_hash = schema._infer_column("_dlt_id", "m,i0392903jdlkasjdlk") - schema.update_schema(new_table("event_test_table", columns=[timestamp, sender_id, record_hash])) + schema.update_table(new_table("event_test_table", columns=[timestamp, sender_id, record_hash])) schema.bump_version() schema_update = client.update_stored_schema() # check hints in schema update @@ -210,7 +210,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: schema = client.schema col1 = schema._infer_column("col1", "string") table_name = "event_test_table" + uniq_id() - schema.update_schema(new_table(table_name, columns=[col1])) + schema.update_table(new_table(table_name, columns=[col1])) schema.bump_version() schema_update = client.update_stored_schema() assert table_name in schema_update @@ -218,7 +218,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: assert schema_update[table_name]["columns"]["col1"]["data_type"] == "text" # with single alter table col2 = schema._infer_column("col2", 1) - schema.update_schema(new_table(table_name, columns=[col2])) + schema.update_table(new_table(table_name, columns=[col2])) schema.bump_version() schema_update = client.update_stored_schema() assert len(schema_update) == 1 @@ -229,7 +229,7 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: col3 = schema._infer_column("col3", 1.2) col4 = schema._infer_column("col4", 182879721.182912) col4["data_type"] = "timestamp" - schema.update_schema(new_table(table_name, columns=[col3, col4])) + schema.update_table(new_table(table_name, columns=[col3, col4])) schema.bump_version() schema_update = client.update_stored_schema() assert len(schema_update[table_name]["columns"]) == 2 @@ -304,7 +304,7 @@ def test_drop_tables(client: SqlJobClientBase) -> None: def test_get_storage_table_with_all_types(client: SqlJobClientBase) -> None: schema = client.schema table_name = "event_test_table" + uniq_id() - schema.update_schema(new_table(table_name, columns=TABLE_UPDATE)) + schema.update_table(new_table(table_name, columns=TABLE_UPDATE)) schema.bump_version() schema_update = client.update_stored_schema() # we have all columns in the update @@ -338,7 +338,7 @@ def test_preserve_column_order(client: SqlJobClientBase) -> None: columns = deepcopy(TABLE_UPDATE) random.shuffle(columns) print(columns) - schema.update_schema(new_table(table_name, columns=columns)) + schema.update_table(new_table(table_name, columns=columns)) schema.bump_version() def _assert_columns_order(sql_: str) -> None: @@ -424,7 +424,7 @@ def test_load_with_all_types(client: SqlJobClientBase, write_disposition: str, f pytest.skip("preferred loader file format not set, destination will only work with staging") table_name = "event_test_table" + uniq_id() # we 
should have identical content with all disposition types - client.schema.update_schema(new_table(table_name, write_disposition=write_disposition, columns=TABLE_UPDATE)) + client.schema.update_table(new_table(table_name, write_disposition=write_disposition, columns=TABLE_UPDATE)) client.schema.bump_version() client.update_stored_schema() @@ -459,12 +459,12 @@ def test_write_dispositions(client: SqlJobClientBase, write_disposition: str, re os.environ['DESTINATION__REPLACE_STRATEGY'] = replace_strategy table_name = "event_test_table" + uniq_id() - client.schema.update_schema( + client.schema.update_table( new_table(table_name, write_disposition=write_disposition, columns=TABLE_UPDATE) ) child_table = client.schema.naming.make_path(table_name, "child") # add child table without write disposition so it will be inferred from the parent - client.schema.update_schema( + client.schema.update_table( new_table(child_table, columns=TABLE_UPDATE, parent_table_name=table_name) ) client.schema.bump_version() @@ -590,7 +590,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: pytest.skip("preferred loader file format not set, destination will only work with staging") user_table = load_table("event_user")["event_user"] - client.schema.update_schema(new_table("event_user", columns=user_table.values())) + client.schema.update_table(new_table("event_user", columns=user_table.values())) client.schema.bump_version() schema_update = client.update_stored_schema() assert len(schema_update) > 0 @@ -644,7 +644,7 @@ def prepare_schema(client: SqlJobClientBase, case: str) -> None: # use first row to infer table table: TTableSchemaColumns = {k: client.schema._infer_column(k, v) for k, v in rows[0].items()} table_name = f"event_{case}_{uniq_id()}" - client.schema.update_schema(new_table(table_name, columns=table.values())) + client.schema.update_table(new_table(table_name, columns=table.values())) client.schema.bump_version() client.update_stored_schema() return rows, table_name diff --git a/tests/load/utils.py b/tests/load/utils.py index 6790a816fb..bcd95a31a2 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -187,7 +187,7 @@ def prepare_table(client: JobClientBase, case_name: str = "event_user", table_na user_table_name = table_name + uniq_id() else: user_table_name = table_name - client.schema.update_schema(new_table(user_table_name, columns=user_table.values())) + client.schema.update_table(new_table(user_table_name, columns=user_table.values())) client.schema.bump_version() client.update_stored_schema() return user_table_name diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index 8ef3ddd660..5cc823769b 100644 --- a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -63,7 +63,7 @@ def file_storage() -> FileStorage: def test_all_data_types(client: WeaviateClient, write_disposition: str, file_storage: FileStorage) -> None: class_name = "AllTypes" # we should have identical content with all disposition types - client.schema.update_schema(new_table(class_name, write_disposition=write_disposition, columns=TABLE_UPDATE)) + client.schema.update_table(new_table(class_name, write_disposition=write_disposition, columns=TABLE_UPDATE)) client.schema.bump_version() client.update_stored_schema() @@ -103,7 +103,7 @@ def test_case_sensitive_properties_create(client: WeaviateClient) -> None: "nullable": False }, ] - 
client.schema.update_schema(client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))) + client.schema.update_table(client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))) client.schema.bump_version() with pytest.raises(PropertyNameConflict): client.update_stored_schema() @@ -124,7 +124,7 @@ def test_case_insensitive_properties_create(ci_client: WeaviateClient) -> None: "nullable": False }, ] - ci_client.schema.update_schema(ci_client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))) + ci_client.schema.update_table(ci_client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))) ci_client.schema.bump_version() ci_client.update_stored_schema() _, table_columns = ci_client.get_storage_table("ColClass") @@ -146,13 +146,13 @@ def test_case_sensitive_properties_add(client: WeaviateClient) -> None: "nullable": False }, ] - client.schema.update_schema( + client.schema.update_table( client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) ) client.schema.bump_version() client.update_stored_schema() - client.schema.update_schema( + client.schema.update_table( client.schema.normalize_table_identifiers(new_table(class_name, columns=table_update)) ) client.schema.bump_version() @@ -172,7 +172,7 @@ def test_load_case_sensitive_data(client: WeaviateClient, file_storage: FileStor "data_type": "bigint", "nullable": False }} - client.schema.update_schema(new_table(class_name, columns=[table_create["col1"]])) + client.schema.update_table(new_table(class_name, columns=[table_create["col1"]])) client.schema.bump_version() client.update_stored_schema() # prepare a data item where is name clash due to Weaviate being CI @@ -194,7 +194,7 @@ def test_load_case_sensitive_data_ci(ci_client: WeaviateClient, file_storage: Fi "data_type": "bigint", "nullable": False }} - ci_client.schema.update_schema(new_table(class_name, columns=[table_create["col1"]])) + ci_client.schema.update_table(new_table(class_name, columns=[table_create["col1"]])) ci_client.schema.bump_version() ci_client.update_stored_schema() # prepare a data item where is name clash due to Weaviate being CI From db3f4478f763a87c8dfe4564fec1b953e1345877 Mon Sep 17 00:00:00 2001 From: Dave Date: Sun, 17 Sep 2023 23:46:53 +0200 Subject: [PATCH 28/73] more work --- dlt/common/schema/schema.py | 1 - dlt/extract/extract.py | 6 +- dlt/normalize/normalize.py | 3 +- tests/load/test_freeze_and_data_contract.py | 87 +++++++++++++++++++++ 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 3bbdbd51e3..92bb951702 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -254,7 +254,6 @@ def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name for item in list(row.keys()): for item in list(row.keys()): # if this is a new column for an existing table... 
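# Minimal, hypothetical sketch of the column-level gating implemented around this hunk;
# `gate_new_column`, `known_columns` and the plain Exception are illustrative stand-ins,
# not dlt API. It only shows the intended effect of the contract modes on a row that
# carries a column the stored schema does not know yet.
from typing import Optional


def gate_new_column(mode: str, row: dict, known_columns: set, new_col: str) -> Optional[dict]:
    if new_col in known_columns or mode == "evolve":
        return row  # schema is allowed to evolve, keep the row unchanged
    if mode == "discard_value":
        return {k: v for k, v in row.items() if k != new_col}  # drop only the offending value
    if mode == "discard_row":
        return None  # drop the whole row
    # "freeze": reject the change, analogous to raising SchemaFrozenException
    raise Exception(f"column {new_col} is frozen")


print(gate_new_column("discard_value", {"id": 1, "extra": "x"}, {"id"}, "extra"))  # {'id': 1}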
- if table_exists and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") if contract_modes["column"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index ae60c25dc4..fc1731c354 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -191,9 +191,13 @@ def extract_with_schema( extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) # iterate over all items in the pipeline and update the schema if dynamic table hints were present + # original_schema = schema.clone() for _, partials in extractor.items(): for partial in partials: - schema.update_table(schema.normalize_table_identifiers(partial)) + normalized_partial = schema.normalize_table_identifiers(partial) + # contract_modes = schema.resolve_contract_settings_for_table(normalized_partial.get("parent"), normalized_partial["name"]) + # _, normalized_partial = original_schema.apply_schema_contract(contract_modes, normalized_partial["name"], {}, normalized_partial) + schema.update_table(normalized_partial) return extract_id diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 7525fd588e..ff408f0271 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -133,6 +133,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, items_count = 0 row_counts: TRowCount = {} schema_contract_modes: TSchemaContractModes = None + original_schema = schema.clone() for item in items: for (table_name, parent_table), row in schema.normalize_data_item(item, load_id, root_table_name): @@ -151,7 +152,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, row, partial_table = schema.coerce_row(table_name, parent_table, row) # if we detect a migration, the check update if partial_table: - row, partial_table = schema.apply_schema_contract(schema_contract_modes, table_name, row, partial_table) + row, partial_table = original_schema.apply_schema_contract(schema_contract_modes, table_name, row, partial_table) if not row: continue diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 24c872ec41..be0bfd2793 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -451,3 +451,90 @@ def get_items_subtable(): # without settings it will pass pipeline.run([get_items_subtable()], schema_contract_settings="evolve") pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + + +def test_different_objects_in_one_load() -> None: + + pipeline = get_pipeline() + + @dlt.resource(name="items", schema_contract_settings={"column": "freeze", "table":"evolve"}) + def get_items(): + yield { + "id": 1, + "name": "dave", + "amount": 50 + } + yield { + "id": 2, + "name": "dave", + "amount": 50, + "new_column": "some val" + } + + pipeline.run([get_items()]) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 + + +@pytest.mark.parametrize("table_mode", ["discard_row", "evolve"]) +def test_dynamic_tables(table_mode: str) -> None: + + pipeline = get_pipeline() + + @dlt.resource(name="items", table_name=lambda i: i["table"], schema_contract_settings={"table": table_mode}) + def get_items(): + yield { + "id": 1, + "table": "one", + 
} + yield { + "id": 2, + "table": "two", + "new_column": "some val" + } + + pipeline.run([get_items()]) + assert pipeline.last_trace.last_normalize_info.row_counts["one"] == (1 if table_mode == "evolve" else 0) + assert pipeline.last_trace.last_normalize_info.row_counts["two"] == (1 if table_mode == "evolve" else 0) + + +@pytest.mark.parametrize("column_mode", ["discard_row", "evolve"]) +def test_defined_column_in_new_table(column_mode: str) -> None: + pipeline = get_pipeline() + + @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}], schema_contract_settings={"column": column_mode}) + def get_items(): + yield { + "id": 1, + "key": "value", + } + + pipeline.run([get_items()]) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + + +@pytest.mark.parametrize("column_mode", ["discard_row", "evolve"]) +def test_dynamic_columns(column_mode: str) -> None: + + pipeline = get_pipeline() + + def columns(item): + if item["id"] == 1: + return [{"name": "col1", "data_type": "text", "nullable": True}] + if item["id"] == 2: + return [{"name": "col2", "data_type": "bigint", "nullable": True}] + + @dlt.resource(name="items", table_name=lambda i: "items", schema_contract_settings={"column": column_mode}) + def get_items(): + yield { + "id": 1, + "key": "value", + } + yield { + "id": 2, + "key": "value", + } + + items = get_items() + items.apply_hints(columns=columns) + pipeline.run([get_items()]) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 From c17577a5dd72734b901320c99fc3518305943e41 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 18 Sep 2023 15:54:35 +0200 Subject: [PATCH 29/73] tests update --- dlt/common/schema/schema.py | 4 +- dlt/common/schema/utils.py | 7 ++- dlt/common/validation.py | 2 +- dlt/normalize/normalize.py | 1 + tests/load/test_freeze_and_data_contract.py | 69 ++++++++++++++++++--- 5 files changed, 73 insertions(+), 10 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 92bb951702..e43c5a0410 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -241,7 +241,7 @@ def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table - table_exists = table_name in self.tables and self.get_table_columns(table_name, include_incomplete=False) + table_exists = table_name in self.tables and len(self.get_table_columns(table_name, include_incomplete=False)) > 0 # check case where we have a new table if not table_exists: @@ -294,9 +294,11 @@ def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchem table = self._schema_tables.get(table_name) if table is None: # add the whole new table to SchemaTables + print(f"NEW update_table: {table}") self._schema_tables[table_name] = partial_table else: # merge tables performing additional checks + print(f"MERGE merge_table: {table}") partial_table = utils.merge_tables(table, partial_table) return partial_table diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 6afc34f6d4..5f6de02593 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -657,7 +657,12 @@ def new_table( # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name - table["schema_contract_settings"] = schema_contract_settings or {} + # TODO: do not write empty settings or they land 
in the schema file like this + # description: Created by DLT. Tracks completed loads + # schema_contract_settings: {} + # alternatively use apply/remove defaults in utils to remove/add on save! + if schema_contract_settings: + table["schema_contract_settings"] = schema_contract_settings if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 01e1f9f5b9..b746fda361 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -67,7 +67,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: except DictValidationException: pass if not has_passed: - type_names = [ut.__name__ for ut in union_types] + type_names = [str(get_args(ut)) if is_literal_type(ut) else ut.__name__ for ut in union_types] raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__}. One of these types expected: {', '.join(type_names)}.", path, pk, pv) elif is_literal_type(t): a_l = get_args(t) diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index ff408f0271..b3df9c46b5 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -133,6 +133,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, items_count = 0 row_counts: TRowCount = {} schema_contract_modes: TSchemaContractModes = None + # TODO: better to mark tables as new and not clone original_schema = schema.clone() for item in items: diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index be0bfd2793..a24903b93f 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -475,12 +475,16 @@ def get_items(): assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 -@pytest.mark.parametrize("table_mode", ["discard_row", "evolve"]) +@pytest.mark.parametrize("table_mode", ["discard_row", "evolve", "raise"]) def test_dynamic_tables(table_mode: str) -> None: pipeline = get_pipeline() - @dlt.resource(name="items", table_name=lambda i: i["table"], schema_contract_settings={"table": table_mode}) + # adding columns with a data type makes this columns complete which makes this table complete -> it fails in the normalize because + # the tables is NOT new according to normalizer so the row is not discarded + # remove that and it will pass because the table contains just one incomplete column so it is incomplete so it is treated as new + # if you uncomment update code in the extract the problem probably goes away + @dlt.resource(name="items", table_name=lambda i: i["table"], schema_contract_settings={"table": table_mode}, columns={"id": {"data_type": "bigint"}}) def get_items(): yield { "id": 1, @@ -512,16 +516,66 @@ def get_items(): assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 -@pytest.mark.parametrize("column_mode", ["discard_row", "evolve"]) -def test_dynamic_columns(column_mode: str) -> None: +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_new_column_from_hint_and_data(column_mode: str) -> None: + + pipeline = get_pipeline() + + # we define complete column on id, this creates a complete table + # normalizer does not know that it is a new table and discards the row + # and it also excepts on column freeze + + @dlt.resource( + name="items", + schema_contract_settings={"column": column_mode}, + columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) + def get_items(): + yield { + "id": 1, 
+ "key": "value", + } + pipeline.run([get_items()]) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_two_new_columns_from_two_rows(column_mode: str) -> None: + + pipeline = get_pipeline() + + # this creates a complete table in first row + # and adds a new column to complete tables in 2nd row + # the test does not fail only because you clone schema in normalize + + @dlt.resource( + schema_contract_settings={"column": column_mode} + ) + def items(): + yield { + "id": 1, + } + yield { + "id": 1, + "key": "value", + } + pipeline.run([items()]) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_dynamic_new_columns(column_mode: str) -> None: pipeline = get_pipeline() + # fails because dlt is not able to add _dlt_load_id to tables. I think we should do an exception for those + # 1. schema.dlt_tables() - everything evolve + # 2. is_dlt_column (I hope we have helper) - column evolve, data_type freeze + def columns(item): if item["id"] == 1: - return [{"name": "col1", "data_type": "text", "nullable": True}] + return [{"name": "key", "data_type": "text", "nullable": True}] if item["id"] == 2: - return [{"name": "col2", "data_type": "bigint", "nullable": True}] + return [{"name": "id", "data_type": "bigint", "nullable": True}] @dlt.resource(name="items", table_name=lambda i: "items", schema_contract_settings={"column": column_mode}) def get_items(): @@ -536,5 +590,6 @@ def get_items(): items = get_items() items.apply_hints(columns=columns) - pipeline.run([get_items()]) + # apply hints apply to `items` not the original resource, so doing get_items() below removed them completely + pipeline.run(items) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 From fb1d22403be4b3691ca5229267a5942a4204ca81 Mon Sep 17 00:00:00 2001 From: Dave Date: Sat, 23 Sep 2023 18:21:46 +0200 Subject: [PATCH 30/73] almost there --- dlt/common/schema/schema.py | 14 +++-- dlt/common/schema/typing.py | 1 + dlt/common/schema/utils.py | 4 ++ dlt/extract/extract.py | 10 ++-- dlt/extract/schema.py | 13 +++-- dlt/normalize/normalize.py | 19 ++++--- dlt/pipeline/pipeline.py | 1 - tests/load/test_freeze_and_data_contract.py | 60 +++++++++------------ 8 files changed, 63 insertions(+), 59 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index e43c5a0410..5d03c6b4a1 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -205,6 +205,7 @@ def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: return settings # find table settings + # TODO: get root table... 
table_with_settings = parent_table or table_name # modes @@ -216,8 +217,10 @@ def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: return settings + def is_table_populated(self, table_name: str) -> bool: + return table_name in self.tables and (self.tables[table_name].get("populated") is True) - def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: + def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, table_populated: bool) -> Tuple[DictStrAny, TPartialTableSchema]: """ Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out columns for both the data and the schema_update or reject the update completely, depending on the mode. An example settings could be: @@ -241,10 +244,8 @@ def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table - table_exists = table_name in self.tables and len(self.get_table_columns(table_name, include_incomplete=False)) > 0 - # check case where we have a new table - if not table_exists: + if not table_populated: if contract_modes["table"] in ["discard_row", "discard_value"]: return None, None if contract_modes["table"] == "freeze": @@ -254,7 +255,7 @@ def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name for item in list(row.keys()): for item in list(row.keys()): # if this is a new column for an existing table... - if table_exists and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): + if table_populated and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") if contract_modes["column"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): row.pop(item) @@ -262,7 +263,6 @@ def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name elif contract_modes["column"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): return None, None elif is_variant and contract_modes["data_type"] == "freeze": - print(contract_modes) raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name} data_types are frozen.") elif contract_modes["column"] == "freeze": raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") @@ -294,11 +294,9 @@ def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchem table = self._schema_tables.get(table_name) if table is None: # add the whole new table to SchemaTables - print(f"NEW update_table: {table}") self._schema_tables[table_name] = partial_table else: # merge tables performing additional checks - print(f"MERGE merge_table: {table}") partial_table = utils.merge_tables(table, partial_table) return partial_table diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index d19a730b46..9919aec460 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -90,6 +90,7 @@ class 
TTableSchema(TypedDict, total=False): filters: Optional[TRowFilters] columns: TTableSchemaColumns resource: Optional[str] + populated: Optional[bool] class TPartialTableSchema(TTableSchema): diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 5f6de02593..ac161356c2 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -498,6 +498,8 @@ def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTabl # aggregate schema updates aggregated_table = aggregated_update.setdefault(table_name, partial_table) aggregated_table["columns"].update(partial_table["columns"]) + if partial_table.get("populated") is True: + aggregated_table["populated"] = True return aggregated_update @@ -642,6 +644,7 @@ def new_table( validate_schema: bool = False, resource: str = None, schema_contract_settings: TSchemaContractSettings = None, + populated: bool = None ) -> TTableSchema: table: TTableSchema = { @@ -663,6 +666,7 @@ def new_table( # alternatively use apply/remove defaults in utils to remove/add on save! if schema_contract_settings: table["schema_contract_settings"] = schema_contract_settings + table["populated"] = populated if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index fc1731c354..36a134ba25 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -191,13 +191,15 @@ def extract_with_schema( extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) # iterate over all items in the pipeline and update the schema if dynamic table hints were present - # original_schema = schema.clone() + original_schema = schema.clone() for _, partials in extractor.items(): for partial in partials: normalized_partial = schema.normalize_table_identifiers(partial) - # contract_modes = schema.resolve_contract_settings_for_table(normalized_partial.get("parent"), normalized_partial["name"]) - # _, normalized_partial = original_schema.apply_schema_contract(contract_modes, normalized_partial["name"], {}, normalized_partial) - schema.update_table(normalized_partial) + contract_modes = schema.resolve_contract_settings_for_table(normalized_partial.get("parent"), normalized_partial["name"]) + table_populated = schema.is_table_populated(normalized_partial["name"]) + _, normalized_partial = original_schema.apply_schema_contract(contract_modes, normalized_partial["name"], {}, normalized_partial, table_populated) + if normalized_partial: + schema.update_table(normalized_partial) return extract_id diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index bcbd17707f..6ebf117720 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -24,7 +24,8 @@ class TTableSchemaTemplate(TypedDict, total=False): primary_key: TTableHintTemplate[TColumnNames] merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] - schema_contract_settings: TSchemaContractSettings + schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] + populated: TTableHintTemplate[bool] class DltResourceSchema: @@ -98,6 +99,7 @@ def apply_hints( merge_key: TTableHintTemplate[TColumnNames] = None, incremental: Incremental[Any] = None, schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + populated: TTableHintTemplate[bool] = None ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. 
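# Sketch of setting the two hints added above from user code, assuming the apply_hints
# signature introduced in this patch; the resource body and values are made up.
import dlt


@dlt.resource(name="items", write_disposition="append")
def items():
    yield {"id": 1, "name": "dave"}


resource = items()
# freeze columns for this table only and mark the table as already populated
resource.apply_hints(schema_contract_settings={"column": "freeze"}, populated=True)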
@@ -114,7 +116,7 @@ def apply_hints( t = None if not self._table_schema_template: # if there's no template yet, create and set new one - t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract_settings) + t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract_settings, populated) else: # set single hints t = deepcopy(self._table_schema_template) @@ -128,10 +130,12 @@ def apply_hints( t["parent"] = parent_table_name else: t.pop("parent", None) + if populated is not None: + t["populated"] = populated if write_disposition: t["write_disposition"] = write_disposition if schema_contract_settings: - t["schema_contract_settings"] = schema_contract_settings # type: ignore + t["schema_contract_settings"] = schema_contract_settings if columns is not None: # if callable then override existing if callable(columns) or callable(t["columns"]): @@ -212,6 +216,7 @@ def new_table_template( primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + populated: TTableHintTemplate[bool] = None ) -> TTableSchemaTemplate: if not table_name: raise TableNameMissing() @@ -221,7 +226,7 @@ def new_table_template( if not callable(columns): columns = columns.values() # type: ignore # create a table schema template where hints can be functions taking TDataItem - new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract_settings=schema_contract_settings) # type: ignore + new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract_settings=schema_contract_settings, populated=populated) # type: ignore if primary_key: new_template["primary_key"] = primary_key if merge_key: diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index b3df9c46b5..63214041ec 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -16,7 +16,7 @@ from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration from dlt.common.typing import TDataItem -from dlt.common.schema import TSchemaUpdate, Schema +from dlt.common.schema import TSchemaUpdate, Schema, utils from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import NormalizeInfo from dlt.common.utils import chunks, TRowCount, merge_row_count, increase_row_count @@ -133,13 +133,13 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, items_count = 0 row_counts: TRowCount = {} schema_contract_modes: TSchemaContractModes = None - # TODO: better to mark tables as new and not clone - original_schema = schema.clone() + is_table_populated: bool = False for item in items: for (table_name, parent_table), row in schema.normalize_data_item(item, load_id, root_table_name): if not schema_contract_modes: schema_contract_modes = schema.resolve_contract_settings_for_table(parent_table, table_name) + is_table_populated = schema.is_table_populated(table_name) # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) @@ -151,9 +151,9 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: 
str, row[k] = custom_pua_decode(v) # type: ignore # coerce row of values into schema table, generating partial table with new columns if any row, partial_table = schema.coerce_row(table_name, parent_table, row) - # if we detect a migration, the check update + # if we detect a migration, check schema contract if partial_table: - row, partial_table = original_schema.apply_schema_contract(schema_contract_modes, table_name, row, partial_table) + row, partial_table = schema.apply_schema_contract(schema_contract_modes, table_name, row, partial_table, is_table_populated) if not row: continue @@ -275,11 +275,16 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMap def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files: Sequence[str]) -> None: schema = Normalize.load_or_create_schema(self.schema_storage, schema_name) - # process files in parallel or in single thread, depending on map_f schema_updates, row_counts = map_f(schema, load_id, files) + # set all populated tables to populated + populated_updated = False + for table_name, count in row_counts.items(): + if count > 0 and schema.tables[table_name]["populated"] is not True: + schema.tables[table_name]["populated"] = True + populated_updated = True # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) - if len(schema_updates) > 0: + if len(schema_updates) > 0 or populated_updated: logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") # schema is updated, save it to schema volume self.schema_storage.save_schema(schema) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 2e96224e0d..c03fea8915 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -269,7 +269,6 @@ def extract( workers: int = None, schema_contract_settings: TSchemaContractSettings = None ) -> ExtractInfo: - print(schema_contract_settings) """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. 
See `run` method for the arguments' description.""" # create extract storage to which all the sources will be extracted storage = ExtractorStorage(self._normalize_storage_config) diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index a24903b93f..4a3c9ee1ee 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -3,6 +3,7 @@ from dlt.common.utils import uniq_id from typing import Any from dlt.extract.source import DltSource, DltResource +import contextlib from tests.load.pipeline.utils import load_table_counts from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @@ -18,6 +19,15 @@ LOCATIONS = ["source", "resource", "override"] SCHEMA_ELEMENTS = ["table", "column", "data_type"] +@contextlib.contextmanager +def raises_frozen_exception(check_raise: bool = True) -> Any: + if not check_raise: + yield + return + with pytest.raises(PipelineStepFailed) as py_exc: + yield + assert isinstance(py_exc.value.__context__, SchemaFrozenException) + def items(settings: TSchemaContractSettings) -> Any: @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) @@ -116,7 +126,7 @@ def source() -> DltResource: assert pipeline.default_schema._settings.get("schema_contract_settings", {}) == (settings.get("override") or settings.get("source")) # check items table settings - assert pipeline.default_schema.tables["items"]["schema_contract_settings"] == (settings.get("override") or settings.get("resource") or {}) + assert pipeline.default_schema.tables["items"].get("schema_contract_settings", {}) == (settings.get("override") or settings.get("resource") or {}) def get_pipeline(): import duckdb @@ -149,11 +159,7 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # test adding new subtable - if contract_setting == "freeze": - with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_subtable, full_settings) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) - else: + with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, items_with_subtable, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) @@ -161,11 +167,7 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if contract_setting in ["evolve"] else 0) # test adding new table - if contract_setting == "freeze": - with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, new_items, full_settings) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) - else: + with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, new_items, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) @@ -199,11 +201,7 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non assert table_counts[NEW_ITEMS_TABLE] == 10 # test adding new column - if contract_setting == "freeze": - with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_new_column, full_settings) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) - 
else: + with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, items_with_new_column, full_settings) if contract_setting == "evolve": @@ -214,11 +212,7 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non assert table_counts["items"] == (30 if contract_setting in ["evolve", "discard_value"] else 20) # test adding variant column - if contract_setting == "freeze": - with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_variant, full_settings) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) - else: + with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, items_with_variant, full_settings) if contract_setting == "evolve": @@ -262,11 +256,7 @@ def test_freeze_variants(contract_setting: str, setting_location: str) -> None: assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # test adding variant column - if contract_setting == "freeze": - with pytest.raises(PipelineStepFailed) as py_ex: - run_resource(pipeline, items_with_variant, full_settings) - assert isinstance(py_ex.value.__context__, SchemaFrozenException) - else: + with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, items_with_variant, full_settings) if contract_setting == "evolve": @@ -475,7 +465,7 @@ def get_items(): assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 -@pytest.mark.parametrize("table_mode", ["discard_row", "evolve", "raise"]) +@pytest.mark.parametrize("table_mode", ["discard_row", "evolve", "freeze"]) def test_dynamic_tables(table_mode: str) -> None: pipeline = get_pipeline() @@ -484,7 +474,7 @@ def test_dynamic_tables(table_mode: str) -> None: # the tables is NOT new according to normalizer so the row is not discarded # remove that and it will pass because the table contains just one incomplete column so it is incomplete so it is treated as new # if you uncomment update code in the extract the problem probably goes away - @dlt.resource(name="items", table_name=lambda i: i["table"], schema_contract_settings={"table": table_mode}, columns={"id": {"data_type": "bigint"}}) + @dlt.resource(name="items", table_name=lambda i: i["table"], schema_contract_settings={"table": table_mode}, columns={"id": {}}) def get_items(): yield { "id": 1, @@ -497,11 +487,11 @@ def get_items(): } pipeline.run([get_items()]) - assert pipeline.last_trace.last_normalize_info.row_counts["one"] == (1 if table_mode == "evolve" else 0) - assert pipeline.last_trace.last_normalize_info.row_counts["two"] == (1 if table_mode == "evolve" else 0) + assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == (1 if table_mode == "evolve" else 0) + assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == (1 if table_mode == "evolve" else 0) -@pytest.mark.parametrize("column_mode", ["discard_row", "evolve"]) +@pytest.mark.parametrize("column_mode", ["discard_row", "evolve", "freeze"]) def test_defined_column_in_new_table(column_mode: str) -> None: pipeline = get_pipeline() @@ -511,9 +501,8 @@ def get_items(): "id": 1, "key": "value", } - pipeline.run([get_items()]) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 @pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) @@ -534,8 +523,9 @@ def get_items(): "id": 1, "key": "value", } + pipeline.run([get_items()]) - assert 
pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 @pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) @@ -592,4 +582,4 @@ def get_items(): items.apply_hints(columns=columns) # apply hints apply to `items` not the original resource, so doing get_items() below removed them completely pipeline.run(items) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 2 From 6a15fa2360e153f3a4985a1ee32eb499de073851 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 26 Sep 2023 13:35:38 +0200 Subject: [PATCH 31/73] tmp --- dlt/common/schema/utils.py | 3 ++- dlt/normalize/normalize.py | 2 +- dlt/pipeline/pipeline.py | 2 ++ tests/load/test_freeze_and_data_contract.py | 3 +++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index ac161356c2..d3aaff0c11 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -666,7 +666,8 @@ def new_table( # alternatively use apply/remove defaults in utils to remove/add on save! if schema_contract_settings: table["schema_contract_settings"] = schema_contract_settings - table["populated"] = populated + if populated is not None: + table["populated"] = populated if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 63214041ec..c557922285 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -280,7 +280,7 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files # set all populated tables to populated populated_updated = False for table_name, count in row_counts.items(): - if count > 0 and schema.tables[table_name]["populated"] is not True: + if count > 0 and schema.tables[table_name].get("populated") is not True: schema.tables[table_name]["populated"] = True populated_updated = True # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index c03fea8915..edc8097424 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -295,6 +295,7 @@ def extract( return ExtractInfo(describe_extract_data(data)) except Exception as exc: # TODO: provide metrics from extractor + raise exc raise PipelineStepFailed(self, "extract", exc, ExtractInfo(describe_extract_data(data))) from exc @with_runtime_trace @@ -328,6 +329,7 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No runner.run_pool(normalize.config, normalize) return normalize.get_normalize_info() except Exception as n_ex: + raise n_ex raise PipelineStepFailed(self, "normalize", n_ex, normalize.get_normalize_info()) from n_ex @with_runtime_trace diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 4a3c9ee1ee..abcb925e03 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -19,6 +19,9 @@ LOCATIONS = ["source", "resource", "override"] SCHEMA_ELEMENTS = ["table", "column", "data_type"] +LOCATIONS = ["source"] +schema_contract_settings = ["freeze"] + @contextlib.contextmanager def raises_frozen_exception(check_raise: bool = True) -> Any: if not check_raise: From d66c2e6cdc67812c0a78351f1188171c970e093b Mon Sep 17 00:00:00 2001 
From: Dave Date: Tue, 26 Sep 2023 15:16:50 +0200 Subject: [PATCH 32/73] fix freeze tests --- dlt/common/schema/schema.py | 25 ++++++------ dlt/common/schema/utils.py | 2 - dlt/extract/extract.py | 7 +--- dlt/normalize/normalize.py | 2 +- dlt/pipeline/pipeline.py | 2 - .../schema/test_contract_mode_functions.py | 38 +++++++++---------- tests/common/schema/test_schema.py | 4 +- tests/load/test_freeze_and_data_contract.py | 11 +++--- 8 files changed, 41 insertions(+), 50 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 5d03c6b4a1..152b888b29 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -253,19 +253,18 @@ def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name # check columns for item in list(row.keys()): - for item in list(row.keys()): - # if this is a new column for an existing table... - if table_populated and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): - is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") - if contract_modes["column"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): - row.pop(item) - partial_table["columns"].pop(item) - elif contract_modes["column"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): - return None, None - elif is_variant and contract_modes["data_type"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name} data_types are frozen.") - elif contract_modes["column"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") + # if this is a new column for an existing table... 
+ if table_populated and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): + is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") + if contract_modes["column"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): + row.pop(item) + partial_table["columns"].pop(item) + elif contract_modes["column"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): + return None, None + elif is_variant and contract_modes["data_type"] == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name} data_types are frozen.") + elif contract_modes["column"] == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") return row, partial_table diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index d3aaff0c11..bf572d1a42 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -498,8 +498,6 @@ def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTabl # aggregate schema updates aggregated_table = aggregated_update.setdefault(table_name, partial_table) aggregated_table["columns"].update(partial_table["columns"]) - if partial_table.get("populated") is True: - aggregated_table["populated"] = True return aggregated_update diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 36a134ba25..704823980c 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -191,15 +191,10 @@ def extract_with_schema( extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) # iterate over all items in the pipeline and update the schema if dynamic table hints were present - original_schema = schema.clone() for _, partials in extractor.items(): for partial in partials: normalized_partial = schema.normalize_table_identifiers(partial) - contract_modes = schema.resolve_contract_settings_for_table(normalized_partial.get("parent"), normalized_partial["name"]) - table_populated = schema.is_table_populated(normalized_partial["name"]) - _, normalized_partial = original_schema.apply_schema_contract(contract_modes, normalized_partial["name"], {}, normalized_partial, table_populated) - if normalized_partial: - schema.update_table(normalized_partial) + schema.update_table(normalized_partial) return extract_id diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index c557922285..0512686026 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -139,7 +139,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, for (table_name, parent_table), row in schema.normalize_data_item(item, load_id, root_table_name): if not schema_contract_modes: schema_contract_modes = schema.resolve_contract_settings_for_table(parent_table, table_name) - is_table_populated = schema.is_table_populated(table_name) + is_table_populated = schema.is_table_populated(table_name) # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index edc8097424..c03fea8915 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -295,7 +295,6 @@ def extract( return ExtractInfo(describe_extract_data(data)) except Exception as exc: # TODO: provide 
metrics from extractor - raise exc raise PipelineStepFailed(self, "extract", exc, ExtractInfo(describe_extract_data(data))) from exc @with_runtime_trace @@ -329,7 +328,6 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No runner.run_pool(normalize.config, normalize) return normalize.get_normalize_info() except Exception as n_ex: - raise n_ex raise PipelineStepFailed(self, "normalize", n_ex, normalize.get_normalize_info()) from n_ex @with_runtime_trace diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py index d33fdedeb5..2621f04fe1 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_contract_mode_functions.py @@ -178,12 +178,12 @@ def test_check_adding_table(base_settings) -> None: # # check adding new table # - assert schema.apply_schema_contract({**base_settings, **{"table": "evolve"}}, "new_table", data, new_table) == (data, new_table) - assert schema.apply_schema_contract({**base_settings, **{"table": "discard_row"}}, "new_table", data, new_table) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"table": "discard_value"}}, "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"table": "evolve"}}, "new_table", data, new_table, False) == (data, new_table) + assert schema.apply_schema_contract({**base_settings, **{"table": "discard_row"}}, "new_table", data, new_table, False) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"table": "discard_value"}}, "new_table", data, new_table, False) == (None, None) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"table": "freeze"}}, "new_table", data, new_table) + schema.apply_schema_contract({**base_settings, **{"table": "freeze"}}, "new_table", data, new_table, False) @pytest.mark.parametrize("base_settings", base_settings) @@ -213,12 +213,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("new_column") - assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) # @@ -245,12 +245,12 @@ def 
test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("incomplete_column_1") # incomplete columns should be treated like new columns - assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) @@ -281,16 +281,16 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) # check interaction with new columns settings, variants are new columns.. 
with pytest.raises(SchemaFrozenException): - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) \ No newline at end of file + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) \ No newline at end of file diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 741861bf8e..8c19397028 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -632,8 +632,8 @@ def test_group_tables_by_resource(schema: Schema) -> None: result = utils.group_tables_by_resource(schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("products"))) # both tables with resource "products" must be here assert result == {'products': [ - {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': {}}, - {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append', 'schema_contract_settings': {}}, + {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append'}, + {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append'}, {'columns': {}, 'name': 'mc_products__sub', 'parent': 'mc_products'} ] } diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index abcb925e03..b6577783ce 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -19,16 +19,15 @@ LOCATIONS = ["source", "resource", "override"] SCHEMA_ELEMENTS = ["table", "column", "data_type"] -LOCATIONS = ["source"] -schema_contract_settings = ["freeze"] - @contextlib.contextmanager def raises_frozen_exception(check_raise: bool = True) -> Any: if not check_raise: yield return with pytest.raises(PipelineStepFailed) as py_exc: + print("yield") yield + print("after") assert isinstance(py_exc.value.__context__, SchemaFrozenException) def items(settings: TSchemaContractSettings) -> Any: @@ -146,10 +145,12 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None setting_location: { "table": contract_setting }} + print("RUN 1") run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in 
pipeline.default_schema.tables["items"]["columns"] + print("RUN 2") run_resource(pipeline, items_with_new_column, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) @@ -488,8 +489,8 @@ def get_items(): "table": "two", "new_column": "some val" } - - pipeline.run([get_items()]) + with raises_frozen_exception(table_mode == "freeze"): + pipeline.run([get_items()]) assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == (1 if table_mode == "evolve" else 0) assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == (1 if table_mode == "evolve" else 0) From bf9da7e7421d883bf31856a31c482f879f508248 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 26 Sep 2023 17:07:20 +0200 Subject: [PATCH 33/73] cleanup --- dlt/common/schema/schema.py | 7 ++- dlt/common/schema/utils.py | 4 -- dlt/normalize/normalize.py | 6 +- tests/load/test_freeze_and_data_contract.py | 66 +++++++++++---------- tests/load/test_job_client.py | 2 +- 5 files changed, 43 insertions(+), 42 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 152b888b29..3871dc3e26 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -205,11 +205,12 @@ def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: return settings # find table settings - # TODO: get root table... - table_with_settings = parent_table or table_name + table = parent_table or table_name + if table in self.tables: + table = utils.get_top_level_table(self.tables, parent_table or table_name)["name"] # modes - table_contract_modes = resolve_single(self.tables.get(table_with_settings, {}).get("schema_contract_settings", {})) + table_contract_modes = resolve_single(self.tables.get(table, {}).get("schema_contract_settings", {})) schema_contract_modes = resolve_single(self._settings.get("schema_contract_settings", {})) # resolve to correct settings dict diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index bf572d1a42..6916996860 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -658,10 +658,6 @@ def new_table( # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name - # TODO: do not write empty settings or they land in the schema file like this - # description: Created by DLT. Tracks completed loads - # schema_contract_settings: {} - # alternatively use apply/remove defaults in utils to remove/add on save! 
if schema_contract_settings: table["schema_contract_settings"] = schema_contract_settings if populated is not None: diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 0512686026..c9fb2ea83b 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -278,13 +278,13 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files # process files in parallel or in single thread, depending on map_f schema_updates, row_counts = map_f(schema, load_id, files) # set all populated tables to populated - populated_updated = False + needs_schema_save = len(schema_updates) > 0 for table_name, count in row_counts.items(): if count > 0 and schema.tables[table_name].get("populated") is not True: schema.tables[table_name]["populated"] = True - populated_updated = True + needs_schema_save = True # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) - if len(schema_updates) > 0 or populated_updated: + if needs_schema_save: logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") # schema is updated, save it to schema volume self.schema_storage.save_schema(schema) diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index b6577783ce..a41a7cc3e3 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -1,7 +1,7 @@ import dlt, os, pytest from dlt.common.schema.typing import TSchemaContractSettings from dlt.common.utils import uniq_id -from typing import Any +from typing import Any, Union, Optional from dlt.extract.source import DltSource, DltResource import contextlib @@ -145,12 +145,10 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None setting_location: { "table": contract_setting }} - print("RUN 1") run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - print("RUN 2") run_resource(pipeline, items_with_new_column, full_settings) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) @@ -376,24 +374,26 @@ def test_data_contract_interaction() -> None: ensure data contracts with pydantic are enforced properly """ from pydantic import BaseModel - pipeline = get_pipeline() class Items(BaseModel): id: int # noqa: A003 - name: str - amount: int + name: Optional[str] + amount: Optional[int] + @dlt.resource(name="items", columns=Items) + def get_items_simple(): + yield from [{ + "id": 5 + }] @dlt.resource(name="items", columns=Items) def get_items(): yield from [{ "id": 5, "name": "dave", - "amount": 50 }] - - @dlt.resource(name="items", columns=Items) + @dlt.resource(name="items") def get_items_variant(): yield from [{ "id": 5, @@ -401,7 +401,7 @@ def get_items_variant(): "amount": "HELLO" }] - @dlt.resource(name="items", columns=Items) + @dlt.resource(name="items") def get_items_new_col(): yield from [{ "id": 5, @@ -410,7 +410,7 @@ def get_items_new_col(): "new_col": "hello" }] - @dlt.resource(name="items", columns=Items) + @dlt.resource(name="items") def get_items_subtable(): yield from [{ "id": 5, @@ -419,32 +419,36 @@ def get_items_subtable(): "sub": [{"hello": "dave"}] }] - pipeline.run([get_items()]) - pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - - # disallow 
variants + # test variants + pipeline = get_pipeline() + pipeline.run([get_items_simple()]) + pipeline.run([get_items()], schema_contract_settings={"data_type": "discard_row"}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "discard_row"}) - pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 + pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "evolve"}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - # without settings it will pass - pipeline.run([get_items_variant()], schema_contract_settings="evolve") - pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - - # disallow new col + # test new column + pipeline = get_pipeline() + pipeline.run([get_items_simple()]) + pipeline.run([get_items()], schema_contract_settings={"column": "discard_row"}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 pipeline.run([get_items_new_col()], schema_contract_settings={"column": "discard_row"}) - pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 + pipeline.run([get_items_new_col()], schema_contract_settings={"column": "evolve"}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - # without settings it will pass - pipeline.run([get_items_new_col()], schema_contract_settings="evolve") - pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - - # disallow new tables + # test new subtable + pipeline = get_pipeline() + pipeline.run([get_items_simple()]) pipeline.run([get_items_subtable()], schema_contract_settings={"table": "discard_row"}) - pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 0 - # without settings it will pass - pipeline.run([get_items_subtable()], schema_contract_settings="evolve") - pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + pipeline.run([get_items_subtable()], schema_contract_settings={"table": "evolve"}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 1 def test_different_objects_in_one_load() -> None: diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index f77f8577b6..684a43b82b 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -337,7 +337,7 @@ def test_preserve_column_order(client: SqlJobClientBase) -> None: import random columns = deepcopy(TABLE_UPDATE) random.shuffle(columns) - print(columns) + schema.update_table(new_table(table_name, columns=columns)) schema.bump_version() From 038d03a466683a23ad6e7c7d36f682af0fbc3957 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 26 Sep 2023 17:32:55 +0200 Subject: [PATCH 34/73] create data contracts page --- .../docs/general-usage/data-contracts.md | 77 +++++++++++++++++++ docs/website/docs/reference/performance.md | 1 - docs/website/sidebars.js | 1 + 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 docs/website/docs/general-usage/data-contracts.md diff --git a/docs/website/docs/general-usage/data-contracts.md b/docs/website/docs/general-usage/data-contracts.md new file mode 
100644 index 0000000000..eaa220d000 --- /dev/null +++ b/docs/website/docs/general-usage/data-contracts.md @@ -0,0 +1,77 @@ +--- +title: Data Contracts +description: Data contracts and controlling schema evolution +keywords: [data contracts, schema, dlt schema, pydantic] +--- + +## Data contracts and controlling schema evolution + +`dlt` will evolve the schema of the destination to accommodate the structure and data types of the extracted data. There are several settings +that you can use to control this automatic schema evolution, ranging from the default settings, where all changes to the schema are accepted, to +a frozen schema that does not change at all. + +Consider this example: + +```py +@dlt.resource(schema_contract_settings={"table": "evolve", "columns": "freeze"}) +def items(): + ... +``` + +This resource will allow new subtables to be created, but will raise an exception if the extracted data +contains a new column for an existing table. + +The `schema_contract_settings` argument exists on the `source` decorator as a directive for all resources of that source and on the +`resource` decorator as a directive for the individual resource. Additionally, it exists on the `pipeline.run()` method, where it overrides all other settings. +`schema_contract_settings` is a dictionary whose keys control the following: + +* `table`: the creation of new tables and subtables +* `columns`: the creation of new columns on an existing table +* `data_type`: the creation of new variant columns, which happens when a data type different from the one in the schema is discovered in the extracted data + +Each property can be set to `evolve` (the default, which accepts the change) or to one of three restrictive values: +* `freeze`: This will raise an exception if data is encountered that does not fit the existing schema, so no data will be loaded to the destination +* `discard_row`: This will discard any extracted row that does not adhere to the existing schema; that row will not be loaded to the destination, while all other rows will be. +* `discard_value`: This will discard the data in an extracted row that does not adhere to the existing schema; the row will be loaded without this data. + +### Code Examples + +The code below will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. + +```py +@dlt.resource(schema_contract_settings={"table": "discard_row", "columns": "evolve", "data_type": "freeze"}) +def items(): + ... +``` + +The code below will raise on any encountered schema change. Note: you can always pass a plain string, which is interpreted as though all keys are set to that value. + +```py +pipeline.run(my_source(), schema_contract_settings="freeze") +``` + +The code below defines settings on the source, which can be overridden on the resource, which in turn can be overridden by the global override on the `run` method. +Here variant columns are frozen for all resources and raise an error if encountered; on `items` new columns are allowed, but `other_items` inherits the `freeze` setting from +the source, so new columns are frozen there. New tables are allowed. + +```py +@dlt.resource(schema_contract_settings={"columns": "evolve"}) +def items(): + ... + +@dlt.resource() +def other_items(): + ...
+ +@dlt.source(schema_contract_settings={"columns": "freeze", "data_type": "freeze"}) +def source(): + return [items(), other_items()] + + +# this will use the settings defined by the decorators +pipeline.run(source()) + +# this will freeze the whole schema, regardless of the decorator settings +pipeline.run(source(), schema_contract_settings="freeze") + +``` \ No newline at end of file diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 86e4d8d575..7ae5121489 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -309,7 +309,6 @@ def read_table(limit): now = pendulum.now().isoformat() yield [{"row": _id, "description": "this is row with id {_id}", "timestamp": now} for _id in item_slice] - # this prevents process pool to run the initialization code again if __name__ == "__main__" or "PYTEST_CURRENT_TEST" in os.environ: pipeline = dlt.pipeline("parallel_load", destination="duckdb", full_refresh=True) diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 953a7f6372..545d6daa3e 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -190,6 +190,7 @@ const sidebars = { 'general-usage/full-loading', 'general-usage/credentials', 'general-usage/schema', + 'general-usage/data-contracts', 'general-usage/configuration', 'general-usage/glossary', { From 84384c349b3fb1972a5d42c27ea59ec12d84b7e9 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 26 Sep 2023 17:36:36 +0200 Subject: [PATCH 35/73] small cleanup --- tests/load/test_freeze_and_data_contract.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index a41a7cc3e3..5b80e838c2 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -25,9 +25,7 @@ def raises_frozen_exception(check_raise: bool = True) -> Any: yield return with pytest.raises(PipelineStepFailed) as py_exc: - print("yield") yield - print("after") assert isinstance(py_exc.value.__context__, SchemaFrozenException) def items(settings: TSchemaContractSettings) -> Any: From 44dfb691e184936f5bff2e389c389c88ec6a571e Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 27 Sep 2023 00:49:18 +0200 Subject: [PATCH 36/73] add pydantic dep to destination tests --- .github/workflows/test_destinations.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 04240b6ec1..ecdbb91b44 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -81,7 +81,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli + run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E pydantic # - name: Install self # run: poetry install --no-interaction From 7b8f2d2b2a5cfa11e44bf0eca522e0087b1448c9 Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 29 Sep 2023 18:57:41 +0200 Subject: [PATCH 37/73] rename contract settings --- dlt/common/pipeline.py | 4 +- dlt/common/schema/schema.py | 26 +++---- dlt/common/schema/typing.py | 8 +-- dlt/common/schema/utils.py | 14 ++-- dlt/extract/decorators.py | 24 +++---- dlt/extract/schema.py | 16 ++--- dlt/extract/source.py | 12 ++-- dlt/normalize/normalize.py | 4 +- dlt/pipeline/__init__.py
| 6 +- dlt/pipeline/pipeline.py | 14 ++-- .../docs/general-usage/data-contracts.md | 16 ++--- .../cases/schemas/eth/ethereum_schema_v7.yml | 8 +-- .../schema/test_contract_mode_functions.py | 12 ++-- tests/common/schema/test_merges.py | 6 +- .../cases/eth_source/ethereum.schema.yaml | 8 +-- tests/load/test_freeze_and_data_contract.py | 70 +++++++++---------- 16 files changed, 124 insertions(+), 124 deletions(-) diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index ee141c6ae8..ef3b7b985e 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -16,7 +16,7 @@ from dlt.common.destination import DestinationReference, TDestinationReferenceArg from dlt.common.exceptions import DestinationHasFailedJobs, PipelineStateNotAvailable, ResourceNameNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContractSettings +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.typing import DictStrAny, REPattern @@ -210,7 +210,7 @@ def run( primary_key: TColumnNames = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, - schema_contract_settings: TSchemaContractSettings = None, + schema_contract: TSchemaContract = None, ) -> LoadInfo: ... diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 3871dc3e26..4d734214fc 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -11,14 +11,14 @@ from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractModes, TSchemaContractSettings) + TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractDict, TSchemaContract) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict from dlt.common.schema.exceptions import SchemaFrozenException -DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractModes = { +DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractDict = { "table": "evolve", "column": "evolve", "data_type": "evolve" @@ -195,13 +195,13 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def resolve_contract_settings_for_table(self, parent_table: str, table_name: str) -> TSchemaContractModes: + def resolve_contract_settings_for_table(self, parent_table: str, table_name: str) -> TSchemaContractDict: """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" - def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: + def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: settings = settings or {} if isinstance(settings, str): - return TSchemaContractModes(table=settings, column=settings, data_type=settings) + return TSchemaContractDict(table=settings, 
column=settings, data_type=settings) return settings # find table settings @@ -210,18 +210,18 @@ def resolve_single(settings: TSchemaContractSettings) -> TSchemaContractModes: table = utils.get_top_level_table(self.tables, parent_table or table_name)["name"] # modes - table_contract_modes = resolve_single(self.tables.get(table, {}).get("schema_contract_settings", {})) - schema_contract_modes = resolve_single(self._settings.get("schema_contract_settings", {})) + table_contract_modes = resolve_single(self.tables.get(table, {}).get("schema_contract", {})) + schema_contract_modes = resolve_single(self._settings.get("schema_contract", {})) # resolve to correct settings dict - settings = cast(TSchemaContractModes, {**DEFAULT_SCHEMA_CONTRACT_MODE, **schema_contract_modes, **table_contract_modes}) + settings = cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **schema_contract_modes, **table_contract_modes}) return settings def is_table_populated(self, table_name: str) -> bool: return table_name in self.tables and (self.tables[table_name].get("populated") is True) - def apply_schema_contract(self, contract_modes: TSchemaContractModes, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, table_populated: bool) -> Tuple[DictStrAny, TPartialTableSchema]: + def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, table_populated: bool) -> Tuple[DictStrAny, TPartialTableSchema]: """ Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out columns for both the data and the schema_update or reject the update completely, depending on the mode. An example settings could be: @@ -279,7 +279,7 @@ def update_schema(self, schema: "Schema") -> None: self.update_table( self.normalize_table_identifiers(table) ) - self.set_schema_contract_settings(schema._settings.get("schema_contract_settings", {})) + self.set_schema_contract(schema._settings.get("schema_contract", {})) def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: table_name = partial_table["name"] @@ -470,12 +470,12 @@ def update_normalizers(self) -> None: normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) - def set_schema_contract_settings(self, settings: TSchemaContractSettings, update_table_settings: bool = False) -> None: - self._settings["schema_contract_settings"] = settings + def set_schema_contract(self, settings: TSchemaContract, update_table_settings: bool = False) -> None: + self._settings["schema_contract"] = settings if update_table_settings: for table in self.tables.values(): if not table.get("parent"): - table["schema_contract_settings"] = settings + table["schema_contract"] = settings def _infer_column(self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False) -> TColumnSchema: column_schema = TColumnSchema( diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index bcad23fa92..a516295e71 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -72,13 +72,13 @@ class TColumnSchema(TColumnSchemaBase, total=False): TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] -class TSchemaContractModes(TypedDict, total=False): +class TSchemaContractDict(TypedDict, total=False): """TypedDict defining the schema update settings""" table: Optional[TSchemaEvolutionMode] 
column: Optional[TSchemaEvolutionMode] data_type: Optional[TSchemaEvolutionMode] -TSchemaContractSettings = Union[TSchemaEvolutionMode, TSchemaContractModes] +TSchemaContract = Union[TSchemaEvolutionMode, TSchemaContractDict] class TRowFilters(TypedDict, total=True): excludes: Optional[List[TSimpleRegex]] @@ -90,7 +90,7 @@ class TTableSchema(TypedDict, total=False): name: Optional[str] description: Optional[str] write_disposition: Optional[TWriteDisposition] - schema_contract_settings: Optional[TSchemaContractSettings] + schema_contract: Optional[TSchemaContract] parent: Optional[str] filters: Optional[TRowFilters] columns: TTableSchemaColumns @@ -107,7 +107,7 @@ class TPartialTableSchema(TTableSchema): class TSchemaSettings(TypedDict, total=False): - schema_contract_settings: Optional[TSchemaContractSettings] + schema_contract: Optional[TSchemaContract] detections: Optional[List[TTypeDetections]] default_hints: Optional[Dict[TColumnHint, List[TSimpleRegex]]] preferred_types: Optional[Dict[TSimpleRegex, TDataType]] diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index dcbce2b617..1a52fe1c09 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -16,7 +16,7 @@ from dlt.common.schema import detections from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, - TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaContractSettings, TSchemaContractModes) + TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaContract, TSchemaContractDict) from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, TablePropertiesConflictException, InvalidSchemaName) @@ -343,11 +343,11 @@ def migrate_filters(group: str, filters: List[str]) -> None: if from_engine == 6 and to_engine > 6: # migrate from sealed properties to schema evolution settings schema_dict["settings"].pop("schema_sealed", None) - schema_dict["settings"]["schema_contract_settings"] = {} + schema_dict["settings"]["schema_contract"] = {} for table in schema_dict["tables"].values(): table.pop("table_sealed", None) if not table.get("parent"): - table["schema_contract_settings"] = {} + table["schema_contract"] = {} from_engine = 7 schema_dict["engine_version"] = from_engine @@ -646,7 +646,7 @@ def new_table( columns: Sequence[TColumnSchema] = None, validate_schema: bool = False, resource: str = None, - schema_contract_settings: TSchemaContractSettings = None, + schema_contract: TSchemaContract = None, populated: bool = None ) -> TTableSchema: @@ -658,13 +658,13 @@ def new_table( table["parent"] = parent_table_name assert write_disposition is None assert resource is None - assert schema_contract_settings is None + assert schema_contract is None else: # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name - if schema_contract_settings: - table["schema_contract_settings"] = schema_contract_settings + if schema_contract: + table["schema_contract"] = schema_contract if populated is not None: table["populated"] = populated if validate_schema: diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 
16a7bf60d2..e5342c1c68 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -14,7 +14,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContractSettings +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContract from dlt.extract.utils import ensure_table_schema_columns_hint from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage @@ -52,7 +52,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, - schema_contract_settings: TSchemaContractSettings = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None ) -> Callable[TSourceFunParams, DltSource]: ... @@ -66,7 +66,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, - schema_contract_settings: TSchemaContractSettings = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None ) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, DltSource]]: ... @@ -79,7 +79,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, - schema_contract_settings: TSchemaContractSettings = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None ) -> Any: """A decorator that transforms a function returning one or more `dlt resources` into a `dlt source` in order to load it with `dlt`. @@ -114,7 +114,7 @@ def source( schema (Schema, optional): An explicit `Schema` instance to be associated with the source. If not present, `dlt` creates a new `Schema` object with provided `name`. If such `Schema` already exists in the same folder as the module containing the decorated function, such schema will be loaded from file. - schema_contract_settings (TSchemaContractSettings, optional): Schema contract settings that will be applied to this resource. + schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to this resource. spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. 
@@ -174,7 +174,7 @@ def _wrap(*args: Any, **kwargs: Any) -> DltSource: # apply hints if max_table_nesting is not None: s.max_table_nesting = max_table_nesting - s.schema_contract_settings = schema_contract_settings + s.schema_contract = schema_contract # enable root propagation s.root_key = root_key return s @@ -206,7 +206,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> Callable[TResourceFunParams, DltResource]: @@ -222,7 +222,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: @@ -238,7 +238,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> DltResource: @@ -254,7 +254,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, data_from: TUnboundDltResource = None, @@ -303,7 +303,7 @@ def resource( merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - schema_contract_settings (TSchemaContractSettings, optional): Schema contract settings that will be applied to all resources of this source (if not overriden in the resource itself) + schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to all resources of this source (if not overriden in the resource itself) selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. 
@@ -325,7 +325,7 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa columns=columns, primary_key=primary_key, merge_key=merge_key, - schema_contract_settings=schema_contract_settings + schema_contract=schema_contract ) return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, data_from), incremental=incremental) diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index a21d647e1f..4d80e2ba9a 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -3,7 +3,7 @@ from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContractSettings +from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContract from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -25,7 +25,7 @@ class TTableSchemaTemplate(TypedDict, total=False): primary_key: TTableHintTemplate[TColumnNames] merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] - schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] + schema_contract: TTableHintTemplate[TSchemaContract] populated: TTableHintTemplate[bool] validator: ValidateItem @@ -100,7 +100,7 @@ def apply_hints( primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, incremental: Incremental[Any] = None, - schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, populated: TTableHintTemplate[bool] = None ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. 
@@ -118,7 +118,7 @@ def apply_hints( t = None if not self._table_schema_template: # if there's no template yet, create and set new one - t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract_settings, populated) + t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract, populated) else: # set single hints t = deepcopy(self._table_schema_template) @@ -136,8 +136,8 @@ def apply_hints( t["populated"] = populated if write_disposition: t["write_disposition"] = write_disposition - if schema_contract_settings: - t["schema_contract_settings"] = schema_contract_settings + if schema_contract: + t["schema_contract"] = schema_contract if columns is not None: t['validator'] = get_column_validator(columns) # if callable then override existing @@ -218,7 +218,7 @@ def new_table_template( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - schema_contract_settings: TTableHintTemplate[TSchemaContractSettings] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, populated: TTableHintTemplate[bool] = None ) -> TTableSchemaTemplate: if not table_name: @@ -232,7 +232,7 @@ def new_table_template( else: validator = None # create a table schema template where hints can be functions taking TDataItem - new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract_settings=schema_contract_settings, populated=populated) # type: ignore + new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract=schema_contract, populated=populated) # type: ignore if primary_key: new_template["primary_key"] = primary_key diff --git a/dlt/extract/source.py b/dlt/extract/source.py index c9fb96147c..49baee0508 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -11,7 +11,7 @@ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer, RelationalNormalizerConfigPropagation from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnName, TSchemaContractSettings +from dlt.common.schema.typing import TColumnName, TSchemaContract from dlt.common.typing import AnyFun, StrAny, TDataItem, TDataItems, NoneType from dlt.common.configuration.container import Container from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, resource_state, source_state, pipeline_state @@ -626,12 +626,12 @@ def max_table_nesting(self, value: int) -> None: RelationalNormalizer.update_normalizer_config(self._schema, {"max_nesting": value}) @property - def schema_contract_settings(self) -> TSchemaContractSettings: - return self.schema.settings["schema_contract_settings"] + def schema_contract(self) -> TSchemaContract: + return self.schema.settings["schema_contract"] - @schema_contract_settings.setter - def schema_contract_settings(self, settings: TSchemaContractSettings) -> None: - self.schema.set_schema_contract_settings(settings) + @schema_contract.setter + def schema_contract(self, settings: TSchemaContract) -> None: + self.schema.set_schema_contract(settings) @property def exhausted(self) -> bool: diff --git 
a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index c9fb2ea83b..5b189c87bd 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -11,7 +11,7 @@ from dlt.common.runners import TRunMetrics, Runnable from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaContractModes +from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaContractDict from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration @@ -132,7 +132,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, schema_name = schema.name items_count = 0 row_counts: TRowCount = {} - schema_contract_modes: TSchemaContractModes = None + schema_contract_modes: TSchemaContractDict = None is_table_populated: bool = False for item in items: diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 01bd0859f2..fdf8c2af06 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -1,7 +1,7 @@ from typing import Sequence, cast, overload from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaContractSettings +from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.typing import TSecretValue, Any from dlt.common.configuration import with_config @@ -177,7 +177,7 @@ def run( columns: Sequence[TColumnSchema] = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, - schema_contract_settings: TSchemaContractSettings = None, + schema_contract: TSchemaContract = None, ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -239,7 +239,7 @@ def run( columns=columns, schema=schema, loader_file_format=loader_file_format, - schema_contract_settings=schema_contract_settings + schema_contract=schema_contract ) # plug default tracking module diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 4f7f414c1a..9e5531947e 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -18,7 +18,7 @@ MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException) from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContractSettings +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -267,7 +267,7 @@ def extract( schema: Schema = None, max_parallel_items: int = None, workers: int = None, - schema_contract_settings: TSchemaContractSettings = None + schema_contract: TSchemaContract = None ) -> ExtractInfo: """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. 
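The `run()` override combines with the `PipelineStepFailed` error mentioned in the docstring; a hedged sketch of detecting a contract violation, following the pattern the tests use (the exception import paths are assumptions):

```py
import dlt
# Import paths below are assumptions; adjust to the actual module layout.
from dlt.pipeline.exceptions import PipelineStepFailed
from dlt.common.schema.exceptions import SchemaFrozenException

@dlt.resource(name="items", write_disposition="append")
def items():
    yield {"id": 1, "name": "first"}

@dlt.resource(name="items", write_disposition="append")
def items_with_new_column():
    yield {"id": 2, "name": "second", "new_column": "not in the contract"}

pipeline = dlt.pipeline(pipeline_name="freeze_demo", destination="duckdb", full_refresh=True)
pipeline.run(items())  # first run establishes the schema under the default "evolve" contract

try:
    # Override for this run only: any schema change should abort the load.
    pipeline.run(items_with_new_column(), schema_contract="freeze")
except PipelineStepFailed as step_exc:
    # The tests inspect the chained exception to confirm the contract was enforced.
    assert isinstance(step_exc.__context__, SchemaFrozenException)
```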
See `run` method for the arguments' description.""" # create extract storage to which all the sources will be extracted @@ -289,8 +289,8 @@ def extract( storage.commit_extract_files(extract_id) # update global schema contract settings - if schema_contract_settings is not None: - self.default_schema.set_schema_contract_settings(schema_contract_settings, True) + if schema_contract is not None: + self.default_schema.set_schema_contract(schema_contract, True) return ExtractInfo(describe_extract_data(data)) except Exception as exc: @@ -396,7 +396,7 @@ def run( primary_key: TColumnNames = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, - schema_contract_settings: TSchemaContractSettings = None + schema_contract: TSchemaContract = None ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -446,7 +446,7 @@ def run( loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional). The file format the loader will use to create the load package. Not all file_formats are compatible with all destinations. Defaults to the preferred file format of the selected destination. - schema_contract_settings (TSchemaContractSettings, optional): On override for the schema contract settings, this will replace the schema contract settings for all tables in the schema. Defaults to None. + schema_contract (TSchemaContract, optional): On override for the schema contract settings, this will replace the schema contract settings for all tables in the schema. Defaults to None. Raises: PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. @@ -474,7 +474,7 @@ def run( # extract from the source if data is not None: - self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema, schema_contract_settings=schema_contract_settings) + self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema, schema_contract=schema_contract) self.normalize(loader_file_format=loader_file_format) return self.load(destination, dataset_name, credentials=credentials) else: diff --git a/docs/website/docs/general-usage/data-contracts.md b/docs/website/docs/general-usage/data-contracts.md index eaa220d000..b1bfb00ea9 100644 --- a/docs/website/docs/general-usage/data-contracts.md +++ b/docs/website/docs/general-usage/data-contracts.md @@ -13,7 +13,7 @@ a frozen schema that does not change at all. Consider this example: ```py -@dlt.resource(schema_contract_settings={"table": "evolve", "columns": "freeze"}) +@dlt.resource(schema_contract={"table": "evolve", "columns": "freeze"}) def items(): ... ``` @@ -21,9 +21,9 @@ def items(): This resource will allow new subtables to be created, but will throw an exception if data is extracted for an existing table which contains a new column. -The `schema_contract_settings` exists on the `source` decorator as a directive for all resources of that source and on the +The `schema_contract` exists on the `source` decorator as a directive for all resources of that source and on the `resource` decorator as a directive for the individual resource. Additionally it exists on the `pipeline.run()` method, which will override all existing settings. 
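The `source.py` change above also makes `schema_contract` a settable property on an instantiated source, so the contract can be decided after the decorators have run; a minimal sketch assuming a duckdb destination:

```py
import dlt

@dlt.resource
def items():
    yield {"id": 1}

@dlt.source
def my_source():
    return items()

source = my_source()
# Equivalent to passing schema_contract to @dlt.source, but applied at runtime.
source.schema_contract = {"columns": "freeze", "data_type": "freeze"}

dlt.pipeline(pipeline_name="prop_demo", destination="duckdb", full_refresh=True).run(source)
```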
-The `schema_contract_settings` is a dictionary with keys that control the following: +The `schema_contract` is a dictionary with keys that control the following: * `table` creating of new tables and subtables * `columns` creating of new columns on an existing table @@ -39,7 +39,7 @@ Each property can be set to one of three values: The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. ```py -@dlt.resource(schema_contract_settings={"table": "discard_row", "columns": "evolve", "data_type": "freeze"}) +@dlt.resource(schema_contract={"table": "discard_row", "columns": "evolve", "data_type": "freeze"}) def items(): ... ``` @@ -47,7 +47,7 @@ def items(): The below Code will raise on any encountered schema change. Note: You can always set a string which will be interpreted as though all keys are set to these values. ```py -pipeline.run(my_source(), schema_contract_settings="freeze") +pipeline.run(my_source(), schema_contract="freeze") ``` The below code defines some settings on the source which can be overwritten on the resource which in turn can be overwritten by the global override on the `run` method. @@ -55,7 +55,7 @@ Here for all resources variant columns are frozen and raise an error if encounte the source, thus new columns are frozen there. New tables are allowed. ```py -@dlt.resource(schema_contract_settings={"columns": "evolve"}) +@dlt.resource(schema_contract={"columns": "evolve"}) def items(): ... @@ -63,7 +63,7 @@ def items(): def other_items(): ... -@dlt.source(schema_contract_settings={"columns": "freeze", "data_type": "freeze"}): +@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"}): def source(): return [items(), other_items()] @@ -72,6 +72,6 @@ def source(): pipeline.run(source()) # this will freeze the whole schema, regardless of the decorator settings -pipeline.run(source(), schema_contract_settings="freeze") +pipeline.run(source(), schema_contract="freeze") ``` \ No newline at end of file diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml index c9afb6be76..5a8db47163 100644 --- a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -27,7 +27,7 @@ tables: name: schema_version_hash write_disposition: skip description: Created by DLT. Tracks completed loads - schema_contract_settings: {} + schema_contract: {} name: _dlt_loads resource: _dlt_loads _dlt_version: @@ -58,7 +58,7 @@ tables: name: schema write_disposition: skip description: Created by DLT. 
Tracks schema updates - schema_contract_settings: {} + schema_contract: {} name: _dlt_version resource: _dlt_version blocks: @@ -160,7 +160,7 @@ tables: nullable: false data_type: text name: transactions_root - schema_contract_settings: {} + schema_contract: {} name: blocks resource: blocks blocks__transactions: @@ -442,7 +442,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp - schema_contract_settings: {} + schema_contract: {} normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py index 2621f04fe1..d07618db7e 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_contract_mode_functions.py @@ -64,7 +64,7 @@ def test_resolve_contract_settings() -> None: # table specific full setting schema = get_schema() - schema.tables["table"]["schema_contract_settings"] = "freeze" + schema.tables["table"]["schema_contract"] = "freeze" assert schema.resolve_contract_settings_for_table(None, "table") == { "table": "freeze", "column": "freeze", @@ -78,7 +78,7 @@ def test_resolve_contract_settings() -> None: # table specific single setting schema = get_schema() - schema.tables["table"]["schema_contract_settings"] = { + schema.tables["table"]["schema_contract"] = { "table": "freeze", "column": "discard_value", } @@ -95,7 +95,7 @@ def test_resolve_contract_settings() -> None: # schema specific full setting schema = get_schema() - schema._settings["schema_contract_settings"] = "freeze" + schema._settings["schema_contract"] = "freeze" assert schema.resolve_contract_settings_for_table(None, "table") == { "table": "freeze", "column": "freeze", @@ -109,7 +109,7 @@ def test_resolve_contract_settings() -> None: # schema specific single setting schema = get_schema() - schema._settings["schema_contract_settings"] = { + schema._settings["schema_contract"] = { "table": "freeze", "column": "discard_value", } @@ -126,8 +126,8 @@ def test_resolve_contract_settings() -> None: # mixed settings schema = get_schema() - schema._settings["schema_contract_settings"] = "freeze" - schema.tables["table"]["schema_contract_settings"] = { + schema._settings["schema_contract"] = "freeze" + schema.tables["table"]["schema_contract"] = { "table": "evolve", "column": "discard_value", } diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 222f7d679e..1426fe98c9 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -202,17 +202,17 @@ def test_diff_tables() -> None: # ignore identical table props existing = deepcopy(table) changed["write_disposition"] = "append" - changed["schema_contract_settings"] = "freeze" + changed["schema_contract"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", "description": "new description", "write_disposition": "append", - "schema_contract_settings": "freeze", + "schema_contract": "freeze", "columns": {} } existing["write_disposition"] = "append" - existing["schema_contract_settings"] = "freeze" + existing["schema_contract"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index c9afb6be76..5a8db47163 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ 
b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -27,7 +27,7 @@ tables: name: schema_version_hash write_disposition: skip description: Created by DLT. Tracks completed loads - schema_contract_settings: {} + schema_contract: {} name: _dlt_loads resource: _dlt_loads _dlt_version: @@ -58,7 +58,7 @@ tables: name: schema write_disposition: skip description: Created by DLT. Tracks schema updates - schema_contract_settings: {} + schema_contract: {} name: _dlt_version resource: _dlt_version blocks: @@ -160,7 +160,7 @@ tables: nullable: false data_type: text name: transactions_root - schema_contract_settings: {} + schema_contract: {} name: blocks resource: blocks blocks__transactions: @@ -442,7 +442,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp - schema_contract_settings: {} + schema_contract: {} normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 5b80e838c2..4286221e1f 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -1,5 +1,5 @@ import dlt, os, pytest -from dlt.common.schema.typing import TSchemaContractSettings +from dlt.common.schema.typing import TSchemaContract from dlt.common.utils import uniq_id from typing import Any, Union, Optional from dlt.extract.source import DltSource, DltResource @@ -15,7 +15,7 @@ skip_if_not_active("duckdb") -schema_contract_settings = ["evolve", "discard_value", "discard_row", "freeze"] +schema_contract = ["evolve", "discard_value", "discard_row", "freeze"] LOCATIONS = ["source", "resource", "override"] SCHEMA_ELEMENTS = ["table", "column", "data_type"] @@ -28,9 +28,9 @@ def raises_frozen_exception(check_raise: bool = True) -> Any: yield assert isinstance(py_exc.value.__context__, SchemaFrozenException) -def items(settings: TSchemaContractSettings) -> Any: +def items(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -41,9 +41,9 @@ def load_items(): return load_items -def items_with_variant(settings: TSchemaContractSettings) -> Any: +def items_with_variant(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -54,9 +54,9 @@ def load_items(): return load_items -def items_with_new_column(settings: TSchemaContractSettings) -> Any: +def items_with_new_column(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -68,9 +68,9 @@ def load_items(): return load_items -def items_with_subtable(settings: TSchemaContractSettings) -> Any: +def items_with_subtable(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract_settings=settings) + @dlt.resource(name="items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -84,9 +84,9 @@ def load_items(): 
return load_items -def new_items(settings: TSchemaContractSettings) -> Any: +def new_items(settings: TSchemaContract) -> Any: - @dlt.resource(name="new_items", write_disposition="append", schema_contract_settings=settings) + @dlt.resource(name="new_items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -109,31 +109,31 @@ def run_resource(pipeline, resource_fun, settings) -> DltSource: for item in settings.keys(): assert item in LOCATIONS ev_settings = settings[item] - if ev_settings in schema_contract_settings: + if ev_settings in schema_contract: continue for key, val in ev_settings.items(): - assert val in schema_contract_settings + assert val in schema_contract assert key in SCHEMA_ELEMENTS - @dlt.source(name="freeze_tests", schema_contract_settings=settings.get("source")) + @dlt.source(name="freeze_tests", schema_contract=settings.get("source")) def source() -> DltResource: return resource_fun(settings.get("resource")) # run pipeline - pipeline.run(source(), schema_contract_settings=settings.get("override")) + pipeline.run(source(), schema_contract=settings.get("override")) # check updated schema - assert pipeline.default_schema._settings.get("schema_contract_settings", {}) == (settings.get("override") or settings.get("source")) + assert pipeline.default_schema._settings.get("schema_contract", {}) == (settings.get("override") or settings.get("source")) # check items table settings - assert pipeline.default_schema.tables["items"].get("schema_contract_settings", {}) == (settings.get("override") or settings.get("resource") or {}) + assert pipeline.default_schema.tables["items"].get("schema_contract", {}) == (settings.get("override") or settings.get("resource") or {}) def get_pipeline(): import duckdb return dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) -@pytest.mark.parametrize("contract_setting", schema_contract_settings) +@pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None: @@ -173,7 +173,7 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) -@pytest.mark.parametrize("contract_setting", schema_contract_settings) +@pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_new_columns(contract_setting: str, setting_location: str) -> None: @@ -223,7 +223,7 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 20) -@pytest.mark.parametrize("contract_setting", schema_contract_settings) +@pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_variants(contract_setting: str, setting_location: str) -> None: @@ -420,31 +420,31 @@ def get_items_subtable(): # test variants pipeline = get_pipeline() pipeline.run([get_items_simple()]) - pipeline.run([get_items()], schema_contract_settings={"data_type": "discard_row"}) + pipeline.run([get_items()], schema_contract={"data_type": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - pipeline.run([get_items_variant()], 
schema_contract_settings={"data_type": "discard_row"}) + pipeline.run([get_items_variant()], schema_contract={"data_type": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 - pipeline.run([get_items_variant()], schema_contract_settings={"data_type": "evolve"}) + pipeline.run([get_items_variant()], schema_contract={"data_type": "evolve"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 # test new column pipeline = get_pipeline() pipeline.run([get_items_simple()]) - pipeline.run([get_items()], schema_contract_settings={"column": "discard_row"}) + pipeline.run([get_items()], schema_contract={"column": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - pipeline.run([get_items_new_col()], schema_contract_settings={"column": "discard_row"}) + pipeline.run([get_items_new_col()], schema_contract={"column": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 - pipeline.run([get_items_new_col()], schema_contract_settings={"column": "evolve"}) + pipeline.run([get_items_new_col()], schema_contract={"column": "evolve"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 # test new subtable pipeline = get_pipeline() pipeline.run([get_items_simple()]) - pipeline.run([get_items_subtable()], schema_contract_settings={"table": "discard_row"}) + pipeline.run([get_items_subtable()], schema_contract={"table": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 0 - pipeline.run([get_items_subtable()], schema_contract_settings={"table": "evolve"}) + pipeline.run([get_items_subtable()], schema_contract={"table": "evolve"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 1 @@ -453,7 +453,7 @@ def test_different_objects_in_one_load() -> None: pipeline = get_pipeline() - @dlt.resource(name="items", schema_contract_settings={"column": "freeze", "table":"evolve"}) + @dlt.resource(name="items", schema_contract={"column": "freeze", "table":"evolve"}) def get_items(): yield { "id": 1, @@ -480,7 +480,7 @@ def test_dynamic_tables(table_mode: str) -> None: # the tables is NOT new according to normalizer so the row is not discarded # remove that and it will pass because the table contains just one incomplete column so it is incomplete so it is treated as new # if you uncomment update code in the extract the problem probably goes away - @dlt.resource(name="items", table_name=lambda i: i["table"], schema_contract_settings={"table": table_mode}, columns={"id": {}}) + @dlt.resource(name="items", table_name=lambda i: i["table"], schema_contract={"table": table_mode}, columns={"id": {}}) def get_items(): yield { "id": 1, @@ -501,7 +501,7 @@ def get_items(): def test_defined_column_in_new_table(column_mode: str) -> None: pipeline = get_pipeline() - @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}], schema_contract_settings={"column": column_mode}) + @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}], schema_contract={"column": column_mode}) def get_items(): yield { "id": 1, @@ -522,7 +522,7 @@ def test_new_column_from_hint_and_data(column_mode: str) -> None: @dlt.resource( name="items", - schema_contract_settings={"column": column_mode}, + schema_contract={"column": 
column_mode}, columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) def get_items(): yield { @@ -544,7 +544,7 @@ def test_two_new_columns_from_two_rows(column_mode: str) -> None: # the test does not fail only because you clone schema in normalize @dlt.resource( - schema_contract_settings={"column": column_mode} + schema_contract={"column": column_mode} ) def items(): yield { @@ -573,7 +573,7 @@ def columns(item): if item["id"] == 2: return [{"name": "id", "data_type": "bigint", "nullable": True}] - @dlt.resource(name="items", table_name=lambda i: "items", schema_contract_settings={"column": column_mode}) + @dlt.resource(name="items", table_name=lambda i: "items", schema_contract={"column": column_mode}) def get_items(): yield { "id": 1, From 00c540ba5117f919a0b20f897b15f1f6b5e48416 Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 29 Sep 2023 19:06:53 +0200 Subject: [PATCH 38/73] rename schema contract dict keys --- dlt/common/schema/schema.py | 18 +-- dlt/common/schema/typing.py | 4 +- .../docs/general-usage/data-contracts.md | 4 +- .../schema/test_contract_mode_functions.py | 146 +++++++++--------- tests/load/test_freeze_and_data_contract.py | 38 ++--- 5 files changed, 105 insertions(+), 105 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 4d734214fc..5f17a83e50 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -19,8 +19,8 @@ DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractDict = { - "table": "evolve", - "column": "evolve", + "tables": "evolve", + "columns": "evolve", "data_type": "evolve" } @@ -227,8 +227,8 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: columns for both the data and the schema_update or reject the update completely, depending on the mode. An example settings could be: { - "table": "freeze", - "column": "evolve", + "tables": "freeze", + "columns": "evolve", "data_type": "discard_row" } @@ -247,9 +247,9 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: # check case where we have a new table if not table_populated: - if contract_modes["table"] in ["discard_row", "discard_value"]: + if contract_modes["tables"] in ["discard_row", "discard_value"]: return None, None - if contract_modes["table"] == "freeze": + if contract_modes["tables"] == "freeze": raise SchemaFrozenException(self.name, table_name, f"Trying to add table {table_name} but new tables are frozen.") # check columns @@ -257,14 +257,14 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: # if this is a new column for an existing table... 
if table_populated and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") - if contract_modes["column"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): + if contract_modes["columns"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): row.pop(item) partial_table["columns"].pop(item) - elif contract_modes["column"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): + elif contract_modes["columns"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): return None, None elif is_variant and contract_modes["data_type"] == "freeze": raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name} data_types are frozen.") - elif contract_modes["column"] == "freeze": + elif contract_modes["columns"] == "freeze": raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") return row, partial_table diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index a516295e71..dd208792c4 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -74,8 +74,8 @@ class TColumnSchema(TColumnSchemaBase, total=False): class TSchemaContractDict(TypedDict, total=False): """TypedDict defining the schema update settings""" - table: Optional[TSchemaEvolutionMode] - column: Optional[TSchemaEvolutionMode] + tables: Optional[TSchemaEvolutionMode] + columns: Optional[TSchemaEvolutionMode] data_type: Optional[TSchemaEvolutionMode] TSchemaContract = Union[TSchemaEvolutionMode, TSchemaContractDict] diff --git a/docs/website/docs/general-usage/data-contracts.md b/docs/website/docs/general-usage/data-contracts.md index b1bfb00ea9..cc6fd14f10 100644 --- a/docs/website/docs/general-usage/data-contracts.md +++ b/docs/website/docs/general-usage/data-contracts.md @@ -13,7 +13,7 @@ a frozen schema that does not change at all. Consider this example: ```py -@dlt.resource(schema_contract={"table": "evolve", "columns": "freeze"}) +@dlt.resource(schema_contract={"tables": "evolve", "columns": "freeze"}) def items(): ... ``` @@ -39,7 +39,7 @@ Each property can be set to one of three values: The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. ```py -@dlt.resource(schema_contract={"table": "discard_row", "columns": "evolve", "data_type": "freeze"}) +@dlt.resource(schema_contract={"tables": "discard_row", "columns": "evolve", "data_type": "freeze"}) def items(): ... 
``` diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py index d07618db7e..155ad918e2 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_contract_mode_functions.py @@ -32,13 +32,13 @@ def get_schema() -> Schema: # add some tables s.update_table({ - "name": "table", + "name": "tables", "columns": columns }) s.update_table({ "name": "child_table", - "parent": "table", + "parent": "tables", "columns": columns }) @@ -59,106 +59,106 @@ def test_resolve_contract_settings() -> None: # defaults schema = get_schema() - assert schema.resolve_contract_settings_for_table(None, "table") == DEFAULT_SCHEMA_CONTRACT_MODE - assert schema.resolve_contract_settings_for_table("table", "child_table") == DEFAULT_SCHEMA_CONTRACT_MODE + assert schema.resolve_contract_settings_for_table(None, "tables") == DEFAULT_SCHEMA_CONTRACT_MODE + assert schema.resolve_contract_settings_for_table("tables", "child_table") == DEFAULT_SCHEMA_CONTRACT_MODE # table specific full setting schema = get_schema() - schema.tables["table"]["schema_contract"] = "freeze" - assert schema.resolve_contract_settings_for_table(None, "table") == { - "table": "freeze", - "column": "freeze", + schema.tables["tables"]["schema_contract"] = "freeze" + assert schema.resolve_contract_settings_for_table(None, "tables") == { + "tables": "freeze", + "columns": "freeze", "data_type": "freeze" } - assert schema.resolve_contract_settings_for_table("table", "child_table") == { - "table": "freeze", - "column": "freeze", + assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + "tables": "freeze", + "columns": "freeze", "data_type": "freeze" } # table specific single setting schema = get_schema() - schema.tables["table"]["schema_contract"] = { - "table": "freeze", - "column": "discard_value", + schema.tables["tables"]["schema_contract"] = { + "tables": "freeze", + "columns": "discard_value", } - assert schema.resolve_contract_settings_for_table(None, "table") == { - "table": "freeze", - "column": "discard_value", + assert schema.resolve_contract_settings_for_table(None, "tables") == { + "tables": "freeze", + "columns": "discard_value", "data_type": "evolve" } - assert schema.resolve_contract_settings_for_table("table", "child_table") == { - "table": "freeze", - "column": "discard_value", + assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + "tables": "freeze", + "columns": "discard_value", "data_type": "evolve" } # schema specific full setting schema = get_schema() schema._settings["schema_contract"] = "freeze" - assert schema.resolve_contract_settings_for_table(None, "table") == { - "table": "freeze", - "column": "freeze", + assert schema.resolve_contract_settings_for_table(None, "tables") == { + "tables": "freeze", + "columns": "freeze", "data_type": "freeze" } - assert schema.resolve_contract_settings_for_table("table", "child_table") == { - "table": "freeze", - "column": "freeze", + assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + "tables": "freeze", + "columns": "freeze", "data_type": "freeze" } # schema specific single setting schema = get_schema() schema._settings["schema_contract"] = { - "table": "freeze", - "column": "discard_value", + "tables": "freeze", + "columns": "discard_value", } - assert schema.resolve_contract_settings_for_table(None, "table") == { - "table": "freeze", - "column": "discard_value", + assert 
schema.resolve_contract_settings_for_table(None, "tables") == { + "tables": "freeze", + "columns": "discard_value", "data_type": "evolve" } - assert schema.resolve_contract_settings_for_table("table", "child_table") == { - "table": "freeze", - "column": "discard_value", + assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + "tables": "freeze", + "columns": "discard_value", "data_type": "evolve" } # mixed settings schema = get_schema() schema._settings["schema_contract"] = "freeze" - schema.tables["table"]["schema_contract"] = { - "table": "evolve", - "column": "discard_value", + schema.tables["tables"]["schema_contract"] = { + "tables": "evolve", + "columns": "discard_value", } - assert schema.resolve_contract_settings_for_table(None, "table") == { - "table": "evolve", - "column": "discard_value", + assert schema.resolve_contract_settings_for_table(None, "tables") == { + "tables": "evolve", + "columns": "discard_value", "data_type": "freeze" } - assert schema.resolve_contract_settings_for_table("table", "child_table") == { - "table": "evolve", - "column": "discard_value", + assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + "tables": "evolve", + "columns": "discard_value", "data_type": "freeze" } # ensure other settings do not interfere with the main setting we are testing base_settings = [{ - "table": "evolve", - "column": "evolve", + "tables": "evolve", + "columns": "evolve", "data_type": "evolve" },{ - "table": "discard_row", - "column": "discard_row", + "tables": "discard_row", + "columns": "discard_row", "data_type": "discard_row" }, { - "table": "discard_value", - "column": "discard_value", + "tables": "discard_value", + "columns": "discard_value", "data_type": "discard_value" }, { - "table": "freeze", - "column": "freeze", + "tables": "freeze", + "columns": "freeze", "data_type": "freeze" } ] @@ -172,18 +172,18 @@ def test_check_adding_table(base_settings) -> None: "column_1": "some string", "column_2": 123 } - new_table = copy.deepcopy(schema.tables["table"]) + new_table = copy.deepcopy(schema.tables["tables"]) new_table["name"] = "new_table" # # check adding new table # - assert schema.apply_schema_contract({**base_settings, **{"table": "evolve"}}, "new_table", data, new_table, False) == (data, new_table) - assert schema.apply_schema_contract({**base_settings, **{"table": "discard_row"}}, "new_table", data, new_table, False) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"table": "discard_value"}}, "new_table", data, new_table, False) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"tables": "evolve"}}, "new_table", data, new_table, False) == (data, new_table) + assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_row"}}, "new_table", data, new_table, False) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_value"}}, "new_table", data, new_table, False) == (None, None) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"table": "freeze"}}, "new_table", data, new_table, False) + schema.apply_schema_contract({**base_settings, **{"tables": "freeze"}}, "new_table", data, new_table, False) @pytest.mark.parametrize("base_settings", base_settings) @@ -202,7 +202,7 @@ def test_check_adding_new_columns(base_settings) -> None: "new_column": "some string" } table_update = { - "name": "table", + "name": "tables", "columns": { "new_column": { "name": "new_column", @@ 
-213,12 +213,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("new_column") - assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), table_update, True) + schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) # @@ -245,12 +245,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("incomplete_column_1") # incomplete columns should be treated like new columns - assert schema.apply_schema_contract({**base_settings, **{"column": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"column": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"column": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) + schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) @@ -269,7 +269,7 @@ def test_check_adding_new_variant() -> None: "column_2_variant": 345345 } table_update = { - "name": "table", + "name": "tables", "columns": { "column_2_variant": { "name": "column_2_variant", @@ -281,16 +281,16 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) 
popped_table_update["columns"].pop("column_2_variant") - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) + schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) # check interaction with new columns settings, variants are new columns.. with pytest.raises(SchemaFrozenException): - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "freeze"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_row"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "column": "discard_value"}}, "table", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) \ No newline at end of file + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) \ No newline at end of file diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 4286221e1f..3c9674acf6 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -17,7 +17,7 @@ schema_contract = ["evolve", "discard_value", 
"discard_row", "freeze"] LOCATIONS = ["source", "resource", "override"] -SCHEMA_ELEMENTS = ["table", "column", "data_type"] +SCHEMA_ELEMENTS = ["tables", "columns", "data_type"] @contextlib.contextmanager def raises_frozen_exception(check_raise: bool = True) -> Any: @@ -141,7 +141,7 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None full_settings = { setting_location: { - "table": contract_setting + "tables": contract_setting }} run_resource(pipeline, items, {}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) @@ -179,7 +179,7 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non full_settings = { setting_location: { - "column": contract_setting + "columns": contract_setting }} pipeline = get_pipeline() @@ -275,13 +275,13 @@ def test_settings_precedence() -> None: # trying to add new column when forbidden on resource will fail run_resource(pipeline, items_with_new_column, {"resource": { - "column": "discard_row" + "columns": "discard_row" }}) # when allowed on override it will work run_resource(pipeline, items_with_new_column, { - "resource": {"column": "freeze"}, - "override": {"column": "evolve"} + "resource": {"columns": "freeze"}, + "override": {"columns": "evolve"} }) @@ -430,21 +430,21 @@ def get_items_subtable(): # test new column pipeline = get_pipeline() pipeline.run([get_items_simple()]) - pipeline.run([get_items()], schema_contract={"column": "discard_row"}) + pipeline.run([get_items()], schema_contract={"columns": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - pipeline.run([get_items_new_col()], schema_contract={"column": "discard_row"}) + pipeline.run([get_items_new_col()], schema_contract={"columns": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 - pipeline.run([get_items_new_col()], schema_contract={"column": "evolve"}) + pipeline.run([get_items_new_col()], schema_contract={"columns": "evolve"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 # test new subtable pipeline = get_pipeline() pipeline.run([get_items_simple()]) - pipeline.run([get_items_subtable()], schema_contract={"table": "discard_row"}) + pipeline.run([get_items_subtable()], schema_contract={"tables": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 0 - pipeline.run([get_items_subtable()], schema_contract={"table": "evolve"}) + pipeline.run([get_items_subtable()], schema_contract={"tables": "evolve"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 1 @@ -453,7 +453,7 @@ def test_different_objects_in_one_load() -> None: pipeline = get_pipeline() - @dlt.resource(name="items", schema_contract={"column": "freeze", "table":"evolve"}) + @dlt.resource(name="items", schema_contract={"columns": "freeze", "tables":"evolve"}) def get_items(): yield { "id": 1, @@ -480,15 +480,15 @@ def test_dynamic_tables(table_mode: str) -> None: # the tables is NOT new according to normalizer so the row is not discarded # remove that and it will pass because the table contains just one incomplete column so it is incomplete so it is treated as new # if you uncomment update code in the extract the problem probably goes away - @dlt.resource(name="items", table_name=lambda i: i["table"], 
schema_contract={"table": table_mode}, columns={"id": {}}) + @dlt.resource(name="items", table_name=lambda i: i["tables"], schema_contract={"tables": table_mode}, columns={"id": {}}) def get_items(): yield { "id": 1, - "table": "one", + "tables": "one", } yield { "id": 2, - "table": "two", + "tables": "two", "new_column": "some val" } with raises_frozen_exception(table_mode == "freeze"): @@ -501,7 +501,7 @@ def get_items(): def test_defined_column_in_new_table(column_mode: str) -> None: pipeline = get_pipeline() - @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}], schema_contract={"column": column_mode}) + @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}], schema_contract={"columns": column_mode}) def get_items(): yield { "id": 1, @@ -522,7 +522,7 @@ def test_new_column_from_hint_and_data(column_mode: str) -> None: @dlt.resource( name="items", - schema_contract={"column": column_mode}, + schema_contract={"columns": column_mode}, columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) def get_items(): yield { @@ -544,7 +544,7 @@ def test_two_new_columns_from_two_rows(column_mode: str) -> None: # the test does not fail only because you clone schema in normalize @dlt.resource( - schema_contract={"column": column_mode} + schema_contract={"columns": column_mode} ) def items(): yield { @@ -573,7 +573,7 @@ def columns(item): if item["id"] == 2: return [{"name": "id", "data_type": "bigint", "nullable": True}] - @dlt.resource(name="items", table_name=lambda i: "items", schema_contract={"column": column_mode}) + @dlt.resource(name="items", table_name=lambda i: "items", schema_contract={"columns": column_mode}) def get_items(): yield { "id": 1, From 3ed4630dd08f1a77535756782717764a986c69df Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 29 Sep 2023 21:40:52 +0200 Subject: [PATCH 39/73] some work --- dlt/common/schema/schema.py | 46 ++++++++++----------- dlt/common/schema/typing.py | 31 +++++++------- dlt/common/schema/utils.py | 3 -- dlt/extract/schema.py | 9 +--- dlt/normalize/normalize.py | 16 ++++--- dlt/pipeline/pipeline.py | 23 ++++++++++- tests/load/test_freeze_and_data_contract.py | 15 ++----- 7 files changed, 72 insertions(+), 71 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 5f17a83e50..6527d79769 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -195,13 +195,13 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def resolve_contract_settings_for_table(self, parent_table: str, table_name: str) -> TSchemaContractDict: + def resolve_contract_settings_for_table(self, parent_table: str, table_name: str) -> Tuple[bool, TSchemaContractDict]: """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: settings = settings or {} if isinstance(settings, str): - return TSchemaContractDict(table=settings, column=settings, data_type=settings) + return TSchemaContractDict(tables=settings, columns=settings, data_type=settings) return settings # find table settings @@ -210,18 +210,21 @@ def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: table = utils.get_top_level_table(self.tables, parent_table or table_name)["name"] # modes - table_contract_modes = resolve_single(self.tables.get(table, {}).get("schema_contract", {})) - 
schema_contract_modes = resolve_single(self._settings.get("schema_contract", {})) + explicit_table_contract: bool = False + settings: TSchemaContract = {} + if table_contract_modes := resolve_single(self.tables.get(table, {}).get("schema_contract", {})): + explicit_table_contract = True + settings = table_contract_modes + elif schema_contract_modes := resolve_single(self._settings.get("schema_contract", {})): + settings = schema_contract_modes - # resolve to correct settings dict - settings = cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **schema_contract_modes, **table_contract_modes}) - - return settings + # fill in defaults + return explicit_table_contract, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings}) def is_table_populated(self, table_name: str) -> bool: return table_name in self.tables and (self.tables[table_name].get("populated") is True) - def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, table_populated: bool) -> Tuple[DictStrAny, TPartialTableSchema]: + def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, explicit_table_contract: bool) -> Tuple[DictStrAny, TPartialTableSchema]: """ Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out columns for both the data and the schema_update or reject the update completely, depending on the mode. An example settings could be: @@ -245,8 +248,14 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table + # if evolve once is set, allow all + if (table_name in self.tables) and self.tables[table_name].get("x-normalizer", {}).get("evolve_once", False): + return row, partial_table + + is_new_table = (table_name not in self.tables) or not self.tables[table_name]["columns"] + # check case where we have a new table - if not table_populated: + if is_new_table: if contract_modes["tables"] in ["discard_row", "discard_value"]: return None, None if contract_modes["tables"] == "freeze": @@ -254,8 +263,11 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: # check columns for item in list(row.keys()): + # dlt cols may always be added + if item.startswith(self._dlt_tables_prefix): + continue # if this is a new column for an existing table... 
- if table_populated and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): + if not is_new_table and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") if contract_modes["columns"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): row.pop(item) @@ -269,18 +281,6 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: return row, partial_table - def update_schema(self, schema: "Schema") -> None: - """ - Update schema from another schema - note we are not merging props like max nesting or column propagation - """ - - for table in schema.data_tables(include_incomplete=True): - self.update_table( - self.normalize_table_identifiers(table) - ) - self.set_schema_contract(schema._settings.get("schema_contract", {})) - def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: table_name = partial_table["name"] parent_table_name = partial_table.get("parent") diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index dd208792c4..4f3841630c 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -70,7 +70,7 @@ class TColumnSchema(TColumnSchemaBase, total=False): TColumnName = NewType("TColumnName", str) SIMPLE_REGEX_PREFIX = "re:" -TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] +TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row", "evolve_once"] class TSchemaContractDict(TypedDict, total=False): """TypedDict defining the schema update settings""" @@ -84,24 +84,25 @@ class TRowFilters(TypedDict, total=True): excludes: Optional[List[TSimpleRegex]] includes: Optional[List[TSimpleRegex]] - -class TTableSchema(TypedDict, total=False): - """TypedDict that defines properties of a table""" - name: Optional[str] - description: Optional[str] - write_disposition: Optional[TWriteDisposition] - schema_contract: Optional[TSchemaContract] - parent: Optional[str] - filters: Optional[TRowFilters] - columns: TTableSchemaColumns - resource: Optional[str] - populated: Optional[bool] - +class NormalizerInfo(TypedDict, total=True): + evolve_once: bool + +# TypedDict that defines properties of a table +TTableSchema = TypedDict("TTableSchema", { + "name": Optional[str], + "description": Optional[str], + "write_disposition": Optional[TWriteDisposition], + "schema_contract": Optional[TSchemaContract], + "parent": Optional[str], + "filters": Optional[TRowFilters], + "columns": TTableSchemaColumns, + "resource": Optional[str], + "x-normalizer": Optional[NormalizerInfo], +}) class TPartialTableSchema(TTableSchema): pass - TSchemaTables = Dict[str, TTableSchema] TSchemaUpdate = Dict[str, List[TPartialTableSchema]] diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 1a52fe1c09..c96fd6271c 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -647,7 +647,6 @@ def new_table( validate_schema: bool = False, resource: str = None, schema_contract: TSchemaContract = None, - populated: bool = None ) -> TTableSchema: table: TTableSchema = { @@ -665,8 +664,6 @@ def new_table( table["resource"] = resource or table_name if schema_contract: table["schema_contract"] = schema_contract - if populated is not None: - table["populated"] = populated if validate_schema: 
validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index 4d80e2ba9a..3db3ac22f6 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -26,7 +26,6 @@ class TTableSchemaTemplate(TypedDict, total=False): merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] schema_contract: TTableHintTemplate[TSchemaContract] - populated: TTableHintTemplate[bool] validator: ValidateItem class DltResourceSchema: @@ -101,7 +100,6 @@ def apply_hints( merge_key: TTableHintTemplate[TColumnNames] = None, incremental: Incremental[Any] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, - populated: TTableHintTemplate[bool] = None ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. @@ -118,7 +116,7 @@ def apply_hints( t = None if not self._table_schema_template: # if there's no template yet, create and set new one - t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract, populated) + t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract) else: # set single hints t = deepcopy(self._table_schema_template) @@ -132,8 +130,6 @@ def apply_hints( t["parent"] = parent_table_name else: t.pop("parent", None) - if populated is not None: - t["populated"] = populated if write_disposition: t["write_disposition"] = write_disposition if schema_contract: @@ -219,7 +215,6 @@ def new_table_template( primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, - populated: TTableHintTemplate[bool] = None ) -> TTableSchemaTemplate: if not table_name: raise TableNameMissing() @@ -232,7 +227,7 @@ def new_table_template( else: validator = None # create a table schema template where hints can be functions taking TDataItem - new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract=schema_contract, populated=populated) # type: ignore + new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract=schema_contract) # type: ignore if primary_key: new_template["primary_key"] = primary_key diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 5b189c87bd..52c4878ffb 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -132,14 +132,12 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, schema_name = schema.name items_count = 0 row_counts: TRowCount = {} - schema_contract_modes: TSchemaContractDict = None - is_table_populated: bool = False + schema_contract: TSchemaContractDict = None for item in items: for (table_name, parent_table), row in schema.normalize_data_item(item, load_id, root_table_name): - if not schema_contract_modes: - schema_contract_modes = schema.resolve_contract_settings_for_table(parent_table, table_name) - is_table_populated = schema.is_table_populated(table_name) + if not schema_contract: + explicit_table_contract, schema_contract = schema.resolve_contract_settings_for_table(parent_table, table_name) # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) @@ -153,7 +151,7 @@ def _w_normalize_chunk(load_storage: 
LoadStorage, schema: Schema, load_id: str, row, partial_table = schema.coerce_row(table_name, parent_table, row) # if we detect a migration, check schema contract if partial_table: - row, partial_table = schema.apply_schema_contract(schema_contract_modes, table_name, row, partial_table, is_table_populated) + row, partial_table = schema.apply_schema_contract(schema_contract, table_name, row, partial_table, explicit_table_contract) if not row: continue @@ -279,9 +277,9 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files schema_updates, row_counts = map_f(schema, load_id, files) # set all populated tables to populated needs_schema_save = len(schema_updates) > 0 - for table_name, count in row_counts.items(): - if count > 0 and schema.tables[table_name].get("populated") is not True: - schema.tables[table_name]["populated"] = True + # remove normalizer specific info + for table in schema.tables.values(): + if table.pop("x-normalizer", None): needs_schema_save = True # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) if needs_schema_save: diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 9e5531947e..e8333c3cea 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -19,6 +19,7 @@ from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract +from dlt.common.schema.utils import diff_tables from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -866,8 +867,10 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para extract_id = extract_with_schema(storage, source, source_schema, self.collector, max_parallel_items, workers) # if source schema does not exist in the pipeline + is_new_schema = False if source_schema.name not in self._schema_storage: # save schema into the pipeline + is_new_schema = True self._schema_storage.save_schema(source_schema) # and set as default if this is first schema in pipeline @@ -880,8 +883,24 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para # initialize import with fully discovered schema self._schema_storage.save_import_schema_if_not_exists(source_schema) - # get the current schema and merge tables from source_schema - pipeline_schema.update_schema(source_schema) + # update the pipeline schema + for table in source_schema.data_tables(include_incomplete=True): + # create table diff + normalized_table = pipeline_schema.normalize_table_identifiers(table) + if table["name"] in pipeline_schema.tables: + partial_table = diff_tables(pipeline_schema.tables[table["name"]], normalized_table) + else: + partial_table = normalized_table + # figure out wether this is a new table + is_new_table = is_new_schema or (table["name"] not in pipeline_schema.tables) or (not pipeline_schema.tables[table["name"]]["columns"]) + if is_new_table: + partial_table["x-normalizer"] = {"evolve_once": True} + # update pipeline schema + pipeline_schema.update_table( + partial_table + ) + + pipeline_schema.set_schema_contract(source_schema._settings.get("schema_contract", {})) return extract_id diff --git a/tests/load/test_freeze_and_data_contract.py 
b/tests/load/test_freeze_and_data_contract.py index 3c9674acf6..6d0dcfd5f6 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -378,12 +378,6 @@ class Items(BaseModel): name: Optional[str] amount: Optional[int] - @dlt.resource(name="items", columns=Items) - def get_items_simple(): - yield from [{ - "id": 5 - }] - @dlt.resource(name="items", columns=Items) def get_items(): yield from [{ @@ -391,7 +385,7 @@ def get_items(): "name": "dave", }] - @dlt.resource(name="items") + @dlt.resource(name="items", columns=Items) def get_items_variant(): yield from [{ "id": 5, @@ -399,7 +393,7 @@ def get_items_variant(): "amount": "HELLO" }] - @dlt.resource(name="items") + @dlt.resource(name="items", columns=Items) def get_items_new_col(): yield from [{ "id": 5, @@ -408,7 +402,7 @@ def get_items_new_col(): "new_col": "hello" }] - @dlt.resource(name="items") + @dlt.resource(name="items", columns=Items) def get_items_subtable(): yield from [{ "id": 5, @@ -419,7 +413,6 @@ def get_items_subtable(): # test variants pipeline = get_pipeline() - pipeline.run([get_items_simple()]) pipeline.run([get_items()], schema_contract={"data_type": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 pipeline.run([get_items_variant()], schema_contract={"data_type": "discard_row"}) @@ -429,7 +422,6 @@ def get_items_subtable(): # test new column pipeline = get_pipeline() - pipeline.run([get_items_simple()]) pipeline.run([get_items()], schema_contract={"columns": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 pipeline.run([get_items_new_col()], schema_contract={"columns": "discard_row"}) @@ -439,7 +431,6 @@ def get_items_subtable(): # test new subtable pipeline = get_pipeline() - pipeline.run([get_items_simple()]) pipeline.run([get_items_subtable()], schema_contract={"tables": "discard_row"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 0 From 333217c5b3ad11ece35e53ea67befeb993fdaebd Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 29 Sep 2023 23:32:12 +0200 Subject: [PATCH 40/73] more work... 
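
Editor's note, not part of the original patch: the sketch below is a simplified, hypothetical illustration of the schema-contract semantics this series of commits works towards — resolve a shorthand contract setting into per-entity modes, then gate new tables and new columns by mode. All names in it are illustrative stand-ins rather than the dlt API, and it omits data_type/variant handling and the "evolve_once" escape hatch used for freshly created tables.

from typing import Dict, Optional, Tuple

# default contract: everything may evolve
DEFAULT_CONTRACT = {"tables": "evolve", "columns": "evolve", "data_type": "evolve"}

def resolve_contract(setting) -> Dict[str, str]:
    # a shorthand string applies the same mode to tables, columns and data types
    if isinstance(setting, str):
        setting = {"tables": setting, "columns": setting, "data_type": setting}
    return {**DEFAULT_CONTRACT, **(setting or {})}

def apply_contract(
    modes: Dict[str, str],
    is_new_table: bool,
    row: Optional[Dict],
    new_columns: Dict,
) -> Tuple[Optional[Dict], Dict]:
    # new tables are gated by the "tables" mode
    if is_new_table:
        if modes["tables"] == "freeze":
            raise Exception("new tables are frozen")
        if modes["tables"] in ("discard_row", "discard_value"):
            return None, {}
        return row, new_columns
    # new columns on an existing table are gated by the "columns" mode
    for col in list(new_columns):
        if modes["columns"] == "freeze":
            raise Exception(f"new column {col} is frozen")
        if modes["columns"] == "discard_row":
            return None, {}
        if modes["columns"] == "discard_value":
            row.pop(col, None)
            new_columns.pop(col)
    return row, new_columns

# e.g. resolve_contract("freeze") freezes all three modes, while
# apply_contract(resolve_contract({"columns": "discard_value"}), False, {"a": 1, "b": 2}, {"b": {}})
# drops column "b" from both the row and the schema update.
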
--- dlt/common/schema/schema.py | 15 +++++--- dlt/common/schema/typing.py | 4 +- dlt/pipeline/pipeline.py | 42 ++++++++++++++------- tests/load/test_freeze_and_data_contract.py | 12 +++--- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 6527d79769..5d3e70bcf2 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -248,11 +248,7 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table - # if evolve once is set, allow all - if (table_name in self.tables) and self.tables[table_name].get("x-normalizer", {}).get("evolve_once", False): - return row, partial_table - - is_new_table = (table_name not in self.tables) or not self.tables[table_name]["columns"] + is_new_table = (table_name not in self.tables) or (not self.tables[table_name]["columns"]) # check case where we have a new table if is_new_table: @@ -261,6 +257,15 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: if contract_modes["tables"] == "freeze": raise SchemaFrozenException(self.name, table_name, f"Trying to add table {table_name} but new tables are frozen.") + # in case we only check table creation in pipeline + if not row: + return row, partial_table + + # if evolve once is set, allow all column changes + evolve_once = (table_name in self.tables) and self.tables[table_name].get("x-normalizer", {}).get("evolve_once", False) + if evolve_once: + return row, partial_table + # check columns for item in list(row.keys()): # dlt cols may always be added diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 4f3841630c..c5f1f62e1e 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -70,7 +70,7 @@ class TColumnSchema(TColumnSchemaBase, total=False): TColumnName = NewType("TColumnName", str) SIMPLE_REGEX_PREFIX = "re:" -TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row", "evolve_once"] +TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] class TSchemaContractDict(TypedDict, total=False): """TypedDict defining the schema update settings""" @@ -85,7 +85,7 @@ class TRowFilters(TypedDict, total=True): includes: Optional[List[TSimpleRegex]] class NormalizerInfo(TypedDict, total=True): - evolve_once: bool + new_table: bool # TypedDict that defines properties of a table TTableSchema = TypedDict("TTableSchema", { diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index e8333c3cea..ffed285cbf 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -31,6 +31,7 @@ from dlt.common.destination.capabilities import INTERNAL_LOADER_FILE_FORMATS from dlt.common.pipeline import ExtractInfo, LoadInfo, NormalizeInfo, PipelineContext, SupportsPipeline, TPipelineLocalState, TPipelineState, StateInjectableContext from dlt.common.schema import Schema +from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.utils import is_interactive from dlt.common.data_writers import TLoaderFileFormat @@ -282,17 +283,13 @@ def extract( raise SourceExhausted(source.name) # TODO: merge infos for all the sources extract_ids.append( - self._extract_source(storage, source, max_parallel_items, workers) + self._extract_source(storage, source, max_parallel_items, workers, schema_contract) ) # commit extract ids # TODO: if we fail here we should probably wipe out the whole extract folder for 
extract_id in extract_ids: storage.commit_extract_files(extract_id) - # update global schema contract settings - if schema_contract is not None: - self.default_schema.set_schema_contract(schema_contract, True) - return ExtractInfo(describe_extract_data(data)) except Exception as exc: # TODO: provide metrics from extractor @@ -859,7 +856,7 @@ def append_data(data_item: Any) -> None: return sources - def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int) -> str: + def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int, global_contract: TSchemaContract) -> str: # discover the schema from source source_schema = source.schema source_schema.update_normalizers() @@ -867,11 +864,9 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para extract_id = extract_with_schema(storage, source, source_schema, self.collector, max_parallel_items, workers) # if source schema does not exist in the pipeline - is_new_schema = False if source_schema.name not in self._schema_storage: - # save schema into the pipeline - is_new_schema = True - self._schema_storage.save_schema(source_schema) + # save new schema into the pipeline + self._schema_storage.save_schema(Schema(source_schema.name)) # and set as default if this is first schema in pipeline if not self.default_schema_name: @@ -885,22 +880,41 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para # update the pipeline schema for table in source_schema.data_tables(include_incomplete=True): + # create table diff normalized_table = pipeline_schema.normalize_table_identifiers(table) if table["name"] in pipeline_schema.tables: partial_table = diff_tables(pipeline_schema.tables[table["name"]], normalized_table) else: partial_table = normalized_table + # figure out wether this is a new table - is_new_table = is_new_schema or (table["name"] not in pipeline_schema.tables) or (not pipeline_schema.tables[table["name"]]["columns"]) + is_new_table = (table["name"] not in pipeline_schema.tables) or (not pipeline_schema.tables[table["name"]]["columns"]) if is_new_table: partial_table["x-normalizer"] = {"evolve_once": True} + + # update global schema contract settings + if global_contract is not None: + source_schema.set_schema_contract(global_contract, True) + + # apply schema contract, resolve on source schema and apply on pipeline schema + explicit_table_contract, schema_contract = source_schema.resolve_contract_settings_for_table(None, table["name"]) + try: + _, partial_table = pipeline_schema.apply_schema_contract(schema_contract, table["name"], None, partial_table, explicit_table_contract) + except SchemaFrozenException: + partial_table = None + # update pipeline schema - pipeline_schema.update_table( - partial_table - ) + if partial_table: + pipeline_schema.update_table( + partial_table + ) pipeline_schema.set_schema_contract(source_schema._settings.get("schema_contract", {})) + + # globally apply contract override + if global_contract is not None: + pipeline_schema.set_schema_contract(global_contract, True) return extract_id diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 6d0dcfd5f6..dc724981e1 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -364,7 +364,7 @@ def test_single_settings_value(setting_location: str) -> None: run_resource(pipeline, new_items, {setting_location: 
"discard_row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 - assert "new_items" not in table_counts + assert ("new_items" in table_counts) == (setting_location == "resource") def test_data_contract_interaction() -> None: @@ -471,7 +471,7 @@ def test_dynamic_tables(table_mode: str) -> None: # the tables is NOT new according to normalizer so the row is not discarded # remove that and it will pass because the table contains just one incomplete column so it is incomplete so it is treated as new # if you uncomment update code in the extract the problem probably goes away - @dlt.resource(name="items", table_name=lambda i: i["tables"], schema_contract={"tables": table_mode}, columns={"id": {}}) + @dlt.resource(name="items", table_name=lambda i: i["tables"], columns={"id": {}}) def get_items(): yield { "id": 1, @@ -483,9 +483,11 @@ def get_items(): "new_column": "some val" } with raises_frozen_exception(table_mode == "freeze"): - pipeline.run([get_items()]) - assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == (1 if table_mode == "evolve" else 0) - assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == (1 if table_mode == "evolve" else 0) + pipeline.run([get_items()], schema_contract={"tables": table_mode}) + + if table_mode != "freeze": + assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == (1 if table_mode == "evolve" else 0) + assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == (1 if table_mode == "evolve" else 0) @pytest.mark.parametrize("column_mode", ["discard_row", "evolve", "freeze"]) From d69e54d791c1ee167d91833c615b5991d05fce5b Mon Sep 17 00:00:00 2001 From: Dave Date: Sun, 1 Oct 2023 23:08:16 +0200 Subject: [PATCH 41/73] more work --- dlt/common/schema/schema.py | 54 +++++++-------- dlt/common/schema/utils.py | 1 + dlt/extract/schema.py | 6 ++ dlt/normalize/normalize.py | 6 +- dlt/pipeline/pipeline.py | 12 ++-- tests/load/test_freeze_and_data_contract.py | 75 ++++++++++----------- 6 files changed, 80 insertions(+), 74 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 5d3e70bcf2..5d09145052 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -195,36 +195,10 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def resolve_contract_settings_for_table(self, parent_table: str, table_name: str) -> Tuple[bool, TSchemaContractDict]: - """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" - - def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: - settings = settings or {} - if isinstance(settings, str): - return TSchemaContractDict(tables=settings, columns=settings, data_type=settings) - return settings - - # find table settings - table = parent_table or table_name - if table in self.tables: - table = utils.get_top_level_table(self.tables, parent_table or table_name)["name"] - - # modes - explicit_table_contract: bool = False - settings: TSchemaContract = {} - if table_contract_modes := resolve_single(self.tables.get(table, {}).get("schema_contract", {})): - explicit_table_contract = True - settings = table_contract_modes - elif schema_contract_modes := resolve_single(self._settings.get("schema_contract", {})): - settings = schema_contract_modes - - # fill in defaults - return explicit_table_contract, 
cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings}) - def is_table_populated(self, table_name: str) -> bool: return table_name in self.tables and (self.tables[table_name].get("populated") is True) - def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema, explicit_table_contract: bool) -> Tuple[DictStrAny, TPartialTableSchema]: + def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """ Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out columns for both the data and the schema_update or reject the update completely, depending on the mode. An example settings could be: @@ -675,3 +649,29 @@ def _compile_settings(self) -> None: def __repr__(self) -> str: return f"Schema {self.name} at {id(self)}" + +def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Schema, incoming_schema: Schema = None) -> Tuple[bool, TSchemaContractDict]: + """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" + + def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: + settings = settings or {} + if isinstance(settings, str): + settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) + return {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings} if settings else {} + + # find table settings + table = parent_table or table_name + if table in current_schema.tables: + table = utils.get_top_level_table(current_schema.tables, parent_table or table_name)["name"] + + # modes + current_table_contract_modes = resolve_single(current_schema.tables.get(table, {}).get("schema_contract", {})) + current_schema_contract_modes = resolve_single(current_schema._settings.get("schema_contract", {})) + + if incoming_schema: + if incoming_table_contract_mode := resolve_single(incoming_schema.tables.get(table, {}).get("schema_contract", {})): + return incoming_table_contract_mode + if not current_table_contract_modes and (incoming_schema_contract_modes := resolve_single(incoming_schema._settings.get("schema_contract", {}))): + return incoming_schema_contract_modes + + return current_table_contract_modes or current_schema_contract_modes or DEFAULT_SCHEMA_CONTRACT_MODE \ No newline at end of file diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index c96fd6271c..b8fa6f78a6 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -697,3 +697,4 @@ def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: def standard_type_detections() -> List[TTypeDetections]: return ["timestamp", "iso_timestamp"] + diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index 3db3ac22f6..bade6557a9 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -224,8 +224,14 @@ def new_table_template( columns = ensure_table_schema_columns_hint(columns) if not callable(columns): columns = columns.values() # type: ignore + is_table_complete = len([c for c in columns if c.get("name") and c.get("data_type")]) else: validator = None + is_table_complete = False + + # freeze the resource if we have a fully defined table and no other explicit contract + if not schema_contract and is_table_complete: + schema_contract = "freeze" # create a table schema template where hints can 
be functions taking TDataItem new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract=schema_contract) # type: ignore diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 52c4878ffb..8b294e1f2e 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -13,6 +13,7 @@ from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.schema.typing import TStoredSchema, TTableSchemaColumns, TSchemaContractDict from dlt.common.schema.utils import merge_schema_updates +from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration from dlt.common.typing import TDataItem @@ -137,7 +138,7 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, for item in items: for (table_name, parent_table), row in schema.normalize_data_item(item, load_id, root_table_name): if not schema_contract: - explicit_table_contract, schema_contract = schema.resolve_contract_settings_for_table(parent_table, table_name) + schema_contract = resolve_contract_settings_for_table(parent_table, table_name, schema) # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) @@ -149,9 +150,10 @@ def _w_normalize_chunk(load_storage: LoadStorage, schema: Schema, load_id: str, row[k] = custom_pua_decode(v) # type: ignore # coerce row of values into schema table, generating partial table with new columns if any row, partial_table = schema.coerce_row(table_name, parent_table, row) + # if we detect a migration, check schema contract if partial_table: - row, partial_table = schema.apply_schema_contract(schema_contract, table_name, row, partial_table, explicit_table_contract) + row, partial_table = schema.apply_schema_contract(schema_contract, table_name, row, partial_table) if not row: continue diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index ffed285cbf..28b3c1fc76 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -20,6 +20,7 @@ from dlt.common.runtime import signals, initialize_runtime from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract from dlt.common.schema.utils import diff_tables +from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -892,17 +893,14 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para is_new_table = (table["name"] not in pipeline_schema.tables) or (not pipeline_schema.tables[table["name"]]["columns"]) if is_new_table: partial_table["x-normalizer"] = {"evolve_once": True} - + # update global schema contract settings if global_contract is not None: source_schema.set_schema_contract(global_contract, True) - # apply schema contract, resolve on source schema and apply on pipeline schema - explicit_table_contract, schema_contract = source_schema.resolve_contract_settings_for_table(None, table["name"]) - try: - _, partial_table = pipeline_schema.apply_schema_contract(schema_contract, table["name"], None, partial_table, explicit_table_contract) - except 
SchemaFrozenException: - partial_table = None + # apply schema contractand apply on pipeline schema + schema_contract = resolve_contract_settings_for_table(None, table["name"], pipeline_schema, source_schema) + _, partial_table = pipeline_schema.apply_schema_contract(schema_contract, table["name"], None, partial_table) # update pipeline schema if partial_table: diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index dc724981e1..3a34121d28 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -371,29 +371,32 @@ def test_data_contract_interaction() -> None: """ ensure data contracts with pydantic are enforced properly """ - from pydantic import BaseModel + from pydantic import BaseModel, Extra class Items(BaseModel): id: int # noqa: A003 name: Optional[str] - amount: Optional[int] + amount: Union[int, str, None] + class Config: + extra = Extra.forbid - @dlt.resource(name="items", columns=Items) + @dlt.resource(name="items") def get_items(): yield from [{ "id": 5, "name": "dave", + "amount": 6, }] @dlt.resource(name="items", columns=Items) - def get_items_variant(): + def get_items_with_model(): yield from [{ "id": 5, "name": "dave", - "amount": "HELLO" + "amount": 6, }] - @dlt.resource(name="items", columns=Items) + @dlt.resource(name="items") def get_items_new_col(): yield from [{ "id": 5, @@ -402,7 +405,7 @@ def get_items_new_col(): "new_col": "hello" }] - @dlt.resource(name="items", columns=Items) + @dlt.resource(name="items") def get_items_subtable(): yield from [{ "id": 5, @@ -411,40 +414,39 @@ def get_items_subtable(): "sub": [{"hello": "dave"}] }] - # test variants + + # test get new items pipeline = get_pipeline() - pipeline.run([get_items()], schema_contract={"data_type": "discard_row"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - pipeline.run([get_items_variant()], schema_contract={"data_type": "discard_row"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 - pipeline.run([get_items_variant()], schema_contract={"data_type": "evolve"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - - # test new column + pipeline.run([get_items()]) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + # now items with model work + pipeline.run([get_items_with_model()]) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + # items with model alone does not work, since contract is set to freeze pipeline = get_pipeline() - pipeline.run([get_items()], schema_contract={"columns": "discard_row"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - pipeline.run([get_items_new_col()], schema_contract={"columns": "discard_row"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 0 - pipeline.run([get_items_new_col()], schema_contract={"columns": "evolve"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 + with raises_frozen_exception(True): + pipeline.run([get_items_with_model()]) # test new subtable pipeline = get_pipeline() - pipeline.run([get_items_subtable()], schema_contract={"tables": "discard_row"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 0 + pipeline.run([get_items()]) + pipeline.run([get_items_with_model()]) + with raises_frozen_exception(True): + 
pipeline.run([get_items_subtable()]) - pipeline.run([get_items_subtable()], schema_contract={"tables": "evolve"}) - assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 1 - assert pipeline.last_trace.last_normalize_info.row_counts.get("items__sub", 0) == 1 + # it is possible to override contract when there is a model + # items with model alone does not work, since contract is set to freeze + pipeline = get_pipeline() + pipeline.run([get_items_with_model()], schema_contract="evolve") + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 def test_different_objects_in_one_load() -> None: pipeline = get_pipeline() - @dlt.resource(name="items", schema_contract={"columns": "freeze", "tables":"evolve"}) + @dlt.resource(name="items") def get_items(): yield { "id": 1, @@ -458,7 +460,7 @@ def get_items(): "new_column": "some val" } - pipeline.run([get_items()]) + pipeline.run([get_items()], schema_contract={"columns": "freeze", "tables":"evolve"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 @@ -494,13 +496,13 @@ def get_items(): def test_defined_column_in_new_table(column_mode: str) -> None: pipeline = get_pipeline() - @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}], schema_contract={"columns": column_mode}) + @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) def get_items(): yield { "id": 1, "key": "value", } - pipeline.run([get_items()]) + pipeline.run([get_items()], schema_contract={"columns": column_mode}) assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 @@ -515,7 +517,6 @@ def test_new_column_from_hint_and_data(column_mode: str) -> None: @dlt.resource( name="items", - schema_contract={"columns": column_mode}, columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) def get_items(): yield { @@ -523,7 +524,7 @@ def get_items(): "key": "value", } - pipeline.run([get_items()]) + pipeline.run([get_items()], schema_contract={"columns": column_mode}) assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 @@ -536,9 +537,7 @@ def test_two_new_columns_from_two_rows(column_mode: str) -> None: # and adds a new column to complete tables in 2nd row # the test does not fail only because you clone schema in normalize - @dlt.resource( - schema_contract={"columns": column_mode} - ) + @dlt.resource() def items(): yield { "id": 1, @@ -547,7 +546,7 @@ def items(): "id": 1, "key": "value", } - pipeline.run([items()]) + pipeline.run([items()], schema_contract={"columns": column_mode}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 From 041da6df60e04409893cb1495c53b7db686b07f0 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 2 Oct 2023 12:21:15 +0200 Subject: [PATCH 42/73] move checking of new tables into extract function --- dlt/common/schema/schema.py | 7 ++- dlt/extract/extract.py | 62 ++++++++++++++++----- dlt/extract/schema.py | 6 +- dlt/pipeline/pipeline.py | 43 ++++---------- tests/load/test_freeze_and_data_contract.py | 24 +++----- 5 files changed, 76 insertions(+), 66 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 5d09145052..86338908aa 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -650,7 +650,7 @@ def _compile_settings(self) -> None: def __repr__(self) -> str: return f"Schema {self.name} at {id(self)}" -def resolve_contract_settings_for_table(parent_table: str, table_name: str, 
current_schema: Schema, incoming_schema: Schema = None) -> Tuple[bool, TSchemaContractDict]: +def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Schema, incoming_schema: Schema = None, incoming_table: TTableSchema = None) -> Tuple[bool, TSchemaContractDict]: """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: @@ -658,7 +658,10 @@ def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: if isinstance(settings, str): settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) return {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings} if settings else {} - + + if incoming_table and (incoming_table_contract_mode := resolve_single(incoming_table.get("schema_contract", {}))): + return incoming_table_contract_mode + # find table settings table = parent_table or table_name if table in current_schema.tables: diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 704823980c..1742ade224 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -1,6 +1,6 @@ import contextlib import os -from typing import ClassVar, List, Set +from typing import ClassVar, List, Set, Optional from dlt.common.configuration.container import Container from dlt.common.configuration.resolve import inject_section @@ -14,6 +14,8 @@ from dlt.common.schema import Schema, utils, TSchemaUpdate from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage from dlt.common.configuration.specs import known_sections +from dlt.common.schema.typing import TPartialTableSchema +from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.extract.decorators import SourceSchemaInjectableContext from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints @@ -61,6 +63,7 @@ def extract( source: DltSource, storage: ExtractorStorage, collector: Collector = NULL_COLLECTOR, + pipeline_schema: Schema = None, *, max_parallel_items: int = None, workers: int = None, @@ -70,6 +73,7 @@ def extract( dynamic_tables: TSchemaUpdate = {} schema = source.schema resources_with_items: Set[str] = set() + disallowed_tables: Set[str] = set() with collector(f"Extract {source.name}"): @@ -90,6 +94,8 @@ def _write_dynamic_table(resource: DltResource, item: TDataItem) -> None: table_name = resource._table_name_hint_fun(item) existing_table = dynamic_tables.get(table_name) if existing_table is None: + if not _add_dynamic_table(resource, data_item=item): + return dynamic_tables[table_name] = [resource.compute_table_schema(item)] else: # quick check if deep table merge is required @@ -103,12 +109,42 @@ def _write_dynamic_table(resource: DltResource, item: TDataItem) -> None: # write to storage with inferred table name _write_item(table_name, resource.name, item) - def _write_static_table(resource: DltResource, table_name: str) -> None: + def _write_static_table(resource: DltResource, table_name: str, item: TDataItem) -> None: existing_table = dynamic_tables.get(table_name) if existing_table is None: - static_table = resource.compute_table_schema() - static_table["name"] = table_name - dynamic_tables[table_name] = [static_table] + if not _add_dynamic_table(resource, table_name=table_name): + return + _write_item(table_name, resource.name, item) + + def _add_dynamic_table(resource: DltResource, data_item: TDataItem = None, table_name: Optional[str] = None) -> bool: + """ + Computes new table 
and does contract checks + """ + table = resource.compute_table_schema(data_item) + if table_name: + table["name"] = table_name + + # fast exit if we already evaluated this + if table["name"] in disallowed_tables: + return False + + # this is a new table so allow evolve once + # TODO: is this the correct check for a new table, should a table with only incomplete columns be new too? + is_new_table = (table["name"] not in pipeline_schema.tables) or (not pipeline_schema.tables[table["name"]]["columns"]) + if is_new_table: + table["x-normalizer"] = {"evolve_once": True} + + # apply schema contract and apply on pipeline schema + # here we only check that table may be created + schema_contract = resolve_contract_settings_for_table(None, table["name"], pipeline_schema, source.schema, table) + _, checked_table = pipeline_schema.apply_schema_contract(schema_contract, table["name"], None, table) + + if not checked_table: + disallowed_tables.add(table["name"]) + return False + + dynamic_tables[table_name] = [checked_table] + return True # yield from all selected pipes with PipeIterator.from_pipes(source.resources.selected_pipes, max_parallel_items=max_parallel_items, workers=workers, futures_poll_interval=futures_poll_interval) as pipes: @@ -130,8 +166,7 @@ def _write_static_table(resource: DltResource, table_name: str) -> None: table_name: str = None if isinstance(pipe_item.meta, TableNameMeta): table_name = pipe_item.meta.table_name - _write_static_table(resource, table_name) - _write_item(table_name, resource.name, pipe_item.item) + _write_static_table(resource, table_name, pipe_item.item) else: # get partial table from table template if resource._table_name_hint_fun: @@ -143,8 +178,7 @@ def _write_static_table(resource: DltResource, table_name: str) -> None: else: # write item belonging to table with static name table_name = resource.table_name # type: ignore - _write_static_table(resource, table_name) - _write_item(table_name, resource.name, pipe_item.item) + _write_static_table(resource, table_name, pipe_item.item) # find defined resources that did not yield any pipeitems and create empty jobs for them data_tables = {t["name"]: t for t in schema.data_tables()} @@ -173,14 +207,14 @@ def _write_static_table(resource: DltResource, table_name: str) -> None: def extract_with_schema( storage: ExtractorStorage, source: DltSource, - schema: Schema, + pipeline_schema: Schema, collector: Collector, max_parallel_items: int, workers: int, ) -> str: # generate extract_id to be able to commit all the sources together later extract_id = storage.create_extract_id() - with Container().injectable_context(SourceSchemaInjectableContext(schema)): + with Container().injectable_context(SourceSchemaInjectableContext(source.schema)): # inject the config section with the current source name with inject_section(ConfigSectionContext(sections=(known_sections.SOURCES, source.section, source.name), source_state_key=source.name)): # reset resource states @@ -189,12 +223,12 @@ def extract_with_schema( if resource.write_disposition == "replace": _reset_resource_state(resource._name) - extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) + extractor = extract(extract_id, source, storage, collector, pipeline_schema, max_parallel_items=max_parallel_items, workers=workers) # iterate over all items in the pipeline and update the schema if dynamic table hints were present for _, partials in extractor.items(): for partial in partials: - normalized_partial = 
schema.normalize_table_identifiers(partial) - schema.update_table(normalized_partial) + normalized_partial = source.schema.normalize_table_identifiers(partial) + source.schema.update_table(normalized_partial) return extract_id diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index bade6557a9..81b5751c21 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -229,9 +229,11 @@ def new_table_template( validator = None is_table_complete = False - # freeze the resource if we have a fully defined table and no other explicit contract + # freeze the columns if we have a fully defined table and no other explicit contract if not schema_contract and is_table_complete: - schema_contract = "freeze" + schema_contract = { + "columns": "freeze" + } # create a table schema template where hints can be functions taking TDataItem new_template: TTableSchemaTemplate = new_table(table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract=schema_contract) # type: ignore diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 28b3c1fc76..1c8289a74b 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -20,7 +20,6 @@ from dlt.common.runtime import signals, initialize_runtime from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract from dlt.common.schema.utils import diff_tables -from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -862,8 +861,6 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para source_schema = source.schema source_schema.update_normalizers() - extract_id = extract_with_schema(storage, source, source_schema, self.collector, max_parallel_items, workers) - # if source schema does not exist in the pipeline if source_schema.name not in self._schema_storage: # save new schema into the pipeline @@ -876,41 +873,23 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para pipeline_schema = self._schema_storage[source_schema.name] + # update global schema contract settings + if global_contract is not None: + source_schema.set_schema_contract(global_contract, True) + + extract_id = extract_with_schema(storage, source, pipeline_schema, self.collector, max_parallel_items, workers) + # initialize import with fully discovered schema self._schema_storage.save_import_schema_if_not_exists(source_schema) - # update the pipeline schema + # update the pipeline schema with all tables and contract settings for table in source_schema.data_tables(include_incomplete=True): - - # create table diff - normalized_table = pipeline_schema.normalize_table_identifiers(table) - if table["name"] in pipeline_schema.tables: - partial_table = diff_tables(pipeline_schema.tables[table["name"]], normalized_table) - else: - partial_table = normalized_table - - # figure out wether this is a new table - is_new_table = (table["name"] not in pipeline_schema.tables) or (not pipeline_schema.tables[table["name"]]["columns"]) - if is_new_table: - partial_table["x-normalizer"] = {"evolve_once": True} - - # update global schema contract settings - if global_contract is not None: - source_schema.set_schema_contract(global_contract, True) - - # apply schema contractand apply on pipeline schema - schema_contract 
= resolve_contract_settings_for_table(None, table["name"], pipeline_schema, source_schema) - _, partial_table = pipeline_schema.apply_schema_contract(schema_contract, table["name"], None, partial_table) - - # update pipeline schema - if partial_table: - pipeline_schema.update_table( - partial_table - ) - + pipeline_schema.update_table( + table + ) pipeline_schema.set_schema_contract(source_schema._settings.get("schema_contract", {})) - # globally apply contract override + # globally apply contract override again for all merged tables if global_contract is not None: pipeline_schema.set_schema_contract(global_contract, True) diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index 3a34121d28..ef4d300ed0 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -364,7 +364,7 @@ def test_single_settings_value(setting_location: str) -> None: run_resource(pipeline, new_items, {setting_location: "discard_row"}) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 - assert ("new_items" in table_counts) == (setting_location == "resource") + assert "new_items" not in table_counts def test_data_contract_interaction() -> None: @@ -414,31 +414,23 @@ def get_items_subtable(): "sub": [{"hello": "dave"}] }] - - # test get new items + # test valid object pipeline = get_pipeline() - pipeline.run([get_items()]) - assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 - # now items with model work + # items with model work pipeline.run([get_items_with_model()]) assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 - # items with model alone does not work, since contract is set to freeze + # loading once with pydantic will freeze the cols pipeline = get_pipeline() - with raises_frozen_exception(True): - pipeline.run([get_items_with_model()]) - - # test new subtable - pipeline = get_pipeline() - pipeline.run([get_items()]) pipeline.run([get_items_with_model()]) with raises_frozen_exception(True): - pipeline.run([get_items_subtable()]) + pipeline.run([get_items_new_col()]) - # it is possible to override contract when there is a model + # it is possible to override contract when there are new columns # items with model alone does not work, since contract is set to freeze pipeline = get_pipeline() - pipeline.run([get_items_with_model()], schema_contract="evolve") + pipeline.run([get_items_with_model()]) + pipeline.run([get_items_new_col()], schema_contract="evolve") assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 From b72a1a949e24496b90f8dd51f209e358f7d7271f Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 2 Oct 2023 14:09:54 +0200 Subject: [PATCH 43/73] fix most tests --- dlt/extract/extract.py | 5 +- dlt/pipeline/pipeline.py | 9 +- .../cases/schemas/eth/ethereum_schema_v7.yml | 4 +- .../schema/test_contract_mode_functions.py | 111 ++++++++++++------ tests/common/utils.py | 2 +- tests/extract/test_extract.py | 6 +- tests/extract/test_incremental.py | 1 - tests/pipeline/test_pipeline.py | 12 +- 8 files changed, 101 insertions(+), 49 deletions(-) diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 1742ade224..441f81c7fb 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -120,6 +120,7 @@ def _add_dynamic_table(resource: DltResource, data_item: TDataItem = None, table """ Computes new table and does contract checks """ + # 
TODO: We have to normalize table identifiers here table = resource.compute_table_schema(data_item) if table_name: table["name"] = table_name @@ -142,8 +143,8 @@ def _add_dynamic_table(resource: DltResource, data_item: TDataItem = None, table if not checked_table: disallowed_tables.add(table["name"]) return False - - dynamic_tables[table_name] = [checked_table] + + dynamic_tables[checked_table["name"]] = [checked_table] return True # yield from all selected pipes diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 1c8289a74b..0c75bc15d4 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -863,8 +863,11 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para # if source schema does not exist in the pipeline if source_schema.name not in self._schema_storage: - # save new schema into the pipeline - self._schema_storage.save_schema(Schema(source_schema.name)) + # TODO: here we should create a new schema but copy hints and possibly other settings + # over from the schema table. Is this the right way? + new_schema = Schema(source_schema.name) + new_schema._settings = source_schema._settings + self._schema_storage.save_schema(new_schema) # and set as default if this is first schema in pipeline if not self.default_schema_name: @@ -879,7 +882,9 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para extract_id = extract_with_schema(storage, source, pipeline_schema, self.collector, max_parallel_items, workers) + # initialize import with fully discovered schema + # TODO: is this the right location for this? self._schema_storage.save_import_schema_if_not_exists(source_schema) # update the pipeline schema with all tables and contract settings diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml index 5a8db47163..f8645d78ae 100644 --- a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -1,5 +1,5 @@ -version: 14 -version_hash: VuzNqiLOk7XuPxYLndMFMPHTDVItKU5ayiy70nQLdus= +version: 15 +version_hash: yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE= engine_version: 7 name: ethereum tables: diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py index 155ad918e2..6091c57e6f 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_contract_mode_functions.py @@ -2,6 +2,7 @@ import copy from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE +from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.common.schema.exceptions import SchemaFrozenException @@ -59,18 +60,18 @@ def test_resolve_contract_settings() -> None: # defaults schema = get_schema() - assert schema.resolve_contract_settings_for_table(None, "tables") == DEFAULT_SCHEMA_CONTRACT_MODE - assert schema.resolve_contract_settings_for_table("tables", "child_table") == DEFAULT_SCHEMA_CONTRACT_MODE + assert resolve_contract_settings_for_table(None, "tables", schema) == DEFAULT_SCHEMA_CONTRACT_MODE + assert resolve_contract_settings_for_table("tables", "child_table", schema) == DEFAULT_SCHEMA_CONTRACT_MODE # table specific full setting schema = get_schema() schema.tables["tables"]["schema_contract"] = "freeze" - assert schema.resolve_contract_settings_for_table(None, "tables") == { + assert resolve_contract_settings_for_table(None, "tables", schema) == { "tables": "freeze", "columns": "freeze", 
"data_type": "freeze" } - assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + assert resolve_contract_settings_for_table("tables", "child_table", schema) == { "tables": "freeze", "columns": "freeze", "data_type": "freeze" @@ -82,12 +83,12 @@ def test_resolve_contract_settings() -> None: "tables": "freeze", "columns": "discard_value", } - assert schema.resolve_contract_settings_for_table(None, "tables") == { + assert resolve_contract_settings_for_table(None, "tables", schema) == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" } - assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + assert resolve_contract_settings_for_table("tables", "child_table", schema) == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" @@ -96,12 +97,12 @@ def test_resolve_contract_settings() -> None: # schema specific full setting schema = get_schema() schema._settings["schema_contract"] = "freeze" - assert schema.resolve_contract_settings_for_table(None, "tables") == { + assert resolve_contract_settings_for_table(None, "tables", schema) == { "tables": "freeze", "columns": "freeze", "data_type": "freeze" } - assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + assert resolve_contract_settings_for_table("tables", "child_table", schema) == { "tables": "freeze", "columns": "freeze", "data_type": "freeze" @@ -113,35 +114,79 @@ def test_resolve_contract_settings() -> None: "tables": "freeze", "columns": "discard_value", } - assert schema.resolve_contract_settings_for_table(None, "tables") == { + assert resolve_contract_settings_for_table(None, "tables", schema) == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" } - assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + assert resolve_contract_settings_for_table("tables", "child_table", schema) == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" } - # mixed settings + # mixed settings: table setting always prevails schema = get_schema() schema._settings["schema_contract"] = "freeze" schema.tables["tables"]["schema_contract"] = { "tables": "evolve", "columns": "discard_value", } - assert schema.resolve_contract_settings_for_table(None, "tables") == { + assert resolve_contract_settings_for_table(None, "tables", schema) == { "tables": "evolve", "columns": "discard_value", - "data_type": "freeze" + "data_type": "evolve" } - assert schema.resolve_contract_settings_for_table("tables", "child_table") == { + assert resolve_contract_settings_for_table("tables", "child_table", schema) == { "tables": "evolve", "columns": "discard_value", + "data_type": "evolve" + } + + # current and incoming schema + current_schema = get_schema() + current_schema._settings["schema_contract"] = "discard_value" + incoming_schema = get_schema() + incoming_schema._settings["schema_contract"] = "discard_row" + incoming_table = {"name": "incomplete_table", "schema_contract": "freeze"} + + + # incoming schema overrides + assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema) == { + "tables": "discard_row", + "columns": "discard_row", + "data_type": "discard_row" + } + + # direct incoming table overrides + assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema, incoming_table) == { + "tables": "freeze", + "columns": "freeze", "data_type": "freeze" } + # table defined on existing schema overrided incoming schema setting + 
current_schema.tables["tables"]["schema_contract"] = "discard_value" + assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema) == { + "tables": "discard_value", + "columns": "discard_value", + "data_type": "discard_value" + } + + # but table on incoming schema overrides again + incoming_schema.tables["tables"]["schema_contract"] = "discard_row" + assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema) == { + "tables": "discard_row", + "columns": "discard_row", + "data_type": "discard_row" + } + + # incoming table still overrides all + assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema, incoming_table) == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } # ensure other settings do not interfere with the main setting we are testing base_settings = [{ @@ -178,12 +223,12 @@ def test_check_adding_table(base_settings) -> None: # # check adding new table # - assert schema.apply_schema_contract({**base_settings, **{"tables": "evolve"}}, "new_table", data, new_table, False) == (data, new_table) - assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_row"}}, "new_table", data, new_table, False) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_value"}}, "new_table", data, new_table, False) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"tables": "evolve"}}, "new_table", data, new_table) == (data, new_table) + assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_row"}}, "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_value"}}, "new_table", data, new_table) == (None, None) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"tables": "freeze"}}, "new_table", data, new_table, False) + schema.apply_schema_contract({**base_settings, **{"tables": "freeze"}}, "new_table", data, new_table) @pytest.mark.parametrize("base_settings", base_settings) @@ -213,12 +258,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("new_column") - assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), table_update, True) + 
schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), table_update) # @@ -245,12 +290,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("incomplete_column_1") # incomplete columns should be treated like new columns - assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) == (data, popped_table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update, True) + schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) @@ -281,16 +326,16 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) + schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) # check interaction with new 
columns settings, variants are new columns.. with pytest.raises(SchemaFrozenException): - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data_with_new_row, table_update) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update), True) == (data, popped_table_update) \ No newline at end of file + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file diff --git a/tests/common/utils.py b/tests/common/utils.py index 4c68e32bf3..c80981df37 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -15,7 +15,7 @@ COMMON_TEST_CASES_PATH = "./tests/common/cases/" # for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V7 = "VuzNqiLOk7XuPxYLndMFMPHTDVItKU5ayiy70nQLdus=" +IMPORTED_VERSION_HASH_ETH_V7 = "yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE=" # test sentry DSN TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" # preserve secrets path to be able to restore it diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 530a089f1c..b35de90c1a 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -3,7 +3,7 @@ from dlt.common.storages import NormalizeStorageConfiguration from dlt.extract.extract import ExtractorStorage, extract from dlt.extract.source import DltResource, DltSource - +from dlt.common.schema import Schema from tests.utils import clean_test_storage from tests.extract.utils import expect_extracted_file @@ -18,7 +18,7 @@ def expect_tables(resource: DltResource) -> dlt.Schema: storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) + schema_update = extract(extract_id, source, storage, pipeline_schema=Schema("some_schema")) # odd and even tables assert len(schema_update) == 2 assert "odd_table" in schema_update @@ -42,7 +42,7 @@ def expect_tables(resource: DltResource) -> dlt.Schema: source = source.with_resources(resource.name) source.selected_resources[resource.name].bind(10).select_tables("odd_table") extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) + schema_update = extract(extract_id, source, storage, pipeline_schema=Schema("some_schema")) assert len(schema_update) == 1 assert "odd_table" in schema_update for partials in schema_update.values(): diff --git 
a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 3160a2a1ee..8a5ea84982 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -404,7 +404,6 @@ def some_data(last_timestamp=dlt.sources.incremental("item.timestamp")): @dlt.resource def standalone_some_data(now=None, last_timestamp=dlt.sources.incremental("item.timestamp")): for i in range(-10, 10): - print(i) yield {"delta": i, "item": {"timestamp": (now or pendulum.now()).add(days=i).timestamp()}} diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index c668d81073..0bcaa2b70b 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -740,11 +740,13 @@ def some_source(): p = dlt.pipeline(pipeline_name=uniq_id(), destination='dummy') p.run(source) - assert source.schema.tables['some_table']['resource'] == 'static_data' - assert source.schema.tables['dynamic_func_table']['resource'] == 'dynamic_func_data' - assert source.schema.tables['dynamic_mark_table']['resource'] == 'dynamic_mark_data' - assert source.schema.tables['parent_table']['resource'] == 'nested_data' - assert 'resource' not in source.schema.tables['parent_table__items'] + schema = p.default_schema + + assert schema.tables['some_table']['resource'] == 'static_data' + assert schema.tables['dynamic_func_table']['resource'] == 'dynamic_func_data' + assert schema.tables['dynamic_mark_table']['resource'] == 'dynamic_mark_data' + assert schema.tables['parent_table']['resource'] == 'nested_data' + assert 'resource' not in schema.tables['parent_table__items'] def test_preserve_fields_order() -> None: From 2ae36e351921f39fe11ed5485e19e8e385cc8979 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 2 Oct 2023 15:05:40 +0200 Subject: [PATCH 44/73] fix linter after merge --- dlt/common/schema/__init__.py | 2 +- dlt/common/schema/schema.py | 10 +-- dlt/common/schema/typing.py | 23 +++--- dlt/extract/extract.py | 4 +- dlt/extract/schema.py | 2 +- dlt/normalize/normalize.py | 2 +- dlt/pipeline/pipeline.py | 2 +- .../schema/test_contract_mode_functions.py | 82 ++++++++++--------- tests/common/schema/test_filtering.py | 2 +- tests/common/schema/test_versioning.py | 2 +- tests/common/test_typing.py | 20 ++--- tests/common/test_validation.py | 4 +- tests/load/test_freeze_and_data_contract.py | 4 +- tests/load/test_job_client.py | 4 +- tests/load/utils.py | 2 +- 15 files changed, 84 insertions(+), 81 deletions(-) diff --git a/dlt/common/schema/__init__.py b/dlt/common/schema/__init__.py index b48e0a223d..bd719556c6 100644 --- a/dlt/common/schema/__init__.py +++ b/dlt/common/schema/__init__.py @@ -1,4 +1,4 @@ -from dlt.common.schema.typing import TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase # noqa: F401 +from dlt.common.schema.typing import TSchemaContractDict, TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase # noqa: F401 from dlt.common.schema.typing import COLUMN_HINTS # noqa: F401 from dlt.common.schema.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE # noqa: F401 from dlt.common.schema.utils import verify_schema_hash # noqa: F401 diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 86338908aa..2075e5789c 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -236,7 +236,7 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: return row, partial_table 
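A note for orientation on the contract resolution reworked in this commit: test_resolve_contract_settings earlier in this series pins down how shorthand settings expand and which setting wins when several are present. The sketch below restates that behaviour in isolation; it is not the library code, the helper name expand_contract is invented here, and the default mode is assumed to be "evolve" for all three keys, as the tests imply.

from typing import Dict, Union

DEFAULT_MODE = {"tables": "evolve", "columns": "evolve", "data_type": "evolve"}

def expand_contract(settings: Union[str, Dict[str, str], None]) -> Dict[str, str]:
    # a shorthand string applies one mode to tables, columns and data_type;
    # a partial dict is merged over the defaults
    if isinstance(settings, str):
        settings = {"tables": settings, "columns": settings, "data_type": settings}
    return {**DEFAULT_MODE, **(settings or {})}

# string shorthand expands to all three keys
assert expand_contract("freeze") == {"tables": "freeze", "columns": "freeze", "data_type": "freeze"}
# a partial dict keeps the default for unspecified keys
assert expand_contract({"tables": "freeze", "columns": "discard_value"}) == {
    "tables": "freeze", "columns": "discard_value", "data_type": "evolve"}

# precedence exercised by the tests, highest first:
# 1. the incoming table passed explicitly to resolve_contract_settings_for_table
# 2. "schema_contract" on that table in the incoming schema
# 3. "schema_contract" on the same table in the current (pipeline) schema
# 4. the incoming schema's schema-level setting
# 5. the current schema's schema-level setting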
# if evolve once is set, allow all column changes - evolve_once = (table_name in self.tables) and self.tables[table_name].get("x-normalizer", {}).get("evolve_once", False) + evolve_once = (table_name in self.tables) and self.tables[table_name].get("x-normalizer", {}).get("evolve_once", False) # type: ignore[attr-defined] if evolve_once: return row, partial_table @@ -650,18 +650,18 @@ def _compile_settings(self) -> None: def __repr__(self) -> str: return f"Schema {self.name} at {id(self)}" -def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Schema, incoming_schema: Schema = None, incoming_table: TTableSchema = None) -> Tuple[bool, TSchemaContractDict]: +def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Schema, incoming_schema: Schema = None, incoming_table: TTableSchema = None) -> TSchemaContractDict: """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: settings = settings or {} if isinstance(settings, str): settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) - return {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings} if settings else {} - + return cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings} if settings else {}) + if incoming_table and (incoming_table_contract_mode := resolve_single(incoming_table.get("schema_contract", {}))): return incoming_table_contract_mode - + # find table settings table = parent_table or table_name if table in current_schema.tables: diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index c5f1f62e1e..68a95d0656 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -88,17 +88,18 @@ class NormalizerInfo(TypedDict, total=True): new_table: bool # TypedDict that defines properties of a table -TTableSchema = TypedDict("TTableSchema", { - "name": Optional[str], - "description": Optional[str], - "write_disposition": Optional[TWriteDisposition], - "schema_contract": Optional[TSchemaContract], - "parent": Optional[str], - "filters": Optional[TRowFilters], - "columns": TTableSchemaColumns, - "resource": Optional[str], - "x-normalizer": Optional[NormalizerInfo], -}) + +class TTableSchema(TypedDict, total=False): + """TypedDict that defines properties of a table""" + name: Optional[str] + description: Optional[str] + write_disposition: Optional[TWriteDisposition] + schema_contract: Optional[TSchemaContract] + table_sealed: Optional[bool] + parent: Optional[str] + filters: Optional[TRowFilters] + columns: TTableSchemaColumns + resource: Optional[str] class TPartialTableSchema(TTableSchema): pass diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 441f81c7fb..004851cf1a 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -133,8 +133,8 @@ def _add_dynamic_table(resource: DltResource, data_item: TDataItem = None, table # TODO: is this the correct check for a new table, should a table with only incomplete columns be new too? 
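The "evolve_once" marker that these hunks pass around is a one-shot escape hatch: extract stamps it onto tables that the pipeline schema does not know yet, apply_schema_contract lets every column change through while it is present, and normalize pops it before saving the schema so later loads face the full contract. A rough sketch of that lifecycle with plain dicts standing in for the real Schema objects (illustrative only):

# a table computed at extract time that is new to the pipeline schema
table = {"name": "items", "columns": {}}
table["x-normalizer"] = {"evolve_once": True}   # stamped once at extract time

# during the contract check the marker short-circuits column enforcement
evolve_once = table.get("x-normalizer", {}).get("evolve_once", False)
assert evolve_once is True

# after the first normalization the marker is popped and the schema is saved
needs_schema_save = bool(table.pop("x-normalizer", None))
assert needs_schema_save and "x-normalizer" not in table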
is_new_table = (table["name"] not in pipeline_schema.tables) or (not pipeline_schema.tables[table["name"]]["columns"]) if is_new_table: - table["x-normalizer"] = {"evolve_once": True} - + table["x-normalizer"] = {"evolve_once": True} # type: ignore[typeddict-unknown-key] + # apply schema contract and apply on pipeline schema # here we only check that table may be created schema_contract = resolve_contract_settings_for_table(None, table["name"], pipeline_schema, source.schema, table) diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index 81b5751c21..422ae81333 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -224,7 +224,7 @@ def new_table_template( columns = ensure_table_schema_columns_hint(columns) if not callable(columns): columns = columns.values() # type: ignore - is_table_complete = len([c for c in columns if c.get("name") and c.get("data_type")]) + is_table_complete = len([c for c in columns if c.get("name") and c.get("data_type")]) # type: ignore else: validator = None is_table_complete = False diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 8b294e1f2e..e212a2234f 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -281,7 +281,7 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files needs_schema_save = len(schema_updates) > 0 # remove normalizer specific info for table in schema.tables.values(): - if table.pop("x-normalizer", None): + if table.pop("x-normalizer", None): # type: ignore[typeddict-item] needs_schema_save = True # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) if needs_schema_save: diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 764ae635f0..682399ba73 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -893,7 +893,7 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para table ) pipeline_schema.set_schema_contract(source_schema._settings.get("schema_contract", {})) - + # globally apply contract override again for all merged tables if global_contract is not None: pipeline_schema.set_schema_contract(global_contract, True) diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_contract_mode_functions.py index 6091c57e6f..95f2d0e267 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_contract_mode_functions.py @@ -1,10 +1,12 @@ +from typing import cast + import pytest import copy -from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE +from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE, TSchemaContractDict from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.common.schema.exceptions import SchemaFrozenException - +from dlt.common.schema.typing import TTableSchema def get_schema() -> Schema: s = Schema("event") @@ -12,11 +14,11 @@ def get_schema() -> Schema: columns = { "column_1": { "name": "column_1", - "data_type": "string" + "data_type": "text" }, "column_2": { "name": "column_2", - "data_type": "number", + "data_type": "bigint", "is_variant": True } } @@ -32,26 +34,26 @@ def get_schema() -> Schema: # add some tables - s.update_table({ + s.update_table(cast(TTableSchema, { "name": "tables", "columns": columns - }) + })) - s.update_table({ + s.update_table(cast(TTableSchema, { "name": "child_table", "parent": "tables", "columns": columns - }) + })) - s.update_table({ + 
s.update_table(cast(TTableSchema, { "name": "incomplete_table", "columns": incomplete_columns - }) + })) - s.update_table({ + s.update_table(cast(TTableSchema, { "name": "mixed_table", "columns": {**incomplete_columns, **columns} - }) + })) return s @@ -148,7 +150,7 @@ def test_resolve_contract_settings() -> None: current_schema._settings["schema_contract"] = "discard_value" incoming_schema = get_schema() incoming_schema._settings["schema_contract"] = "discard_row" - incoming_table = {"name": "incomplete_table", "schema_contract": "freeze"} + incoming_table: TTableSchema = {"name": "incomplete_table", "schema_contract": "freeze"} # incoming schema overrides @@ -223,12 +225,12 @@ def test_check_adding_table(base_settings) -> None: # # check adding new table # - assert schema.apply_schema_contract({**base_settings, **{"tables": "evolve"}}, "new_table", data, new_table) == (data, new_table) - assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_row"}}, "new_table", data, new_table) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"tables": "discard_value"}}, "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), "new_table", data, new_table) == (data, new_table) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), "new_table", data, new_table) == (None, None) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"tables": "freeze"}}, "new_table", data, new_table) + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), "new_table", data, new_table) @pytest.mark.parametrize("base_settings", base_settings) @@ -243,27 +245,27 @@ def test_check_adding_new_columns(base_settings) -> None: "column_2": 123 } data_with_new_row = { - **data, + **data, # type: ignore "new_column": "some string" } - table_update = { + table_update: TTableSchema = { "name": "tables", "columns": { "new_column": { "name": "new_column", - "data_type": "string" + "data_type": "text" } } } popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("new_column") - assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) 
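The (row, partial_table) pairs asserted in these tests are what the rest of the pipeline acts on: a None row drops the item, a trimmed partial table drops only the offending columns, and freeze surfaces as SchemaFrozenException. A condensed, hypothetical consumer loop in the spirit of the items_normalizers changes later in this series (storage writes and error handling omitted, the wrapper name normalize_rows is invented):

def normalize_rows(schema, contract_modes, table_name, coerced_items):
    # coerced_items: (row, partial_table) pairs as produced by coerce_row
    for row, partial_table in coerced_items:
        if partial_table:
            row, partial_table = schema.apply_schema_contract(
                schema, contract_modes, table_name, row, partial_table)
            if not row:
                # discard_row: the whole item is skipped
                continue
            if partial_table:
                # the contract allowed the migration, merge it into the schema
                schema.update_table(partial_table)
        yield table_name, row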
with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), table_update) # @@ -274,7 +276,7 @@ def test_check_adding_new_columns(base_settings) -> None: "column_2": 123, } data_with_new_row = { - **data, + **data, # type: ignore "incomplete_column_1": "some other string", } table_update = { @@ -282,7 +284,7 @@ def test_check_adding_new_columns(base_settings) -> None: "columns": { "incomplete_column_1": { "name": "incomplete_column_1", - "data_type": "string" + "data_type": "text" } } } @@ -290,12 +292,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("incomplete_column_1") # incomplete columns should be treated like new columns - assert schema.apply_schema_contract({**base_settings, **{"columns": "evolve"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_row"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract({**base_settings, **{"columns": "discard_value"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**base_settings, **{"columns": "freeze"}}, "mixed_table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) @@ -310,15 +312,15 @@ def test_check_adding_new_variant() -> None: "column_2": 123 } data_with_new_row = { - **data, + **data, # type: ignore "column_2_variant": 345345 } - table_update = { + table_update: TTableSchema = { "name": "tables", "columns": { "column_2_variant": { "name": "column_2_variant", - "data_type": "number", + "data_type": "bigint", "variant": True } } @@ -326,16 +328,16 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), 
copy.deepcopy(table_update)) == (data, popped_table_update) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) + schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) # check interaction with new columns settings, variants are new columns.. with pytest.raises(SchemaFrozenException): - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract({**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}, "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file diff --git a/tests/common/schema/test_filtering.py b/tests/common/schema/test_filtering.py index 371c3dc5af..9a7fe01f54 100644 --- a/tests/common/schema/test_filtering.py +++ b/tests/common/schema/test_filtering.py @@ -120,5 +120,5 @@ def _add_excludes(schema: Schema) -> None: bot_table["filters"]["includes"] = [ TSimpleRegex("re:^data__custom$"), TSimpleRegex("re:^custom_data__included_object__"), TSimpleRegex("re:^metadata__elvl1__elvl2__") ] - schema.update_schema(bot_table) + schema.update_table(bot_table) schema._compile_settings() diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 7c7e006aae..4e4278a539 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -83,7 +83,7 @@ def 
test_infer_column_bumps_version() -> None: def test_preserve_version_on_load() -> None: - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") + eth_v7: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") version = eth_v7["version"] version_hash = eth_v7["version_hash"] schema = Schema.from_dict(eth_v7) # type: ignore[arg-type] diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index f3d67e7bf9..41d3d8d274 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -30,7 +30,7 @@ def test_is_list_generic_type() -> None: assert is_list_generic_type(List[str]) is True assert is_list_generic_type(Sequence[str]) is True assert is_list_generic_type(MutableSequence[str]) is True - assert is_list_generic_type(TOptionalUnionLiTyDi) is False + assert is_list_generic_type(TOptionalUnionLiTyDi) is False # type: ignore[arg-type] def test_is_dict_generic_type() -> None: @@ -49,20 +49,20 @@ def test_optional() -> None: assert is_optional_type(TOptionalLi) is True # type: ignore[arg-type] assert is_optional_type(TOptionalTyDi) is True # type: ignore[arg-type] assert is_optional_type(TTestTyDi) is False - assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] - assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] # type: ignore[arg-type] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] # type: ignore[arg-type] def test_union_types() -> None: - assert is_optional_type(TOptionalLi) is True - assert is_optional_type(TOptionalTyDi) is True + assert is_optional_type(TOptionalLi) is True # type: ignore[arg-type] + assert is_optional_type(TOptionalTyDi) is True # type: ignore[arg-type] assert is_optional_type(TTestTyDi) is False - assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] - assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] - assert is_optional_type(TOptionalUnionLiTyDi) is True - assert extract_union_types(TOptionalUnionLiTyDi) == [TTestTyDi, TTestLi, type(None)] + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] # type: ignore[arg-type] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] # type: ignore[arg-type] + assert is_optional_type(TOptionalUnionLiTyDi) is True # type: ignore[arg-type] + assert extract_union_types(TOptionalUnionLiTyDi) == [TTestTyDi, TTestLi, type(None)] # type: ignore[arg-type] assert is_union_type(MutableSequence[str]) is False - + def test_is_newtype() -> None: NT1 = NewType("NT1", str) diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index 919ed59f57..0a034dc72f 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -239,7 +239,7 @@ def test_nested_union(test_doc: TTestRecord) -> None: test_doc["f_optional_union"] = {"field": "uno"} validate_dict(TTestRecord, TEST_DOC, ".") - test_doc["f_optional_union"] = {"field": "not valid"} + test_doc["f_optional_union"] = {"field": "not valid"} # type: ignore[typeddict-item] with pytest.raises(DictValidationException) as e: validate_dict(TTestRecord, test_doc, ".") assert e.value.field == "f_optional_union" @@ -248,7 +248,7 @@ def test_nested_union(test_doc: TTestRecord) -> None: test_doc["f_optional_union"] = "dos" validate_dict(TTestRecord, test_doc, ".") - test_doc["f_optional_union"] = "blah" + test_doc["f_optional_union"] = "blah" # type: ignore[typeddict-item] with pytest.raises(DictValidationException) as e: 
validate_dict(TTestRecord, test_doc, ".") assert e.value.field == "f_optional_union" diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_freeze_and_data_contract.py index ef4d300ed0..7b2beee84c 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_freeze_and_data_contract.py @@ -104,7 +104,7 @@ def load_items(): NEW_ITEMS_TABLE = "new_items" -def run_resource(pipeline, resource_fun, settings) -> DltSource: +def run_resource(pipeline, resource_fun, settings) -> None: for item in settings.keys(): assert item in LOCATIONS @@ -557,7 +557,7 @@ def columns(item): if item["id"] == 2: return [{"name": "id", "data_type": "bigint", "nullable": True}] - @dlt.resource(name="items", table_name=lambda i: "items", schema_contract={"columns": column_mode}) + @dlt.resource(name="items", table_name=lambda i: "items", schema_contract={"columns": column_mode}) # type: ignore def get_items(): yield { "id": 1, diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 79c1c4719a..67569a89e2 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -592,7 +592,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: pytest.skip("preferred loader file format not set, destination will only work with staging") user_table = load_table("event_user")["event_user"] - client.schema.update_schema(new_table("event_user", columns=list(user_table.values()))) + client.schema.update_table(new_table("event_user", columns=list(user_table.values()))) client.schema.bump_version() schema_update = client.update_stored_schema() assert len(schema_update) > 0 @@ -646,7 +646,7 @@ def prepare_schema(client: SqlJobClientBase, case: str) -> Tuple[List[Dict[str, # use first row to infer table table: TTableSchemaColumns = {k: client.schema._infer_column(k, v) for k, v in rows[0].items()} table_name = f"event_{case}_{uniq_id()}" - client.schema.update_schema(new_table(table_name, columns=list(table.values()))) + client.schema.update_table(new_table(table_name, columns=list(table.values()))) client.schema.bump_version() client.update_stored_schema() return rows, table_name diff --git a/tests/load/utils.py b/tests/load/utils.py index d8455e3c6d..9941bbb55e 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -188,7 +188,7 @@ def prepare_table(client: JobClientBase, case_name: str = "event_user", table_na user_table_name = table_name + uniq_id() else: user_table_name = table_name - client.schema.update_schema(new_table(user_table_name, columns=list(user_table.values()))) + client.schema.update_table(new_table(user_table_name, columns=list(user_table.values()))) client.schema.bump_version() client.update_stored_schema() return user_table_name From d85e04f4798ec7c1d094053eb3240000b77a8ceb Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 2 Oct 2023 15:33:25 +0200 Subject: [PATCH 45/73] small cleanup --- dlt/common/schema/schema.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 2075e5789c..fe7dc884ff 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -195,9 +195,6 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def is_table_populated(self, table_name: str) -> bool: - return table_name in self.tables and (self.tables[table_name].get("populated") is True) - def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, 
partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """ Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out From 302d90959a2ab953c4d88ba83b3bffdd1ed5ed06 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 17 Oct 2023 22:08:43 +0200 Subject: [PATCH 46/73] post merge code updates --- dlt/common/schema/schema.py | 21 +++-- dlt/extract/decorators.py | 7 +- dlt/extract/extract.py | 59 ++++++++++---- dlt/normalize/items_normalizers.py | 78 +++++++++++-------- dlt/pipeline/pipeline.py | 17 ++-- ...e_functions.py => test_schema_contract.py} | 6 +- ...a_contract.py => test_schema_contracts.py} | 2 +- 7 files changed, 122 insertions(+), 68 deletions(-) rename tests/common/schema/{test_contract_mode_functions.py => test_schema_contract.py} (99%) rename tests/load/{test_freeze_and_data_contract.py => test_schema_contracts.py} (99%) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 8ff475d732..262f12bb2c 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -195,7 +195,8 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: + @staticmethod + def apply_schema_contract(schema: "Schema", contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """ Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out columns for both the data and the schema_update or reject the update completely, depending on the mode. An example settings could be: @@ -219,31 +220,31 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table - is_new_table = (table_name not in self.tables) or (not self.tables[table_name]["columns"]) + is_new_table = not schema or (table_name not in schema.tables) or (not schema.tables[table_name]["columns"]) # check case where we have a new table if is_new_table: if contract_modes["tables"] in ["discard_row", "discard_value"]: return None, None if contract_modes["tables"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to add table {table_name} but new tables are frozen.") + raise SchemaFrozenException(schema.name if schema else "", table_name, f"Trying to add table {table_name} but new tables are frozen.") # in case we only check table creation in pipeline if not row: return row, partial_table # if evolve once is set, allow all column changes - evolve_once = (table_name in self.tables) and self.tables[table_name].get("x-normalizer", {}).get("evolve_once", False) # type: ignore[attr-defined] + evolve_once = (table_name in schema.tables) and schema.tables[table_name].get("x-normalizer", {}).get("evolve_once", False) # type: ignore[attr-defined] if evolve_once: return row, partial_table # check columns for item in list(row.keys()): # dlt cols may always be added - if item.startswith(self._dlt_tables_prefix): + if item.startswith(schema._dlt_tables_prefix): continue # if this is a new column for an existing table... 
- if not is_new_table and (item not in self.tables[table_name]["columns"] or not utils.is_complete_column(self.tables[table_name]["columns"][item])): + if not is_new_table and (item not in schema.tables[table_name]["columns"] or not utils.is_complete_column(schema.tables[table_name]["columns"][item])): is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") if contract_modes["columns"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): row.pop(item) @@ -251,9 +252,9 @@ def apply_schema_contract(self, contract_modes: TSchemaContractDict, table_name: elif contract_modes["columns"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): return None, None elif is_variant and contract_modes["data_type"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {item} to table {table_name} data_types are frozen.") + raise SchemaFrozenException(schema.name, table_name, f"Trying to create new variant column {item} to table {table_name} data_types are frozen.") elif contract_modes["columns"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") + raise SchemaFrozenException(schema.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") return row, partial_table @@ -463,6 +464,8 @@ def update_normalizers(self) -> None: self._configure_normalizers(normalizers) def set_schema_contract(self, settings: TSchemaContract, update_table_settings: bool = False) -> None: + if not settings: + return self._settings["schema_contract"] = settings if update_table_settings: for table in self.tables.values(): @@ -666,6 +669,8 @@ def __repr__(self) -> str: def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Schema, incoming_schema: Schema = None, incoming_table: TTableSchema = None) -> TSchemaContractDict: """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" + current_schema = current_schema or incoming_schema + def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: settings = settings or {} if isinstance(settings, str): diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index fe1fd16ab0..2d53258a65 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -69,7 +69,7 @@ def source( root_key: bool = False, schema: Schema = None, schema_contract: TSchemaContract = None, - spec: Type[BaseConfiguration] = None + spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] ) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: ... @@ -83,9 +83,8 @@ def source( root_key: bool = False, schema: Schema = None, schema_contract: TSchemaContract = None, - spec: Type[BaseConfiguration] = None + spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] ->>>>>>> devel ) -> Any: """A decorator that transforms a function returning one or more `dlt resources` into a `dlt source` in order to load it with `dlt`. 
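Taken together, the decorator and pipeline changes in this series allow a contract to be declared at three levels, with the run-level argument acting as an override (this is what tests/load/test_schema_contracts.py asserts). A hedged usage sketch, assuming a dlt build that already contains these changes; the resource, source and pipeline names are invented for illustration:

import dlt

@dlt.resource(name="items", schema_contract={"columns": "discard_row"})
def items():
    yield {"id": 1, "name": "first"}

@dlt.source(schema_contract="evolve")
def demo_source():
    return items()

# 'dummy' is the destination the tests in this series use
pipeline = dlt.pipeline(pipeline_name="contracts_demo", destination="dummy")

# the run-level setting overrides what the source and the resource declared
pipeline.run(demo_source(), schema_contract={"columns": "freeze"})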
@@ -358,7 +357,7 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa columns=columns, primary_key=primary_key, merge_key=merge_key, - schema_contract=schema_contract + schema_contract=schema_contract, table_format=table_format ) return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, data_from), incremental=incremental) diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index c4a9bc8c8c..dfd2a43e6b 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -1,6 +1,6 @@ import contextlib import os -from typing import ClassVar, List, Set, Dict, Type, Any, Sequence, Optional +from typing import ClassVar, List, Set, Dict, Type, Any, Sequence, Optional, Set from collections import defaultdict from dlt.common.configuration.container import Container @@ -116,7 +116,8 @@ def __init__( schema: Schema, resources_with_items: Set[str], dynamic_tables: TSchemaUpdate, - collector: Collector = NULL_COLLECTOR + collector: Collector = NULL_COLLECTOR, + pipeline_schema: Schema = None ) -> None: self._storage = storage self.schema = schema @@ -124,6 +125,8 @@ def __init__( self.collector = collector self.resources_with_items = resources_with_items self.extract_id = extract_id + self.disallowed_tables: Set[str] = set() + self.pipeline_schema = pipeline_schema @property def storage(self) -> ExtractorItemStorage: @@ -148,7 +151,6 @@ def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> No if isinstance(meta, TableNameMeta): table_name = meta.table_name self._write_static_table(resource, table_name, items) - self._write_item(table_name, resource.name, items) else: if resource._table_name_hint_fun: if isinstance(items, list): @@ -160,7 +162,6 @@ def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> No # write item belonging to table with static name table_name = resource.table_name # type: ignore[assignment] self._write_static_table(resource, table_name, items) - self._write_item(table_name, resource.name, items) def write_empty_file(self, table_name: str) -> None: table_name = self.schema.naming.normalize_table_identifier(table_name) @@ -179,7 +180,8 @@ def _write_dynamic_table(self, resource: DltResource, item: TDataItem) -> None: table_name = resource._table_name_hint_fun(item) existing_table = self.dynamic_tables.get(table_name) if existing_table is None: - self.dynamic_tables[table_name] = [resource.compute_table_schema(item)] + if not self._add_dynamic_table(resource, data_item=item): + return else: # quick check if deep table merge is required if resource._table_has_other_dynamic_hints: @@ -195,9 +197,40 @@ def _write_dynamic_table(self, resource: DltResource, item: TDataItem) -> None: def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: existing_table = self.dynamic_tables.get(table_name) if existing_table is None: - static_table = resource.compute_table_schema() - static_table["name"] = table_name - self.dynamic_tables[table_name] = [static_table] + if not self._add_dynamic_table(resource, table_name=table_name): + return + self._write_item(table_name, resource.name, items) + + def _add_dynamic_table(self, resource: DltResource, data_item: TDataItem = None, table_name: Optional[str] = None) -> bool: + """ + Computes new table and does contract checks + """ + # TODO: We have to normalize table identifiers here + table = resource.compute_table_schema(data_item) + if table_name: + table["name"] = table_name + + # fast exit if 
we already evaluated this + if table["name"] in self.disallowed_tables: + return False + + # this is a new table so allow evolve once + # TODO: is this the correct check for a new table, should a table with only incomplete columns be new too? + is_new_table = (self.pipeline_schema == None) or (table["name"] not in self.pipeline_schema.tables) or (not self.pipeline_schema.tables[table["name"]]["columns"]) + if is_new_table: + table["x-normalizer"] = {"evolve_once": True} # type: ignore[typeddict-unknown-key] + + # apply schema contract and apply on pipeline schema + # here we only check that table may be created + schema_contract = resolve_contract_settings_for_table(None, table["name"], self.pipeline_schema, self.schema, table) + _, checked_table = Schema.apply_schema_contract(self.pipeline_schema, schema_contract, table["name"], None, table) + + if not checked_table: + self.disallowed_tables.add(table["name"]) + return False + + self.dynamic_tables[checked_table["name"]] = [checked_table] + return True class JsonLExtractor(Extractor): @@ -238,6 +271,7 @@ def extract( storage: ExtractorStorage, collector: Collector = NULL_COLLECTOR, *, + pipeline_schema: Schema = None, max_parallel_items: int = None, workers: int = None, futures_poll_interval: float = None @@ -247,10 +281,10 @@ def extract( resources_with_items: Set[str] = set() extractors: Dict[TLoaderFileFormat, Extractor] = { "puae-jsonl": JsonLExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector, pipeline_schema=pipeline_schema ), "arrow": ArrowExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector, pipeline_schema=pipeline_schema ) } last_item_format: Optional[TLoaderFileFormat] = None @@ -318,11 +352,10 @@ def extract_with_schema( with contextlib.suppress(DataItemRequiredForDynamicTableHints): if resource.write_disposition == "replace": reset_resource_state(resource.name) - - extractor = extract(extract_id, source, storage, collector, pipeline_schema, max_parallel_items=max_parallel_items, workers=workers) + extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers, pipeline_schema=pipeline_schema) # iterate over all items in the pipeline and update the schema if dynamic table hints were present for _, partials in extractor.items(): for partial in partials: - schema.update_table(schema.normalize_table_identifiers(partial)) + source.schema.update_table(source.schema.normalize_table_identifiers(partial)) return extract_id diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 2f613f4b40..526345bc89 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -5,12 +5,12 @@ from dlt.common import json, logger from dlt.common.json import custom_pua_decode from dlt.common.runtime import signals -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.common.storages import NormalizeStorage, LoadStorage, NormalizeStorageConfiguration, FileStorage +from dlt.common.schema.typing import TTableSchemaColumns, TSchemaContractDict +from dlt.common.storages import NormalizeStorage, LoadStorage, FileStorage from dlt.common.typing import TDataItem from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.utils import TRowCount, 
merge_row_count, increase_row_count - +from dlt.common.schema.schema import resolve_contract_settings_for_table class ItemsNormalizer(Protocol): def __call__( @@ -41,45 +41,57 @@ def _normalize_chunk( schema_name = schema.name items_count = 0 row_counts: TRowCount = {} + schema_contract: TSchemaContractDict = None for item in items: for (table_name, parent_table), row in schema.normalize_data_item( item, load_id, root_table_name ): + if not schema_contract: + schema_contract = resolve_contract_settings_for_table(parent_table, table_name, schema) # filter row, may eliminate some or all fields row = schema.filter_row(table_name, row) # do not process empty rows - if row: - # decode pua types - for k, v in row.items(): - row[k] = custom_pua_decode(v) # type: ignore - # coerce row of values into schema table, generating partial table with new columns if any - row, partial_table = schema.coerce_row( - table_name, parent_table, row - ) - # theres a new table or new columns in existing table - if partial_table: - # update schema and save the change - schema.update_table(partial_table) - table_updates = schema_update.setdefault(table_name, []) - table_updates.append(partial_table) - # update our columns - column_schemas[table_name] = schema.get_table_columns( - table_name - ) - # get current columns schema - columns = column_schemas.get(table_name) - if not columns: - columns = schema.get_table_columns(table_name) - column_schemas[table_name] = columns - # store row - # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock - load_storage.write_data_item( - load_id, schema_name, table_name, row, columns + if not row: + continue + + # decode pua types + for k, v in row.items(): + row[k] = custom_pua_decode(v) # type: ignore + # coerce row of values into schema table, generating partial table with new columns if any + row, partial_table = schema.coerce_row( + table_name, parent_table, row + ) + + # if we detect a migration, check schema contract + if partial_table: + row, partial_table = Schema.apply_schema_contract(schema, schema_contract, table_name, row, partial_table) + if not row: + continue + + # theres a new table or new columns in existing table + if partial_table: + # update schema and save the change + schema.update_table(partial_table) + table_updates = schema_update.setdefault(table_name, []) + table_updates.append(partial_table) + # update our columns + column_schemas[table_name] = schema.get_table_columns( + table_name ) - # count total items - items_count += 1 - increase_row_count(row_counts, table_name, 1) + # get current columns schema + columns = column_schemas.get(table_name) + if not columns: + columns = schema.get_table_columns(table_name) + column_schemas[table_name] = columns + # store row + # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock + load_storage.write_data_item( + load_id, schema_name, table_name, row, columns + ) + # count total items + items_count += 1 + increase_row_count(row_counts, table_name, 1) signals.raise_if_signalled() return schema_update, items_count, row_counts diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 1005af0f1e..c2c9ea9fc8 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -863,20 +863,25 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para source_schema = source.schema source_schema.update_normalizers() + # discover the existing pipeline schema + 
pipeline_schema = self._schema_storage[source_schema.name] if source_schema.name in self._schema_storage else None + # extract into pipeline schema - extract_id = extract_with_schema(storage, source, source_schema, self.collector, max_parallel_items, workers) + source.schema.set_schema_contract(global_contract, True) + extract_id = extract_with_schema(storage, source, pipeline_schema, self.collector, max_parallel_items, workers) # save import with fully discovered schema self._schema_storage.save_import_schema_if_not_exists(source_schema) - # if source schema does not exist in the pipeline - if source_schema.name not in self._schema_storage: - # create new schema + # save schema if not present in store + if not pipeline_schema: self._schema_storage.save_schema(source_schema) + pipeline_schema = source_schema - # update pipeline schema (do contract checks here) - pipeline_schema = self._schema_storage[source_schema.name] + # update pipeline schema + print(source_schema.tables) pipeline_schema.update_schema(source_schema) + pipeline_schema.set_schema_contract(global_contract, True) # set as default if this is first schema in pipeline if not self.default_schema_name: diff --git a/tests/common/schema/test_contract_mode_functions.py b/tests/common/schema/test_schema_contract.py similarity index 99% rename from tests/common/schema/test_contract_mode_functions.py rename to tests/common/schema/test_schema_contract.py index 95f2d0e267..d635983367 100644 --- a/tests/common/schema/test_contract_mode_functions.py +++ b/tests/common/schema/test_schema_contract.py @@ -245,7 +245,7 @@ def test_check_adding_new_columns(base_settings) -> None: "column_2": 123 } data_with_new_row = { - **data, # type: ignore + **data, "new_column": "some string" } table_update: TTableSchema = { @@ -276,7 +276,7 @@ def test_check_adding_new_columns(base_settings) -> None: "column_2": 123, } data_with_new_row = { - **data, # type: ignore + **data, "incomplete_column_1": "some other string", } table_update = { @@ -312,7 +312,7 @@ def test_check_adding_new_variant() -> None: "column_2": 123 } data_with_new_row = { - **data, # type: ignore + **data, "column_2_variant": 345345 } table_update: TTableSchema = { diff --git a/tests/load/test_freeze_and_data_contract.py b/tests/load/test_schema_contracts.py similarity index 99% rename from tests/load/test_freeze_and_data_contract.py rename to tests/load/test_schema_contracts.py index 7b2beee84c..da2e1b2568 100644 --- a/tests/load/test_freeze_and_data_contract.py +++ b/tests/load/test_schema_contracts.py @@ -123,7 +123,7 @@ def source() -> DltResource: pipeline.run(source(), schema_contract=settings.get("override")) # check updated schema - assert pipeline.default_schema._settings.get("schema_contract", {}) == (settings.get("override") or settings.get("source")) + assert pipeline.default_schema._settings.get("schema_contract", None) == (settings.get("override") or settings.get("source")) # check items table settings assert pipeline.default_schema.tables["items"].get("schema_contract", {}) == (settings.get("override") or settings.get("resource") or {}) From 4a1fab0a02ee4cd7eb860b6499e79d1d886a79bb Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 17 Oct 2023 22:59:54 +0200 Subject: [PATCH 47/73] small fixes --- dlt/common/normalizers/json/relational.py | 2 +- dlt/common/schema/exceptions.py | 2 +- dlt/extract/extract.py | 3 +- dlt/normalize/items_normalizers.py | 2 +- dlt/pipeline/pipeline.py | 1 - tests/common/schema/test_schema_contract.py | 38 ++++++++++----------- 6 files changed, 24 
insertions(+), 24 deletions(-) diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index f90874db18..0c43dada06 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -264,7 +264,7 @@ def extend_schema(self) -> None: def extend_table(self, table_name: str) -> None: # if the table has a merge w_d, add propagation info to normalizer table = self.schema.tables.get(table_name) - if not table.get("parent") and table["write_disposition"] == "merge": + if not table.get("parent") and table.get("write_disposition") == "merge": DataItemNormalizer.update_normalizer_config(self.schema, {"propagation": { "tables": { table_name: { diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 9e1f8c5c8b..57c1d5c1df 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -77,7 +77,7 @@ def __init__(self, schema_name: str, table_name: str, msg: str) -> None: self.schema_name = schema_name self.table_name = table_name - + class UnknownTableException(SchemaException): def __init__(self, table_name: str) -> None: self.table_name = table_name diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index dfd2a43e6b..178d669b95 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -216,7 +216,7 @@ def _add_dynamic_table(self, resource: DltResource, data_item: TDataItem = None, # this is a new table so allow evolve once # TODO: is this the correct check for a new table, should a table with only incomplete columns be new too? - is_new_table = (self.pipeline_schema == None) or (table["name"] not in self.pipeline_schema.tables) or (not self.pipeline_schema.tables[table["name"]]["columns"]) + is_new_table = (self.pipeline_schema is None) or (table["name"] not in self.pipeline_schema.tables) or (not self.pipeline_schema.tables[table["name"]]["columns"]) if is_new_table: table["x-normalizer"] = {"evolve_once": True} # type: ignore[typeddict-unknown-key] @@ -263,6 +263,7 @@ def _write_static_table(self, resource: DltResource, table_name: str, items: TDa static_table["columns"] = arrow_columns static_table["name"] = table_name self.dynamic_tables[table_name] = [static_table] + self._write_item(table_name, resource.name, items) def extract( diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 526345bc89..ab83fe9c6b 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -65,7 +65,7 @@ def _normalize_chunk( # if we detect a migration, check schema contract if partial_table: - row, partial_table = Schema.apply_schema_contract(schema, schema_contract, table_name, row, partial_table) + row, partial_table = schema.apply_schema_contract(schema, schema_contract, table_name, row, partial_table) if not row: continue diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index c2c9ea9fc8..b9118fb559 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -879,7 +879,6 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para pipeline_schema = source_schema # update pipeline schema - print(source_schema.tables) pipeline_schema.update_schema(source_schema) pipeline_schema.set_schema_contract(global_contract, True) diff --git a/tests/common/schema/test_schema_contract.py b/tests/common/schema/test_schema_contract.py index d635983367..2c1623af32 100644 --- a/tests/common/schema/test_schema_contract.py +++ 
b/tests/common/schema/test_schema_contract.py @@ -225,12 +225,12 @@ def test_check_adding_table(base_settings) -> None: # # check adding new table # - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), "new_table", data, new_table) == (data, new_table) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), "new_table", data, new_table) == (None, None) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), "new_table", data, new_table) == (data, new_table) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), "new_table", data, new_table) == (None, None) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), "new_table", data, new_table) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), "new_table", data, new_table) @pytest.mark.parametrize("base_settings", base_settings) @@ -260,12 +260,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("new_column") - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), table_update) # @@ -292,12 +292,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("incomplete_column_1") # incomplete columns should be treated like new columns - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, 
**{"columns": "evolve"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) @@ -328,16 +328,16 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), 
copy.deepcopy(table_update)) # check interaction with new columns settings, variants are new columns.. with pytest.raises(SchemaFrozenException): - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file From 903f0000074accde95ea0569fb1b6c8413016732 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 18 Oct 2023 10:29:53 +0200 Subject: [PATCH 48/73] some cleanup --- dlt/common/schema/schema.py | 44 +++++++++++++-------- dlt/common/schema/utils.py | 1 - dlt/extract/extract.py | 22 +++++++---- dlt/normalize/items_normalizers.py | 2 +- tests/common/schema/test_schema_contract.py | 38 +++++++++--------- 5 files changed, 61 insertions(+), 46 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 262f12bb2c..f3f7ac04f2 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -196,7 +196,7 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial @staticmethod - def apply_schema_contract(schema: "Schema", contract_modes: TSchemaContractDict, table_name: str, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: + def apply_schema_contract(pipeline_schema: Optional["Schema"], contract_modes: TSchemaContractDict, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: """ Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out columns for both the data and the schema_update or reject the update completely, depending on the mode. 
An example settings could be: @@ -215,46 +215,49 @@ def apply_schema_contract(schema: "Schema", contract_modes: TSchemaContractDict, """ assert partial_table + table_name = partial_table["name"] # default settings allow all evolutions, skip all else if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: return row, partial_table - is_new_table = not schema or (table_name not in schema.tables) or (not schema.tables[table_name]["columns"]) + is_new_table = not pipeline_schema or pipeline_schema.is_new_table(table_name) # check case where we have a new table if is_new_table: if contract_modes["tables"] in ["discard_row", "discard_value"]: return None, None if contract_modes["tables"] == "freeze": - raise SchemaFrozenException(schema.name if schema else "", table_name, f"Trying to add table {table_name} but new tables are frozen.") + raise SchemaFrozenException(pipeline_schema.name if pipeline_schema else "", table_name, f"Trying to add table {table_name} but new tables are frozen.") - # in case we only check table creation in pipeline - if not row: + # iif there is no row data, we only check table modes + if not row or not pipeline_schema: return row, partial_table # if evolve once is set, allow all column changes - evolve_once = (table_name in schema.tables) and schema.tables[table_name].get("x-normalizer", {}).get("evolve_once", False) # type: ignore[attr-defined] + evolve_once = pipeline_schema.tables.get(table_name, {}).get("x-normalizer", {}).get("evolve_once", False) # type: ignore[attr-defined] if evolve_once: return row, partial_table # check columns - for item in list(row.keys()): + for column_name in list(row.keys()): # dlt cols may always be added - if item.startswith(schema._dlt_tables_prefix): + if column_name.startswith(pipeline_schema._dlt_tables_prefix): continue # if this is a new column for an existing table... 
- if not is_new_table and (item not in schema.tables[table_name]["columns"] or not utils.is_complete_column(schema.tables[table_name]["columns"][item])): - is_variant = (item in partial_table["columns"]) and partial_table["columns"][item].get("variant") + if not is_new_table and not utils.is_complete_column(pipeline_schema.tables[table_name]["columns"].get(column_name, {})): + is_variant = partial_table["columns"].get(column_name, {}).get("variant") if contract_modes["columns"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): - row.pop(item) - partial_table["columns"].pop(item) + row.pop(column_name) + partial_table["columns"].pop(column_name) elif contract_modes["columns"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): return None, None + # raise on variant columns frozen elif is_variant and contract_modes["data_type"] == "freeze": - raise SchemaFrozenException(schema.name, table_name, f"Trying to create new variant column {item} to table {table_name} data_types are frozen.") + raise SchemaFrozenException(pipeline_schema.name, table_name, f"Trying to create new variant column {column_name} to table {table_name} data_types are frozen.") + # raise on new columns frozen elif contract_modes["columns"] == "freeze": - raise SchemaFrozenException(schema.name, table_name, f"Trying to add column {item} to table {table_name} but columns are frozen.") + raise SchemaFrozenException(pipeline_schema.name, table_name, f"Trying to add column {column_name} to table {table_name} but columns are frozen.") return row, partial_table @@ -396,6 +399,10 @@ def dlt_tables(self) -> List[TTableSchema]: def get_preferred_type(self, col_name: str) -> Optional[TDataType]: return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None) + def is_new_table(self, table_name: str) -> bool: + """Returns true if this table is incomplete (has only incomplete columns) and therefore new""" + return (table_name not in self.tables) or (not [c for c in self.tables[table_name]["columns"].values() if utils.is_complete_column(c)]) + @property def version(self) -> int: """Version of the schema content that takes into account changes from the time of schema loading/creation. 
@@ -666,29 +673,32 @@ def _compile_settings(self) -> None: def __repr__(self) -> str: return f"Schema {self.name} at {id(self)}" -def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Schema, incoming_schema: Schema = None, incoming_table: TTableSchema = None) -> TSchemaContractDict: +def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Optional[Schema], incoming_schema: Schema = None, incoming_table: TTableSchema = None) -> TSchemaContractDict: """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" current_schema = current_schema or incoming_schema + # find settings and expand them to dict if needed def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: settings = settings or {} if isinstance(settings, str): settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) return cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings} if settings else {}) + # we have contract modes set on the incoming table definition (from the resource) if incoming_table and (incoming_table_contract_mode := resolve_single(incoming_table.get("schema_contract", {}))): return incoming_table_contract_mode - # find table settings + # find correct parent table table = parent_table or table_name if table in current_schema.tables: table = utils.get_top_level_table(current_schema.tables, parent_table or table_name)["name"] - # modes + # resolve existing contract modes current_table_contract_modes = resolve_single(current_schema.tables.get(table, {}).get("schema_contract", {})) current_schema_contract_modes = resolve_single(current_schema._settings.get("schema_contract", {})) + # if we have stuff defined on the incoming schema, this takes precedence if incoming_schema: if incoming_table_contract_mode := resolve_single(incoming_schema.tables.get(table, {}).get("schema_contract", {})): return incoming_table_contract_mode diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 26e307f3e3..2920ed6682 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -704,7 +704,6 @@ def new_column(column_name: str, data_type: TDataType = None, nullable: bool = T return column - def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: return None diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 178d669b95..5349b6f419 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -127,6 +127,7 @@ def __init__( self.extract_id = extract_id self.disallowed_tables: Set[str] = set() self.pipeline_schema = pipeline_schema + self.normalized_table_names: Dict[str, str] = {} @property def storage(self) -> ExtractorItemStorage: @@ -147,6 +148,14 @@ def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: return "puae-jsonl" return None # Empty list is unknown format + def normalize_table_name(self, table_name: str) -> str: + """Cache normalized table names""" + if normalized_name := self.normalized_table_names.get(table_name): + return normalized_name + normalized_name = self.schema.naming.normalize_table_identifier(table_name) + self.normalized_table_names[table_name] = normalized_name + return normalized_name + def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: if isinstance(meta, TableNameMeta): table_name = meta.table_name @@ -164,14 +173,13 @@ def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> No 
self._write_static_table(resource, table_name, items) def write_empty_file(self, table_name: str) -> None: - table_name = self.schema.naming.normalize_table_identifier(table_name) + table_name = self.normalize_table_name(table_name) self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) def _write_item(self, table_name: str, resource_name: str, items: TDataItems) -> None: # normalize table name before writing so the name match the name in schema - # note: normalize function should be cached so there's almost no penalty on frequent calling # note: column schema is not required for jsonl writer used here - table_name = self.schema.naming.normalize_identifier(table_name) + table_name = self.normalize_table_name(table_name) self.collector.update(table_name) self.resources_with_items.add(resource_name) self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, None) @@ -203,9 +211,8 @@ def _write_static_table(self, resource: DltResource, table_name: str, items: TDa def _add_dynamic_table(self, resource: DltResource, data_item: TDataItem = None, table_name: Optional[str] = None) -> bool: """ - Computes new table and does contract checks + Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written """ - # TODO: We have to normalize table identifiers here table = resource.compute_table_schema(data_item) if table_name: table["name"] = table_name @@ -215,15 +222,14 @@ def _add_dynamic_table(self, resource: DltResource, data_item: TDataItem = None, return False # this is a new table so allow evolve once - # TODO: is this the correct check for a new table, should a table with only incomplete columns be new too? - is_new_table = (self.pipeline_schema is None) or (table["name"] not in self.pipeline_schema.tables) or (not self.pipeline_schema.tables[table["name"]]["columns"]) + is_new_table = (self.pipeline_schema is None) or self.pipeline_schema.is_new_table(table["name"]) if is_new_table: table["x-normalizer"] = {"evolve_once": True} # type: ignore[typeddict-unknown-key] # apply schema contract and apply on pipeline schema # here we only check that table may be created schema_contract = resolve_contract_settings_for_table(None, table["name"], self.pipeline_schema, self.schema, table) - _, checked_table = Schema.apply_schema_contract(self.pipeline_schema, schema_contract, table["name"], None, table) + _, checked_table = Schema.apply_schema_contract(self.pipeline_schema, schema_contract, None, table) if not checked_table: self.disallowed_tables.add(table["name"]) diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index ab83fe9c6b..71b1ae6916 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -65,7 +65,7 @@ def _normalize_chunk( # if we detect a migration, check schema contract if partial_table: - row, partial_table = schema.apply_schema_contract(schema, schema_contract, table_name, row, partial_table) + row, partial_table = schema.apply_schema_contract(schema, schema_contract, row, partial_table) if not row: continue diff --git a/tests/common/schema/test_schema_contract.py b/tests/common/schema/test_schema_contract.py index 2c1623af32..90a943b72d 100644 --- a/tests/common/schema/test_schema_contract.py +++ b/tests/common/schema/test_schema_contract.py @@ -225,12 +225,12 @@ def test_check_adding_table(base_settings) -> None: # # check adding new table # - assert schema.apply_schema_contract(schema, 
cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), "new_table", data, new_table) == (data, new_table) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), "new_table", data, new_table) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), "new_table", data, new_table) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), data, new_table) == (data, new_table) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), data, new_table) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), data, new_table) == (None, None) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), "new_table", data, new_table) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), data, new_table) @pytest.mark.parametrize("base_settings", base_settings) @@ -260,12 +260,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("new_column") - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(data_with_new_row), table_update) # @@ -292,12 +292,12 @@ def test_check_adding_new_columns(base_settings) -> None: popped_table_update["columns"].pop("incomplete_column_1") # incomplete columns should be treated like new columns - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, 
**{"columns": "discard_row"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(data_with_new_row), table_update) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), "mixed_table", copy.deepcopy(data_with_new_row), table_update) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(data_with_new_row), table_update) @@ -328,16 +328,16 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) + schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) # check interaction with new columns settings, variants are new columns.. 
with pytest.raises(SchemaFrozenException): - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}), "tables", copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) + assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file From 912dd8b63a0153221580dbecc1f71e350acb45e1 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 18 Oct 2023 10:46:26 +0200 Subject: [PATCH 49/73] update docs --- docs/website/docs/general-usage/data-contracts.md | 4 ++++ docs/website/docs/getting-started.md | 2 +- docs/website/sidebars.js | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/website/docs/general-usage/data-contracts.md b/docs/website/docs/general-usage/data-contracts.md index cc6fd14f10..543edf2502 100644 --- a/docs/website/docs/general-usage/data-contracts.md +++ b/docs/website/docs/general-usage/data-contracts.md @@ -21,6 +21,8 @@ def items(): This resource will allow new subtables to be created, but will throw an exception if data is extracted for an existing table which contains a new column. +### Possible settings + The `schema_contract` exists on the `source` decorator as a directive for all resources of that source and on the `resource` decorator as a directive for the individual resource. Additionally it exists on the `pipeline.run()` method, which will override all existing settings. The `schema_contract` is a dictionary with keys that control the following: @@ -34,6 +36,8 @@ Each property can be set to one of three values: * `discard_row`: This will discard any extracted row if it does not adhere to the existing schema, and this row will not be loaded to the destination. All other rows will be. * `discard_value`: This will discard data in an extracted row that does not adhere to the existing schema and the row will be loaded without this data. +If a table is new and has not yet been created on the destination, dlt will allow the creation of all columns and variants on the first run. + ### Code Examples The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered.
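For orientation, a minimal sketch of how the settings described in this docs hunk can be combined; the pipeline name, the destination and the particular mode values are illustrative assumptions, not the example shipped in the docs file:

```python
import dlt

# illustrative contract: new tables may evolve, new columns on existing tables
# raise an exception, and rows that would introduce data type variants are dropped
@dlt.resource(schema_contract={"tables": "evolve", "columns": "freeze", "data_type": "discard_row"})
def items():
    yield {"id": 1, "name": "first item"}

# assumed local setup; a string shorthand on run() applies one mode to tables,
# columns and data_type alike and overrides the resource-level setting
pipeline = dlt.pipeline(pipeline_name="contracts_example", destination="duckdb")
pipeline.run(items(), schema_contract="discard_row")
```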
diff --git a/docs/website/docs/getting-started.md b/docs/website/docs/getting-started.md index fc321284ee..cd3f2cc69d 100644 --- a/docs/website/docs/getting-started.md +++ b/docs/website/docs/getting-started.md @@ -444,7 +444,7 @@ Each event type is sent to a different table in `duckdb`. import dlt from dlt.sources.helpers import requests -@dlt.resource(primary_key="id", table_name=lambda i: i["type"], write_disposition="append") # type: ignore +@dlt.resource(primary_key="id", table_name=lambda i: i["type"], write_disposition="append") def repo_events( last_created_at = dlt.sources.incremental("created_at") ): diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 92cf893790..341db3d17e 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -103,6 +103,7 @@ const sidebars = { 'general-usage/full-loading', 'general-usage/credentials', 'general-usage/schema', + 'general-usage/data-contracts', 'general-usage/configuration', 'reference/performance', { From 168f0da1fd14b0ed09626e8bc15aabe5f7a6fb83 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 2 Nov 2023 00:05:26 +0100 Subject: [PATCH 50/73] makes bumping version optional in Schema, preserves hashes on replace schema content --- dlt/common/schema/schema.py | 20 +++++++++++--------- dlt/common/schema/utils.py | 6 +++--- dlt/common/storages/live_schema_storage.py | 20 +++++++++++++------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 9bffe359e5..e76943423c 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -67,7 +67,7 @@ def __init__(self, name: str, normalizers: TNormalizersConfig = None) -> None: self._reset_schema(name, normalizers) @classmethod - def from_dict(cls, d: DictStrAny) -> "Schema": + def from_dict(cls, d: DictStrAny, bump_version: bool = True) -> "Schema": # upgrade engine if needed stored_schema = utils.migrate_schema(d, d["engine_version"], cls.ENGINE_VERSION) @@ -77,7 +77,8 @@ def from_dict(cls, d: DictStrAny) -> "Schema": stored_schema = utils.apply_defaults(stored_schema) # bump version if modified - utils.bump_version_if_modified(stored_schema) + if bump_version: + utils.bump_version_if_modified(stored_schema) return cls.from_stored_schema(stored_schema) @classmethod @@ -89,9 +90,10 @@ def from_stored_schema(cls, stored_schema: TStoredSchema) -> "Schema": def replace_schema_content(self, schema: "Schema") -> None: self._reset_schema(schema.name, schema._normalizers_config) - self._from_stored_schema(schema.to_dict()) + # do not bump version so hash from `schema` is preserved + self._from_stored_schema(schema.to_dict(bump_version=False)) - def to_dict(self, remove_defaults: bool = False) -> TStoredSchema: + def to_dict(self, remove_defaults: bool = False, bump_version: bool = True) -> TStoredSchema: stored_schema: TStoredSchema = { "version": self._stored_version, "version_hash": self._stored_version_hash, @@ -107,7 +109,8 @@ def to_dict(self, remove_defaults: bool = False) -> TStoredSchema: stored_schema["description"] = self._schema_description # bump version if modified - utils.bump_version_if_modified(stored_schema) + if bump_version: + utils.bump_version_if_modified(stored_schema) # remove defaults after bumping version if remove_defaults: utils.remove_defaults(stored_schema) @@ -304,9 +307,8 @@ def bump_version(self) -> Tuple[int, str]: Returns: Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple """ - version = 
utils.bump_version_if_modified(self.to_dict()) - self._stored_version, self._stored_version_hash = version - return version + self._stored_version, self._stored_version_hash, _ = utils.bump_version_if_modified(self.to_dict(bump_version=False)) + return self._stored_version, self._stored_version_hash def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: rv_row: DictStrAny = {} @@ -400,7 +402,7 @@ def get_preferred_type(self, col_name: str) -> Optional[TDataType]: return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None) def is_new_table(self, table_name: str) -> bool: - """Returns true if this table is incomplete (has only incomplete columns) and therefore new""" + """Returns true if this table does not exist OR is incomplete (has only incomplete columns) and therefore new""" return (table_name not in self.tables) or (not [c for c in self.tables[table_name]["columns"].values() if utils.is_complete_column(c)]) @property diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 82a4467a8f..38a857144d 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -134,8 +134,8 @@ def add_column_defaults(column: TColumnSchemaBase) -> TColumnSchema: # return copy(column) # type: ignore -def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str]: - # if any change to schema document is detected then bump version and write new hash +def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, str]: + """Bumps the `stored_schema` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" hash_ = generate_version_hash(stored_schema) previous_hash = stored_schema.get("version_hash") if not previous_hash: @@ -144,7 +144,7 @@ def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str]: elif hash_ != previous_hash: stored_schema["version"] += 1 stored_schema["version_hash"] = hash_ - return stored_schema["version"], hash_ + return stored_schema["version"], hash_, previous_hash def generate_version_hash(stored_schema: TStoredSchema) -> str: diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index c482d5e7ea..79aeb22e61 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, List from dlt.common.schema.schema import Schema from dlt.common.configuration.accessors import config @@ -18,7 +18,7 @@ def __getitem__(self, name: str) -> Schema: else: # return new schema instance schema = super().load_schema(name) - self._update_live_schema(schema) + self.update_live_schema(schema) return schema @@ -30,7 +30,7 @@ def load_schema(self, name: str) -> Schema: def save_schema(self, schema: Schema) -> str: rv = super().save_schema(schema) # update the live schema with schema being saved, if no live schema exist, create one to be available for a getter - self._update_live_schema(schema) + self.update_live_schema(schema) return rv def remove_schema(self, name: str) -> None: @@ -54,12 +54,18 @@ def commit_live_schema(self, name: str) -> Schema: self._save_schema(live_schema) return live_schema - def _update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: + def update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: + """Will update live schema content without writing to storage. 
Optionally allows to create a new live schema""" live_schema = self.live_schemas.get(schema.name) if live_schema: - # replace content without replacing instance - # print(f"live schema {live_schema} updated in place") - live_schema.replace_schema_content(schema) + if id(live_schema) != id(schema): + # replace content without replacing instance + # print(f"live schema {live_schema} updated in place") + live_schema.replace_schema_content(schema) elif can_create_new: # print(f"live schema {schema.name} created from schema") self.live_schemas[schema.name] = schema + + def list_schemas(self) -> List[str]: + names = list(set(super().list_schemas()) | set(self.live_schemas.keys())) + return names From 760cc4320c23f3580ccb70e9bcca6a2bbf61693d Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 2 Nov 2023 00:06:20 +0100 Subject: [PATCH 51/73] extracts on single pipeline schema --- dlt/extract/extract.py | 66 +++++++++++----------------- dlt/normalize/normalize.py | 4 +- dlt/pipeline/pipeline.py | 69 ++++++++++++++++-------------- tests/common/schema/test_schema.py | 10 ++++- tests/extract/test_extract.py | 4 +- tests/pipeline/test_pipeline.py | 5 ++- 6 files changed, 80 insertions(+), 78 deletions(-) diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 0681fae844..444a6df01f 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -117,8 +117,7 @@ def __init__( schema: Schema, resources_with_items: Set[str], dynamic_tables: TSchemaUpdate, - collector: Collector = NULL_COLLECTOR, - pipeline_schema: Schema = None + collector: Collector = NULL_COLLECTOR ) -> None: self._storage = storage self.schema = schema @@ -127,8 +126,6 @@ def __init__( self.resources_with_items = resources_with_items self.extract_id = extract_id self.disallowed_tables: Set[str] = set() - self.pipeline_schema = pipeline_schema - self.normalized_table_names: Dict[str, str] = {} @property def storage(self) -> ExtractorItemStorage: @@ -149,14 +146,6 @@ def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: return "puae-jsonl" return None # Empty list is unknown format - def normalize_table_name(self, table_name: str) -> str: - """Cache normalized table names""" - if normalized_name := self.normalized_table_names.get(table_name): - return normalized_name - normalized_name = self.schema.naming.normalize_table_identifier(table_name) - self.normalized_table_names[table_name] = normalized_name - return normalized_name - def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: if isinstance(meta, TableNameMeta): table_name = meta.table_name @@ -174,13 +163,13 @@ def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> No self._write_static_table(resource, table_name, items) def write_empty_file(self, table_name: str) -> None: - table_name = self.normalize_table_name(table_name) + table_name = self.schema.naming.normalize_table_identifier(table_name) self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: # normalize table name before writing so the name match the name in schema - # note: column schema is not required for jsonl writer used here - table_name = self.normalize_table_name(table_name) + # note: normalize_table_identifier is caching the normalization results + table_name = self.schema.naming.normalize_table_identifier(table_name) self.collector.update(table_name) 
self.resources_with_items.add(resource_name) self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) @@ -206,11 +195,11 @@ def _write_dynamic_table(self, resource: DltResource, item: TDataItem) -> None: def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: existing_table = self.dynamic_tables.get(table_name) if existing_table is None: - if not self._add_dynamic_table(resource, table_name=table_name): + if not self._add_dynamic_table(resource, items, table_name=table_name): return self._write_item(table_name, resource.name, items) - def _add_dynamic_table(self, resource: DltResource, data_item: TDataItem = None, table_name: Optional[str] = None) -> bool: + def _add_dynamic_table(self, resource: DltResource, data_item: TDataItems = None, table_name: Optional[str] = None) -> bool: """ Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written """ @@ -223,14 +212,14 @@ def _add_dynamic_table(self, resource: DltResource, data_item: TDataItem = None, return False # this is a new table so allow evolve once - is_new_table = (self.pipeline_schema is None) or self.pipeline_schema.is_new_table(table["name"]) + is_new_table = self.schema.is_new_table(table["name"]) if is_new_table: table["x-normalizer"] = {"evolve_once": True} # type: ignore[typeddict-unknown-key] # apply schema contract and apply on pipeline schema # here we only check that table may be created - schema_contract = resolve_contract_settings_for_table(None, table["name"], self.pipeline_schema, self.schema, table) - _, checked_table = Schema.apply_schema_contract(self.pipeline_schema, schema_contract, None, table) + schema_contract = resolve_contract_settings_for_table(None, table["name"], self.schema) + _, checked_table = Schema.apply_schema_contract(self.schema, schema_contract, None, table) if not checked_table: self.disallowed_tables.add(table["name"]) @@ -280,20 +269,19 @@ def _write_item(self, table_name: str, resource_name: str, items: TDataItems, co def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: existing_table = self.dynamic_tables.get(table_name) - if existing_table is not None: - return - static_table = resource.compute_table_schema() - if isinstance(items, list): - item = items[0] - else: - item = items - # Merge the columns to include primary_key and other hints that may be set on the resource - arrow_columns = pyarrow.py_arrow_to_table_schema_columns(item.schema) - for key, value in static_table["columns"].items(): - arrow_columns[key] = utils.merge_columns(value, arrow_columns.get(key, {})) - static_table["columns"] = arrow_columns - static_table["name"] = table_name - self.dynamic_tables[table_name] = [self.schema.normalize_table_identifiers(static_table)] + if existing_table is None: + static_table = resource.compute_table_schema() + if isinstance(items, list): + item = items[0] + else: + item = items + # Merge the columns to include primary_key and other hints that may be set on the resource + arrow_columns = pyarrow.py_arrow_to_table_schema_columns(item.schema) + for key, value in static_table["columns"].items(): + arrow_columns[key] = utils.merge_columns(value, arrow_columns.get(key, {})) + static_table["columns"] = arrow_columns + static_table["name"] = table_name + self.dynamic_tables[table_name] = [self.schema.normalize_table_identifiers(static_table)] self._write_item(table_name, resource.name, items) @@ -303,7 +291,6 
@@ def extract( storage: ExtractorStorage, collector: Collector = NULL_COLLECTOR, *, - pipeline_schema: Schema = None, max_parallel_items: int = None, workers: int = None, futures_poll_interval: float = None @@ -313,10 +300,10 @@ def extract( resources_with_items: Set[str] = set() extractors: Dict[TLoaderFileFormat, Extractor] = { "puae-jsonl": JsonLExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector, pipeline_schema=pipeline_schema + extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector ), "arrow": ArrowExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector, pipeline_schema=pipeline_schema + extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector ) } last_item_format: Optional[TLoaderFileFormat] = None @@ -353,7 +340,7 @@ def extract( for table in tables_by_resources[resource.name]: # we only need to write empty files for the top tables if not table.get("parent", None): - extractors[last_item_format or "puae-jsonl"].write_empty_file(table["name"]) + extractors["puae-jsonl"].write_empty_file(table["name"]) if left_gens > 0: # go to 100% @@ -369,7 +356,6 @@ def extract( def extract_with_schema( storage: ExtractorStorage, source: DltSource, - pipeline_schema: Schema, collector: Collector, max_parallel_items: int, workers: int, @@ -384,7 +370,7 @@ def extract_with_schema( with contextlib.suppress(DataItemRequiredForDynamicTableHints): if resource.write_disposition == "replace": reset_resource_state(resource.name) - extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers, pipeline_schema=pipeline_schema) + extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) # iterate over all items in the pipeline and update the schema if dynamic table hints were present for _, partials in extractor.items(): for partial in partials: diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index a063d980f8..92b0a8bef3 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -12,11 +12,9 @@ from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.schema.typing import TStoredSchema from dlt.common.schema.utils import merge_schema_updates -from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration -from dlt.common.typing import TDataItem -from dlt.common.schema import TSchemaUpdate, Schema, utils +from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import NormalizeInfo from dlt.common.utils import chunks, TRowCount, merge_row_count, increase_row_count diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 9bf201eccc..c5ebd5a619 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -83,6 +83,9 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # refresh live schemas in storage or import schema path self._schema_storage.commit_live_schema(name) rv = f(self, *args, **kwargs) + # save modified live schemas + for name in self._schema_storage.live_schemas: + self._schema_storage.commit_live_schema(name) # refresh list of schemas if any new 
schemas are added self.schema_names = self._schema_storage.list_schemas() return rv @@ -279,12 +282,12 @@ def extract( try: with self._maybe_destination_capabilities(): # extract all sources - for source in self._data_to_sources(data, schema, table_name, parent_table_name, write_disposition, columns, primary_key): + for source in self._data_to_sources(data, schema, table_name, parent_table_name, write_disposition, columns, primary_key, schema_contract): if source.exhausted: raise SourceExhausted(source.name) # TODO: merge infos for all the sources extract_ids.append( - self._extract_source(storage, source, max_parallel_items, workers, schema_contract) + self._extract_source(storage, source, max_parallel_items, workers) ) # commit extract ids # TODO: if we fail here we should probably wipe out the whole extract folder @@ -534,6 +537,7 @@ def sync_destination(self, destination: TDestinationReferenceArg = None, staging # on merge schemas are replaced so we delete all old versions self._schema_storage.clear_storage() for schema in restored_schemas: + print("RESTORE SCHEMA?") self._schema_storage.save_schema(schema) # if the remote state is present then unset first run if remote_state is not None: @@ -795,7 +799,8 @@ def _data_to_sources(self, parent_table_name: str = None, write_disposition: TWriteDisposition = None, columns: TAnySchemaColumns = None, - primary_key: TColumnNames = None + primary_key: TColumnNames = None, + schema_contract: TSchemaContract = None ) -> List[DltSource]: def apply_hint_args(resource: DltResource) -> None: @@ -806,10 +811,15 @@ def apply_hint_args(resource: DltResource) -> None: def choose_schema() -> Schema: """Except of explicitly passed schema, use a clone that will get discarded if extraction fails""" if schema: - return schema - if self.default_schema_name: - return self.default_schema.clone() - return self._make_schema_with_default_name() + schema_ = schema + elif self.default_schema_name: + schema_ = self.default_schema.clone() + else: + schema_ = self._make_schema_with_default_name() + # apply schema contract settings + if schema_contract: + schema_.set_schema_contract(schema_contract, update_table_settings=True) + return schema_ effective_schema = choose_schema() @@ -859,34 +869,30 @@ def append_data(data_item: Any) -> None: return sources - def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int, global_contract: TSchemaContract) -> str: - # discover the schema from source - source_schema = source.schema - source_schema.update_normalizers() - + def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int) -> str: # discover the existing pipeline schema - pipeline_schema = self._schema_storage[source_schema.name] if source_schema.name in self._schema_storage else None + if source.schema.name in self.schemas: + # use clone until extraction complete + pipeline_schema = self.schemas[source.schema.name].clone() + # apply all changes in the source schema to pipeline schema + # NOTE: we do not apply contracts to changes done programmatically + pipeline_schema.update_schema(source.schema) + # replace schema in the source + source.schema = pipeline_schema # extract into pipeline schema - source.schema.set_schema_contract(global_contract, True) - extract_id = extract_with_schema(storage, source, pipeline_schema, self.collector, max_parallel_items, workers) + extract_id = extract_with_schema(storage, source, self.collector, max_parallel_items, workers) # save 
import with fully discovered schema - self._schema_storage.save_import_schema_if_not_exists(source_schema) - - # save schema if not present in store - if not pipeline_schema: - self._schema_storage.save_schema(source_schema) - pipeline_schema = source_schema + self._schema_storage.save_import_schema_if_not_exists(source.schema) - # update pipeline schema - pipeline_schema.update_schema(source_schema) - pipeline_schema.set_schema_contract(global_contract, True) + # update live schema but not update the store yet + self._schema_storage.update_live_schema(source.schema) # set as default if this is first schema in pipeline if not self.default_schema_name: # this performs additional validations as schema contains the naming module - self._set_default_schema_name(pipeline_schema) + self._set_default_schema_name(source.schema) return extract_id @@ -1200,11 +1206,12 @@ def managed_state(self, *, extract_state: bool = False) -> Iterator[TPipelineSta # restore original pipeline props self._state_to_props(backup_state) # synchronize schema storage with initial list of schemas, note that we'll not be able to synchronize the schema content - if self._schema_storage: - # TODO: we should restore schemas backup here - for existing_schema_name in self._schema_storage.list_schemas(): - if existing_schema_name not in self.schema_names: - self._schema_storage.remove_schema(existing_schema_name) + # NOTE: not needed - schemas are not saved and are kept as live until with_schema_sync ends + # if self._schema_storage: + # # TODO: we should restore schemas backup here + # for existing_schema_name in self._schema_storage.list_schemas(): + # if existing_schema_name not in self.schema_names: + # self._schema_storage.remove_schema(existing_schema_name) # raise original exception raise else: @@ -1265,7 +1272,7 @@ def _extract_state(self, state: TPipelineState) -> TPipelineState: # note: the schema will be persisted because the schema saving decorator is over the state manager decorator for extract state_source = DltSource(self.default_schema.name, self.pipeline_name, self.default_schema, [state_resource(state)]) storage = ExtractorStorage(self._normalize_storage_config) - extract_id = extract_with_schema(storage, state_source, self.default_schema, _NULL_COLLECTOR, 1, 1) + extract_id = extract_with_schema(storage, state_source, _NULL_COLLECTOR, 1, 1) storage.commit_extract_files(extract_id) return state diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 61c6f922b7..f5f406a7a1 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -203,13 +203,21 @@ def test_replace_schema_content() -> None: eth_v5: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v5") eth_v5["imported_version_hash"] = "IMP_HASH" schema_eth = Schema.from_dict(eth_v5) # type: ignore[arg-type] - schema_eth.bump_version() schema.replace_schema_content(schema_eth) assert schema_eth.stored_version_hash == schema.stored_version_hash assert schema_eth.version == schema.version assert schema_eth.version_hash == schema.version_hash assert schema_eth._imported_version_hash == schema._imported_version_hash + # replace content of modified schema + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + assert schema_eth.version_hash != schema_eth.stored_version_hash + # replace content does not bump version + schema = Schema("simple") + schema.replace_schema_content(schema_eth) + assert 
schema.version_hash != schema.stored_version_hash + @pytest.mark.parametrize("columns,hint,value", [ (["_dlt_id", "_dlt_root_id", "_dlt_load_id", "_dlt_parent_id", "_dlt_list_idx"], "nullable", False), diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 75673180e7..ad3b3f4678 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -18,7 +18,7 @@ def expect_tables(resource: DltResource) -> dlt.Schema: storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage, pipeline_schema=Schema("some_schema")) + schema_update = extract(extract_id, source, storage) # odd and even tables assert len(schema_update) == 2 assert "odd_table" in schema_update @@ -42,7 +42,7 @@ def expect_tables(resource: DltResource) -> dlt.Schema: source = source.with_resources(resource.name) source.selected_resources[resource.name].bind(10).select_tables("odd_table") extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage, pipeline_schema=Schema("some_schema")) + schema_update = extract(extract_id, source, storage) assert len(schema_update) == 1 assert "odd_table" in schema_update for partials in schema_update.values(): diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 917dac75c4..42de866962 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -297,7 +297,8 @@ def i_fail(): s4 = DltSource("default_4", "module", dlt.Schema("default_4"), [dlt.resource([6, 7, 8], name="resource_3"), i_fail]) with pytest.raises(PipelineStepFailed): - p.extract([s3, s4]) + # NOTE: if you swap s3 and s4 the test on list_schemas will fail: s3 will extract normally and update live schemas, s4 will break exec later + p.extract([s4, s3]) # nothing to normalize assert len(storage.list_files_to_normalize_sorted()) == 0 @@ -670,6 +671,8 @@ def resource_1(): assert p.default_schema.get_table("resource_1")["write_disposition"] == "append" p.run(resource_1, write_disposition="replace") + print(list(p._schema_storage.live_schemas.values())[0].to_pretty_yaml()) + assert p.schemas[p.default_schema_name].get_table("resource_1")["write_disposition"] == "replace" assert p.default_schema.get_table("resource_1")["write_disposition"] == "replace" From b645927a6c700966275eb071279703e510a20d34 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:43:46 +0100 Subject: [PATCH 52/73] allows to control relational normalizer descend with send --- dlt/common/normalizers/json/__init__.py | 4 +- dlt/common/normalizers/json/relational.py | 6 ++- .../normalizers/test_json_relational.py | 53 +++++++++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index 07ffd52e2e..9a845e6608 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -1,5 +1,5 @@ import abc -from typing import Any, Generic, Type, Iterator, Tuple, Callable, Protocol, TYPE_CHECKING, TypeVar +from typing import Any, Generic, Type, Generator, Tuple, Protocol, TYPE_CHECKING, TypeVar from dlt.common.typing import DictStrAny, TDataItem, StrAny if TYPE_CHECKING: @@ -10,7 +10,7 @@ # type definitions for json normalization function # iterator of form ((table_name, parent_table), dict) must be returned from normalization function -TNormalizedRowIterator = Iterator[Tuple[Tuple[str, str], StrAny]] 
+TNormalizedRowIterator = Generator[Tuple[Tuple[str, str], StrAny], bool, None] # type var for data item normalizer config TNormalizerConfig = TypeVar("TNormalizerConfig", bound=Any) diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 4355b2b2d5..c9ce5a9d25 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -48,6 +48,8 @@ class DataItemNormalizer(DataItemNormalizerBase[RelationalNormalizerConfig]): _skip_primary_key: Dict[str, bool] def __init__(self, schema: Schema) -> None: + """This item normalizer works with nested dictionaries. It flattens dictionaries and descends into lists. + It yields row dictionaries at each nesting level.""" self.schema = schema self._reset() @@ -230,7 +232,9 @@ def _normalize_row( extend.update(self._get_propagated_values(table, flattened_row, _r_lvl )) # yield parent table first - yield (table, schema.naming.shorten_fragments(*parent_path)), flattened_row + should_descend = yield (table, schema.naming.shorten_fragments(*parent_path)), flattened_row + if should_descend is False: + return # normalize and yield lists for list_path, list_content in lists.items(): diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 7169044117..91b5a93466 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -321,6 +321,59 @@ def test_list_position(norm: RelationalNormalizer) -> None: # print(rows) +def test_control_descending(norm: RelationalNormalizer) -> None: + row: StrAny = { + "f": [{ + "l": ["a", "b", "c"], + "v": 120, + "lo": [[{"e": "a"}, {"e": "b"}, {"e":"c"}]] + }], + "g": "val" + } + + # break at first row + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + # won't yield anything else + with pytest.raises(StopIteration): + rows_gen.send(False) + + # prevent yielding descendants of "f" but yield all else + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + (table, _), _ = rows_gen.send(True) + assert table == "table__f" + # won't yield anything else + with pytest.raises(StopIteration): + rows_gen.send(False) + + # descend into "l" + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + rows_gen.send(True) + (table, _), one_row = rows_gen.send(True) + assert table == "table__f__l" + assert one_row["value"] == "a" + # get next element in the list - even with sending False - we do not descend + (table, _), one_row = rows_gen.send(False) + assert table == "table__f__l" + assert one_row["value"] == "b" + + # prevent descending into list of lists + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + rows_gen.send(True) + # yield "l" + next(rows_gen) + next(rows_gen) + next(rows_gen) + (table, _), one_row = rows_gen.send(True) + assert table == "table__f__lo" + # do not descend into lists + with pytest.raises(StopIteration): + rows_gen.send(False) + + def test_list_in_list() -> None: chats = { "_dlt_id": "123456", From 8989a754f4cde3d55d0fc9f38fdacd072121ce19 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:45:01 +0100 Subject: [PATCH 53/73] refactors data contract apply to generate filters instead of actual filtering --- dlt/common/schema/schema.py | 169 ++++++++--------- dlt/common/schema/typing.py | 1 + tests/common/schema/test_schema_contract.py | 200 ++++++++------------ 3 files 
changed, 164 insertions(+), 206 deletions(-) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index e76943423c..73b10a6996 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -10,7 +10,7 @@ from dlt.common.normalizers.json import DataItemNormalizer, TNormalizedRowIterator from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType -from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, +from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaContractEntities, TSchemaEvolutionMode, TSchemaSettings, TSimpleRegex, TStoredSchema, TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractDict, TSchemaContract) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) @@ -198,12 +198,18 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial - @staticmethod - def apply_schema_contract(pipeline_schema: Optional["Schema"], contract_modes: TSchemaContractDict, row: DictStrAny, partial_table: TPartialTableSchema) -> Tuple[DictStrAny, TPartialTableSchema]: + def apply_schema_contract( + self, + schema_contract: TSchemaContractDict, + partial_table: TPartialTableSchema, + raise_on_freeze: bool = True + ) -> Tuple[TPartialTableSchema, List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]]]: """ - Checks if contract mode allows for the requested changes to the data and the schema. It will allow all changes to pass, filter out the row filter out - columns for both the data and the schema_update or reject the update completely, depending on the mode. An example settings could be: + Checks if `schema_contract` allows for the `partial_table` to update the schema. It applies the contract dropping + the affected columns or the whole `partial_table`. It generates and returns a set of filters that should be applied to incoming data in order to modify it + so it conforms to the contract. + Example `schema_contract`: { "tables": "freeze", "columns": "evolve", @@ -215,54 +221,82 @@ def apply_schema_contract(pipeline_schema: Optional["Schema"], contract_modes: T * freeze: allow no change and fail the load * discard_row: allow no schema change and filter out the row * discard_value: allow no schema change and filter out the value but load the rest of the row - """ - assert partial_table - table_name = partial_table["name"] + Returns a tuple where a first element is modified partial table and the second is a list of filters. The modified partial may be None in case the + whole table is not allowed. + Each filter is a tuple of (table|columns, entity name, freeze | discard_row | discard_value). 
+ Note: by default `freeze` immediately raises SchemaFrozenException which is convenient in most use cases + """ # default settings allow all evolutions, skip all else - if contract_modes == DEFAULT_SCHEMA_CONTRACT_MODE: - return row, partial_table + if schema_contract == DEFAULT_SCHEMA_CONTRACT_MODE: + return partial_table, [] - is_new_table = not pipeline_schema or pipeline_schema.is_new_table(table_name) + assert partial_table + table_name = partial_table["name"] + existing_table: TTableSchema = self._schema_tables.get(table_name, None) + # table is new when not yet exist or + is_new_table = not existing_table or self.is_new_table(table_name) # check case where we have a new table - if is_new_table: - if contract_modes["tables"] in ["discard_row", "discard_value"]: - return None, None - if contract_modes["tables"] == "freeze": - raise SchemaFrozenException(pipeline_schema.name if pipeline_schema else "", table_name, f"Trying to add table {table_name} but new tables are frozen.") - - # iif there is no row data, we only check table modes - if not row or not pipeline_schema: - return row, partial_table - - # if evolve once is set, allow all column changes - evolve_once = pipeline_schema.tables.get(table_name, {}).get("x-normalizer", {}).get("evolve_once", False) # type: ignore[attr-defined] - if evolve_once: - return row, partial_table - - # check columns - for column_name in list(row.keys()): + if is_new_table and schema_contract["tables"] != "evolve": + if raise_on_freeze and schema_contract["tables"] == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to add table {table_name} but new tables are frozen.") + # filter tables with name below + return None, [("tables", table_name, schema_contract["tables"])] + + column_mode, data_mode = schema_contract["columns"], schema_contract["data_type"] + # allow to add new columns when table is new or if columns are allowed to evolve once + if is_new_table or existing_table.get("x-normalizer", {}).get("evolve-columns-once", False): # type: ignore[attr-defined] + column_mode = "evolve" + + # check if we should filter any columns, partial table below contains only new columns + filters: List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]] = [] + for column_name, column in list(partial_table["columns"].items()): # dlt cols may always be added - if column_name.startswith(pipeline_schema._dlt_tables_prefix): + if column_name.startswith(self._dlt_tables_prefix): continue - # if this is a new column for an existing table... 
- if not is_new_table and not utils.is_complete_column(pipeline_schema.tables[table_name]["columns"].get(column_name, {})): - is_variant = partial_table["columns"].get(column_name, {}).get("variant") - if contract_modes["columns"] == "discard_value" or (is_variant and contract_modes["data_type"] == "discard_value"): - row.pop(column_name) - partial_table["columns"].pop(column_name) - elif contract_modes["columns"] == "discard_row" or (is_variant and contract_modes["data_type"] == "discard_row"): - return None, None - # raise on variant columns frozen - elif is_variant and contract_modes["data_type"] == "freeze": - raise SchemaFrozenException(pipeline_schema.name, table_name, f"Trying to create new variant column {column_name} to table {table_name} data_types are frozen.") - # raise on new columns frozen - elif contract_modes["columns"] == "freeze": - raise SchemaFrozenException(pipeline_schema.name, table_name, f"Trying to add column {column_name} to table {table_name} but columns are frozen.") - - return row, partial_table + is_variant = column.get("variant", False) + # new column and contract prohibits that + if column_mode != "evolve" and not is_variant: + if raise_on_freeze and column_mode == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to add column {column_name} to table {table_name} but columns are frozen.") + # filter column with name below + filters.append(("columns", column_name, column_mode)) + # pop the column + partial_table["columns"].pop(column_name) + + # variant (data type evolution) and contract prohibits that + if data_mode != "evolve" and is_variant: + if raise_on_freeze and data_mode == "freeze": + raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {column_name} to table {table_name} but data_types are frozen.") + # filter column with name below + filters.append(("columns", column_name, data_mode)) + # pop the column + partial_table["columns"].pop(column_name) + + return partial_table, filters + + @staticmethod + def expand_schema_contract_settings(settings: TSchemaContract) -> TSchemaContractDict: + """Expand partial or shorthand settings into full settings dictionary""" + if isinstance(settings, str): + settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) + return cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings}) + + def resolve_contract_settings_for_table(self, table_name: str) -> TSchemaContractDict: + """Resolve the exact applicable schema contract settings for the table `table_name`.""" + + settings: TSchemaContract = {} + # find root table + try: + table = utils.get_top_level_table(self._schema_tables, table_name) + settings = table["schema_contract"] + except KeyError: + settings = self._settings.get("schema_contract", {}) + + # expand settings, empty settings will expand into default settings + return Schema.expand_schema_contract_settings(settings) def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: table_name = partial_table["name"] @@ -472,14 +506,11 @@ def update_normalizers(self) -> None: normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) - def set_schema_contract(self, settings: TSchemaContract, update_table_settings: bool = False) -> None: + def set_schema_contract(self, settings: TSchemaContract) -> None: if not settings: - return - self._settings["schema_contract"] = settings - if update_table_settings: - for table in 
self.tables.values(): - if not table.get("parent"): - table["schema_contract"] = settings + self._settings.pop("schema_contract", None) + else: + self._settings["schema_contract"] = settings def add_type_detection(self, detection: TTypeDetections) -> None: """Add type auto detection to the schema.""" @@ -686,37 +717,3 @@ def _compile_settings(self) -> None: def __repr__(self) -> str: return f"Schema {self.name} at {id(self)}" - -def resolve_contract_settings_for_table(parent_table: str, table_name: str, current_schema: Optional[Schema], incoming_schema: Schema = None, incoming_table: TTableSchema = None) -> TSchemaContractDict: - """Resolve the exact applicable schema contract settings for the table during the normalization stage.""" - - current_schema = current_schema or incoming_schema - - # find settings and expand them to dict if needed - def resolve_single(settings: TSchemaContract) -> TSchemaContractDict: - settings = settings or {} - if isinstance(settings, str): - settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) - return cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings} if settings else {}) - - # we have contract modes set on the incoming table definition (from the resource) - if incoming_table and (incoming_table_contract_mode := resolve_single(incoming_table.get("schema_contract", {}))): - return incoming_table_contract_mode - - # find correct parent table - table = parent_table or table_name - if table in current_schema.tables: - table = utils.get_top_level_table(current_schema.tables, parent_table or table_name)["name"] - - # resolve existing contract modes - current_table_contract_modes = resolve_single(current_schema.tables.get(table, {}).get("schema_contract", {})) - current_schema_contract_modes = resolve_single(current_schema._settings.get("schema_contract", {})) - - # if we have stuff defined on the incoming schema, this takes precedence - if incoming_schema: - if incoming_table_contract_mode := resolve_single(incoming_schema.tables.get(table, {}).get("schema_contract", {})): - return incoming_table_contract_mode - if not current_table_contract_modes and (incoming_schema_contract_modes := resolve_single(incoming_schema._settings.get("schema_contract", {}))): - return incoming_schema_contract_modes - - return current_table_contract_modes or current_schema_contract_modes or DEFAULT_SCHEMA_CONTRACT_MODE \ No newline at end of file diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index ebcac1579d..f896032e6f 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -72,6 +72,7 @@ class TColumnSchema(TColumnSchemaBase, total=False): SIMPLE_REGEX_PREFIX = "re:" TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] +TSchemaContractEntities = Literal["tables", "columns", "data_type"] class TSchemaContractDict(TypedDict, total=False): """TypedDict defining the schema update settings""" diff --git a/tests/common/schema/test_schema_contract.py b/tests/common/schema/test_schema_contract.py index 90a943b72d..160aca9fd9 100644 --- a/tests/common/schema/test_schema_contract.py +++ b/tests/common/schema/test_schema_contract.py @@ -4,7 +4,6 @@ import copy from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE, TSchemaContractDict -from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema.typing import TTableSchema @@ -55,6 +54,12 @@ def get_schema() 
-> Schema: "columns": {**incomplete_columns, **columns} })) + s.update_table(cast(TTableSchema, { + "name": "evolve_once_table", + "x-normalizer": {"evolve-columns-once": True}, + "columns": {**incomplete_columns, **columns} + })) + return s @@ -62,18 +67,18 @@ def test_resolve_contract_settings() -> None: # defaults schema = get_schema() - assert resolve_contract_settings_for_table(None, "tables", schema) == DEFAULT_SCHEMA_CONTRACT_MODE - assert resolve_contract_settings_for_table("tables", "child_table", schema) == DEFAULT_SCHEMA_CONTRACT_MODE + assert schema.resolve_contract_settings_for_table("tables") == DEFAULT_SCHEMA_CONTRACT_MODE + assert schema.resolve_contract_settings_for_table("child_table") == DEFAULT_SCHEMA_CONTRACT_MODE # table specific full setting schema = get_schema() schema.tables["tables"]["schema_contract"] = "freeze" - assert resolve_contract_settings_for_table(None, "tables", schema) == { + assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "freeze", "data_type": "freeze" } - assert resolve_contract_settings_for_table("tables", "child_table", schema) == { + assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "freeze", "data_type": "freeze" @@ -85,12 +90,12 @@ def test_resolve_contract_settings() -> None: "tables": "freeze", "columns": "discard_value", } - assert resolve_contract_settings_for_table(None, "tables", schema) == { + assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" } - assert resolve_contract_settings_for_table("tables", "child_table", schema) == { + assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" @@ -99,12 +104,12 @@ def test_resolve_contract_settings() -> None: # schema specific full setting schema = get_schema() schema._settings["schema_contract"] = "freeze" - assert resolve_contract_settings_for_table(None, "tables", schema) == { + assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "freeze", "data_type": "freeze" } - assert resolve_contract_settings_for_table("tables", "child_table", schema) == { + assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "freeze", "data_type": "freeze" @@ -116,12 +121,12 @@ def test_resolve_contract_settings() -> None: "tables": "freeze", "columns": "discard_value", } - assert resolve_contract_settings_for_table(None, "tables", schema) == { + assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" } - assert resolve_contract_settings_for_table("tables", "child_table", schema) == { + assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "discard_value", "data_type": "evolve" @@ -134,68 +139,24 @@ def test_resolve_contract_settings() -> None: "tables": "evolve", "columns": "discard_value", } - assert resolve_contract_settings_for_table(None, "tables", schema) == { + assert schema.resolve_contract_settings_for_table("tables") == { "tables": "evolve", "columns": "discard_value", "data_type": "evolve" } - assert resolve_contract_settings_for_table("tables", "child_table", schema) == { + assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "evolve", "columns": "discard_value", "data_type": "evolve" } - # current and 
incoming schema - current_schema = get_schema() - current_schema._settings["schema_contract"] = "discard_value" - incoming_schema = get_schema() - incoming_schema._settings["schema_contract"] = "discard_row" - incoming_table: TTableSchema = {"name": "incomplete_table", "schema_contract": "freeze"} - - - # incoming schema overrides - assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema) == { - "tables": "discard_row", - "columns": "discard_row", - "data_type": "discard_row" - } - - # direct incoming table overrides - assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema, incoming_table) == { - "tables": "freeze", - "columns": "freeze", - "data_type": "freeze" - } - - # table defined on existing schema overrided incoming schema setting - current_schema.tables["tables"]["schema_contract"] = "discard_value" - assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema) == { - "tables": "discard_value", - "columns": "discard_value", - "data_type": "discard_value" - } - - # but table on incoming schema overrides again - incoming_schema.tables["tables"]["schema_contract"] = "discard_row" - assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema) == { - "tables": "discard_row", - "columns": "discard_row", - "data_type": "discard_row" - } - - # incoming table still overrides all - assert resolve_contract_settings_for_table(None, "tables", current_schema, incoming_schema, incoming_table) == { - "tables": "freeze", - "columns": "freeze", - "data_type": "freeze" - } # ensure other settings do not interfere with the main setting we are testing base_settings = [{ "tables": "evolve", "columns": "evolve", "data_type": "evolve" - },{ + }, { "tables": "discard_row", "columns": "discard_row", "data_type": "discard_row" @@ -215,39 +176,49 @@ def test_resolve_contract_settings() -> None: def test_check_adding_table(base_settings) -> None: schema = get_schema() - data = { - "column_1": "some string", - "column_2": 123 - } new_table = copy.deepcopy(schema.tables["tables"]) new_table["name"] = "new_table" # # check adding new table # - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), data, new_table) == (data, new_table) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), data, new_table) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), data, new_table) == (None, None) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), new_table) + assert (partial, filters) == (new_table, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), new_table) + assert (partial, filters) == (None, [("tables", "new_table", "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), new_table) + assert (partial, filters) == (None, [("tables", "new_table", "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, raise_on_freeze=False) + assert (partial, filters) == (None, [("tables", "new_table", "freeze")]) with pytest.raises(SchemaFrozenException): - 
schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), data, new_table) + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table) @pytest.mark.parametrize("base_settings", base_settings) def test_check_adding_new_columns(base_settings) -> None: schema = get_schema() + + def assert_new_column(table_update: TTableSchema, column_name: str) -> None: + popped_table_update = copy.deepcopy(table_update) + popped_table_update["columns"].pop(column_name) + + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "freeze")]) + + with pytest.raises(SchemaFrozenException): + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update)) + # # check adding new column # - data = { - "column_1": "some string", - "column_2": 123 - } - data_with_new_row = { - **data, - "new_column": "some string" - } table_update: TTableSchema = { "name": "tables", "columns": { @@ -257,48 +228,38 @@ def test_check_adding_new_columns(base_settings) -> None: } } } - popped_table_update = copy.deepcopy(table_update) - popped_table_update["columns"].pop("new_column") - - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) - - with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(data_with_new_row), table_update) - + assert_new_column(table_update, "new_column") # # check adding new column if target column is not complete # - data = { - "column_1": "some string", - "column_2": 123, - } - data_with_new_row = { - **data, - "incomplete_column_1": "some other string", - } table_update = { "name": "mixed_table", "columns": { "incomplete_column_1": { "name": "incomplete_column_1", - "data_type": "text" } } } - popped_table_update = copy.deepcopy(table_update) - popped_table_update["columns"].pop("incomplete_column_1") - - # incomplete columns should be treated like new columns - assert schema.apply_schema_contract(schema, 
cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(data_with_new_row), table_update) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(data_with_new_row), table_update) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(data_with_new_row), table_update) == (data, popped_table_update) - - with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(data_with_new_row), table_update) + assert_new_column(table_update, "incomplete_column_1") + # + # check x-normalize evolve_once behaving as evolve override + # + table_update = { + "name": "evolve_once_table", + "columns": { + "new_column": { + "name": "new_column", + "data_type": "text" + }, + "incomplete_column_1": { + "name": "incomplete_column_1", + } + } + } + partial, filters = schema.apply_schema_contract(base_settings, copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) def test_check_adding_new_variant() -> None: @@ -307,14 +268,6 @@ def test_check_adding_new_variant() -> None: # # check adding new variant column # - data = { - "column_1": "some string", - "column_2": 123 - } - data_with_new_row = { - **data, - "column_2_variant": 345345 - } table_update: TTableSchema = { "name": "tables", "columns": { @@ -328,16 +281,23 @@ def test_check_adding_new_variant() -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "freeze")]) with pytest.raises(SchemaFrozenException): - 
schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) + schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) - # check interaction with new columns settings, variants are new columns.. - with pytest.raises(SchemaFrozenException): - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data_with_new_row, table_update) + # variants are not new columns - new data types + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_row"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (None, None) - assert schema.apply_schema_contract(schema, cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "discard_value"}}), copy.deepcopy(data_with_new_row), copy.deepcopy(table_update)) == (data, popped_table_update) \ No newline at end of file + # evolve once does not apply to variant evolution + table_update["name"] = "evolve_once_table" + with pytest.raises(SchemaFrozenException): + schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) From 57842cc5b1fa08a1becea48e7a84a70b387e7d6c Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:46:00 +0100 Subject: [PATCH 54/73] detects if bytes string possibly contains pue characters --- dlt/common/json/__init__.py | 5 +++++ tests/common/test_json.py | 14 +++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/dlt/common/json/__init__.py b/dlt/common/json/__init__.py index cdcc609d03..deceaaf033 100644 --- a/dlt/common/json/__init__.py +++ b/dlt/common/json/__init__.py @@ -181,6 +181,11 @@ def custom_pua_remove(obj: Any) -> Any: return obj +def may_have_pua(line: bytes) -> bool: + """Checks if bytes string contains pua marker""" + return b'\xef\x80' in line + + # pick the right impl json: SupportsJson = None if os.environ.get("DLT_USE_JSON") == "simplejson": diff --git a/tests/common/test_json.py b/tests/common/test_json.py index 983484d326..f6e9b06425 100644 --- a/tests/common/test_json.py +++ b/tests/common/test_json.py @@ -6,7 +6,7 @@ from dlt.common import json, Decimal, pendulum from dlt.common.arithmetics import numeric_default_context -from dlt.common.json import _DECIMAL, _WEI, custom_pua_decode, _orjson, _simplejson, SupportsJson, _DATETIME +from dlt.common.json import _DECIMAL, _WEI, custom_pua_decode, may_have_pua, _orjson, _simplejson, SupportsJson, _DATETIME from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED, JSON_TYPED_DICT_NESTED, JSON_TYPED_DICT_NESTED_DECODED @@ -250,6 +250,18 @@ def test_json_typed_encode(json_impl: SupportsJson) -> None: assert d_d == JSON_TYPED_DICT_DECODED +@pytest.mark.parametrize("json_impl", _JSON_IMPL) +def test_pua_detection(json_impl: 
SupportsJson) -> None: + with io.BytesIO() as b: + json_impl.typed_dump(JSON_TYPED_DICT, b) + content_b = b.getvalue() + assert may_have_pua(content_b) + with open(json_case_path("rasa_event_bot_metadata"), "rb") as f: + content_b = f.read() + assert not may_have_pua(content_b) + + + def test_load_and_compare_all_impls() -> None: with open(json_case_path("rasa_event_bot_metadata"), "rb") as f: content_b = f.read() From 441299fa77868c9899dadeee7262ba1de027d927 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:47:40 +0100 Subject: [PATCH 55/73] applies schema contracts in item normalizer, uses binary stream, detects pue to skip decoding --- dlt/normalize/items_normalizers.py | 192 ++++++++++++++++++++--------- dlt/normalize/normalize.py | 13 +- 2 files changed, 135 insertions(+), 70 deletions(-) diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index be959560b4..ce2a14477a 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -1,21 +1,20 @@ import os -from typing import List, Dict, Tuple, Protocol, Any -from pathlib import Path +from typing import List, Dict, Set, Tuple, Any from abc import abstractmethod from dlt.common import json, logger -from dlt.common.json import custom_pua_decode +from dlt.common.json import custom_pua_decode, may_have_pua from dlt.common.runtime import signals -from dlt.common.schema.typing import TTableSchemaColumns, TSchemaContractDict +from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict from dlt.common.storages import NormalizeStorage, LoadStorage, FileStorage -from dlt.common.typing import TDataItem +from dlt.common.typing import DictStrAny, TDataItem from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.utils import TRowCount, merge_row_count, increase_row_count -from dlt.common.schema.schema import resolve_contract_settings_for_table -from dlt.normalize.configuration import NormalizeConfiguration from dlt.common.exceptions import MissingDependencyException from dlt.common.normalizers.utils import generate_dlt_ids +from dlt.normalize.configuration import NormalizeConfiguration + try: from dlt.common.libs import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa @@ -45,66 +44,136 @@ def __call__(self, extracted_items_file: str, root_table_name: str) -> Tuple[Lis class JsonLItemsNormalizer(ItemsNormalizer): - def _normalize_chunk(self, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: - column_schemas: Dict[ - str, TTableSchemaColumns - ] = {} # quick access to column schema for writers below + def __init__( + self, + load_storage: LoadStorage, + normalize_storage: NormalizeStorage, + schema: Schema, + load_id: str, + config: NormalizeConfiguration + ) -> None: + super().__init__(load_storage, normalize_storage, schema, load_id, config) + self._table_contracts: Dict[str, TSchemaContractDict] = {} + self._filtered_tables: Set[str] = set() + self._filtered_tables_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] = {} + # quick access to column schema for writers below + self._column_schemas: Dict[str, TTableSchemaColumns] = {} + + def _filter_columns(self, filtered_columns: Dict[str, TSchemaEvolutionMode], row: DictStrAny) -> DictStrAny: + for name, mode in filtered_columns.items(): + if name in row: + if mode == "discard_row": + return None + elif mode == "discard_value": + row.pop(name) + return row + + def _normalize_chunk(self, root_table_name: str, items: 
List[TDataItem], may_have_pua: bool) -> Tuple[TSchemaUpdate, int, TRowCount]: + column_schemas = self._column_schemas schema_update: TSchemaUpdate = {} schema = self.schema schema_name = schema.name items_count = 0 row_counts: TRowCount = {} - schema_contract: TSchemaContractDict = None + normalize_data_fun = self.schema.normalize_data_item for item in items: - for (table_name, parent_table), row in self.schema.normalize_data_item( - item, self.load_id, root_table_name - ): - if not schema_contract: - schema_contract = resolve_contract_settings_for_table(parent_table, table_name, schema) - # filter row, may eliminate some or all fields - row = schema.filter_row(table_name, row) - # do not process empty rows - if not row: - continue - - # decode pua types - for k, v in row.items(): - row[k] = custom_pua_decode(v) # type: ignore - # coerce row of values into schema table, generating partial table with new columns if any - row, partial_table = schema.coerce_row( - table_name, parent_table, row - ) + items_gen = normalize_data_fun(item, self.load_id, root_table_name) + try: + should_descend: bool = None + # use send to prevent descending into child rows when row was discarded + while row_info := items_gen.send(should_descend): + should_descend = True + (table_name, parent_table), row = row_info - # if we detect a migration, check schema contract - if partial_table: - row, partial_table = schema.apply_schema_contract(schema, schema_contract, row, partial_table) - if not row: - continue - - # theres a new table or new columns in existing table - if partial_table: - # update schema and save the change - schema.update_table(partial_table) - table_updates = schema_update.setdefault(table_name, []) - table_updates.append(partial_table) - # update our columns - column_schemas[table_name] = schema.get_table_columns( - table_name + # rows belonging to filtered out tables are skipped + if table_name in self._filtered_tables: + # stop descending into further rows + should_descend = False + continue + + # filter row, may eliminate some or all fields + row = schema.filter_row(table_name, row) + # do not process empty rows + if not row: + should_descend = False + continue + + # filter columns or full rows if schema contract said so + # do it before schema inference in `coerce_row` to not trigger costly migration code + filtered_columns = self._filtered_tables_columns.get(table_name, None) + if filtered_columns: + row = self._filter_columns(filtered_columns, row) # type: ignore[arg-type] + # if whole row got dropped + if not row: + should_descend = False + continue + + # decode pua types + if may_have_pua: + for k, v in row.items(): + row[k] = custom_pua_decode(v) # type: ignore + + # coerce row of values into schema table, generating partial table with new columns if any + row, partial_table = schema.coerce_row( + table_name, parent_table, row ) - # get current columns schema - columns = column_schemas.get(table_name) - if not columns: - columns = schema.get_table_columns(table_name) - column_schemas[table_name] = columns - # store row - # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock - self.load_storage.write_data_item( - self.load_id, schema_name, table_name, row, columns - ) - # count total items - items_count += 1 - increase_row_count(row_counts, table_name, 1) + + # if we detect a migration, check schema contract + if partial_table: + schema_contract = self._table_contracts.setdefault( + table_name, + 
schema.resolve_contract_settings_for_table(parent_table or table_name) # parent_table, if present, exists in the schema + ) + partial_table, filters = schema.apply_schema_contract(schema_contract, partial_table) + if filters: + for entity, name, mode in filters: + if entity == "tables": + self._filtered_tables.add(name) + elif entity == "columns": + filtered_columns = self._filtered_tables_columns.setdefault(table_name, {}) + filtered_columns[name] = mode + + if partial_table is None: + # discard migration and row + should_descend = False + continue + # theres a new table or new columns in existing table + # update schema and save the change + schema.update_table(partial_table) + table_updates = schema_update.setdefault(table_name, []) + table_updates.append(partial_table) + + # update our columns + column_schemas[table_name] = schema.get_table_columns( + table_name + ) + + # apply new filters + if filtered_columns and filters: + row = self._filter_columns(filtered_columns, row) + # do not continue if new filters skipped the full row + if not row: + should_descend = False + continue + + # get current columns schema + columns = column_schemas.get(table_name) + if not columns: + columns = schema.get_table_columns(table_name) + column_schemas[table_name] = columns + # store row + # TODO: store all rows for particular items all together after item is fully completed + # will be useful if we implement bad data sending to a table + self.load_storage.write_data_item( + self.load_id, schema_name, table_name, row, columns + ) + # count total items + # TODO: take counts and bytes from buffered file writers instead of taking those here + items_count += 1 + increase_row_count(row_counts, table_name, 1) + except StopIteration: + pass signals.raise_if_signalled() return schema_update, items_count, row_counts @@ -115,12 +184,13 @@ def __call__( ) -> Tuple[List[TSchemaUpdate], int, TRowCount]: schema_updates: List[TSchemaUpdate] = [] row_counts: TRowCount = {} - with self.normalize_storage.storage.open_file(extracted_items_file) as f: + with self.normalize_storage.storage.open_file(extracted_items_file, "rb") as f: # enumerate jsonl file line by line items_count = 0 + line: bytes for line_no, line in enumerate(f): - items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = self._normalize_chunk(root_table_name, items) + items: List[TDataItem] = json.loadb(line) + partial_update, items_count, r_counts = self._normalize_chunk(root_table_name, items, may_have_pua(line)) schema_updates.append(partial_update) merge_row_count(row_counts, r_counts) logger.debug( diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 92b0a8bef3..87bbac651c 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -247,17 +247,12 @@ def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files schema = Normalize.load_or_create_schema(self.schema_storage, schema_name) # process files in parallel or in single thread, depending on map_f schema_updates, row_counts = map_f(schema, load_id, files) - # set all populated tables to populated - needs_schema_save = len(schema_updates) > 0 # remove normalizer specific info for table in schema.tables.values(): - if table.pop("x-normalizer", None): # type: ignore[typeddict-item] - needs_schema_save = True - # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) - if needs_schema_save: - logger.info(f"Saving schema {schema_name} with version 
{schema.version}, writing manifest files") - # schema is updated, save it to schema volume - self.schema_storage.save_schema(schema) + table.pop("x-normalizer", None) # type: ignore[typeddict-item] + logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") + # schema is updated, save it to schema volume + self.schema_storage.save_schema(schema) # save schema to temp load folder self.load_storage.save_temp_schema(schema, load_id) # save schema updates even if empty From fc0eb47f06be3fdf06e554d50a2334dbe9b2db1d Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:48:22 +0100 Subject: [PATCH 56/73] methods to remove and rename arrow columns, need arrow 12+ --- dlt/common/libs/pyarrow.py | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index fb2f5c2e72..7dce062c54 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -140,23 +140,39 @@ def _get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: def remove_null_columns(item: TAnyArrowItem) -> TAnyArrowItem: - """Remove all columns of datatype pyarrow.null() from the table or record batch - """ + """Remove all columns of datatype pyarrow.null() from the table or record batch""" + return remove_columns(item, [field.name for field in item.schema if pyarrow.types.is_null(field.type)]) + + +def remove_columns(item: TAnyArrowItem, columns: Sequence[str]) -> TAnyArrowItem: + """Remove `columns` from Arrow `item`""" + if not columns: + return item + if isinstance(item, pyarrow.Table): - return item.drop([field.name for field in item.schema if pyarrow.types.is_null(field.type)]) + return item.drop(columns) elif isinstance(item, pyarrow.RecordBatch): - null_idx = [i for i, col in enumerate(item.columns) if pyarrow.types.is_null(col.type)] - new_schema = item.schema - for i in reversed(null_idx): - new_schema = new_schema.remove(i) - return pyarrow.RecordBatch.from_arrays( - [col for i, col in enumerate(item.columns) if i not in null_idx], - schema=new_schema - ) + # NOTE: select is available in pyarrow 12 an up + return item.select([n for n in item.schema.names if n not in columns]) # reverse selection else: raise ValueError(item) +def rename_columns(item: TAnyArrowItem, new_column_names: Sequence[str]) -> TAnyArrowItem: + """Rename arrow columns on Table or RecordBatch, returns same data but with renamed schema""" + + if list(item.schema.names) == list(new_column_names): + # No need to rename + return item + + if isinstance(item, pyarrow.Table): + return item.rename_columns(new_column_names) + elif isinstance(item, pyarrow.RecordBatch): + new_fields = [field.with_name(new_name) for new_name, field in zip(new_column_names, item.schema)] + return pyarrow.RecordBatch.from_arrays(item.columns, schema=pyarrow.schema(new_fields)) + else: + raise TypeError(f"Unsupported data item type {type(item)}") + def py_arrow_to_table_schema_columns(schema: pyarrow.Schema) -> TTableSchemaColumns: """Convert a PyArrow schema to a table schema columns dict. 
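A minimal usage sketch of the two column helpers added in the patch above, assuming pyarrow >= 12 (as the in-code note requires) and a small in-memory table; the helper names come from this patch, everything else is illustrative:

    import pyarrow as pa
    from dlt.common.libs.pyarrow import remove_columns, rename_columns

    # a toy table with one all-null column
    table = pa.table({"a": [1, 2], "b": ["x", "y"], "unused": pa.array([None, None], type=pa.null())})
    table = remove_columns(table, ["unused"])       # drops the listed columns; returns the item unchanged for an empty list
    table = rename_columns(table, ["id", "value"])  # rewrites only the schema, the data buffers are untouched
    assert table.column_names == ["id", "value"]

For RecordBatch inputs the same helpers avoid a Table round-trip: removal selects the remaining field names (hence the pyarrow 12 requirement noted in the diff) and renaming rebuilds the schema via `field.with_name`.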
From 99772277797caa433d2d2122c1e212e812a9d91d Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:49:53 +0100 Subject: [PATCH 57/73] implements contracts in extract, fixes issues in apply hints, arrow data filtering still missing --- dlt/extract/exceptions.py | 6 + dlt/extract/extract.py | 246 ++++++++++++++-------------- dlt/extract/incremental/__init__.py | 20 ++- dlt/extract/schema.py | 21 +-- tests/extract/test_extract.py | 32 ++-- tests/extract/test_incremental.py | 19 ++- tests/extract/test_sources.py | 3 +- 7 files changed, 177 insertions(+), 170 deletions(-) diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index e540a2468f..1ca876a1f0 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -212,6 +212,12 @@ def __init__(self, resource_name: str) -> None: One of table hints for that resource (typically table name) is a function and hint is computed separately for each instance of data extracted from that resource.""") +class NameNormalizationClash(DltResourceException): + def __init__(self, resource_name: str, reason: str) -> None: + msg = f"Column name clash after input data normalization. {reason}" + super().__init__(resource_name, msg) + + class SourceDataIsNone(DltSourceException): def __init__(self, source_name: str) -> None: self.source_name = source_name diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 444a6df01f..778499dcd5 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -1,28 +1,28 @@ import contextlib +from copy import copy import os -from typing import ClassVar, List, Set, Dict, Type, Any, Sequence, Optional, Set -from collections import defaultdict +from typing import ClassVar, Set, Dict, Any, Optional, Set from dlt.common.configuration.container import Container from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.libs.pyarrow import TAnyArrowItem from dlt.common.pipeline import reset_resource_state from dlt.common.data_writers import TLoaderFileFormat from dlt.common.exceptions import MissingDependencyException from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.utils import uniq_id -from dlt.common.typing import TDataItems, TDataItem -from dlt.common.schema import Schema, utils, TSchemaUpdate -from dlt.common.schema.typing import TColumnSchema, TTableSchemaColumns +from dlt.common.utils import uniq_id, update_dict_nested +from dlt.common.typing import StrStr, TDataItems, TDataItem +from dlt.common.schema import Schema, utils +from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage from dlt.common.configuration.specs import known_sections from dlt.common.schema.typing import TPartialTableSchema -from dlt.common.schema.schema import resolve_contract_settings_for_table from dlt.extract.decorators import SourceSchemaInjectableContext -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints +from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, NameNormalizationClash from dlt.extract.pipe import PipeIterator from dlt.extract.source import DltResource, DltSource from dlt.extract.typing import TableNameMeta @@ -109,23 +109,22 @@ def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, 
schema_n class Extractor: file_format: TLoaderFileFormat - dynamic_tables: TSchemaUpdate def __init__( self, extract_id: str, storage: ExtractorStorage, schema: Schema, resources_with_items: Set[str], - dynamic_tables: TSchemaUpdate, collector: Collector = NULL_COLLECTOR ) -> None: - self._storage = storage self.schema = schema - self.dynamic_tables = dynamic_tables self.collector = collector self.resources_with_items = resources_with_items self.extract_id = extract_id - self.disallowed_tables: Set[str] = set() + self._table_contracts: Dict[str, TSchemaContractDict] = {} + self._filtered_tables: Set[str] = set() + self._filtered_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] + self._storage = storage @property def storage(self) -> ExtractorItemStorage: @@ -146,87 +145,91 @@ def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: return "puae-jsonl" return None # Empty list is unknown format - def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + """Write `items` to `resource` optionally computing table schemas and revalidating/filtering data""" if isinstance(meta, TableNameMeta): - table_name = meta.table_name - self._write_static_table(resource, table_name, items) + # write item belonging to table with static name + self._write_to_static_table(resource, meta.table_name, items) else: if resource._table_name_hint_fun: - if isinstance(items, list): - for item in items: - self._write_dynamic_table(resource, item) - else: - self._write_dynamic_table(resource, items) + # table has name or other hints depending on data items + self._write_to_dynamic_table(resource, items) else: # write item belonging to table with static name - table_name = resource.table_name # type: ignore[assignment] - self._write_static_table(resource, table_name, items) + self._write_to_static_table(resource, resource.table_name, items) # type: ignore[arg-type] def write_empty_file(self, table_name: str) -> None: table_name = self.schema.naming.normalize_table_identifier(table_name) self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - # normalize table name before writing so the name match the name in schema - # note: normalize_table_identifier is caching the normalization results - table_name = self.schema.naming.normalize_table_identifier(table_name) - self.collector.update(table_name) + new_rows_count = self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) + self.collector.update(table_name, inc=new_rows_count) self.resources_with_items.add(resource_name) - self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) - - def _write_dynamic_table(self, resource: DltResource, item: TDataItem) -> None: - table_name = resource._table_name_hint_fun(item) - existing_table = self.dynamic_tables.get(table_name) - if existing_table is None: - if not self._add_dynamic_table(resource, data_item=item): - return - else: - # quick check if deep table merge is required - if resource._table_has_other_dynamic_hints: - new_table = resource.compute_table_schema(item) - # this merges into existing table in place - utils.merge_tables(existing_table[0], new_table) - else: - # if there are no other dynamic hints besides name then we just leave the existing partial table - pass - # write to 
storage with inferred table name - self._write_item(table_name, resource.name, item) - - def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: - existing_table = self.dynamic_tables.get(table_name) - if existing_table is None: - if not self._add_dynamic_table(resource, items, table_name=table_name): - return - self._write_item(table_name, resource.name, items) - - def _add_dynamic_table(self, resource: DltResource, data_item: TDataItems = None, table_name: Optional[str] = None) -> bool: + + def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> None: + if not isinstance(items, list): + items = [items] + for item in items: + table_name = self.schema.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) + if table_name in self._filtered_tables: + continue + if table_name not in self._table_contracts or resource._table_has_other_dynamic_hints: + self._compute_and_update_table(resource, table_name, item) + # write to storage with inferred table name + if table_name not in self._filtered_tables: + self._write_item(table_name, resource.name, item) + + def _write_to_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: + table_name = self.schema.naming.normalize_table_identifier(table_name) + if table_name not in self._table_contracts: + self._compute_and_update_table(resource, table_name, items) + if table_name not in self._filtered_tables: + self._write_item(table_name, resource.name, items) + + def _compute_table(self, resource: DltResource, data_item: TDataItem) -> TTableSchema: + """Computes a schema for a new or dynamic table and normalizes identifiers""" + return self.schema.normalize_table_identifiers( + resource.compute_table_schema(data_item) + ) + + def _compute_and_update_table(self, resource: DltResource, table_name: str, data_item: TDataItem) -> None: """ Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written """ - table = resource.compute_table_schema(data_item) - if table_name: - table["name"] = table_name - - # fast exit if we already evaluated this - if table["name"] in self.disallowed_tables: - return False + computed_table = self._compute_table(resource, data_item) + # overwrite table name (if coming from meta) + computed_table["name"] = table_name + # get or compute contract + schema_contract = self._table_contracts.setdefault( + table_name, + self.schema.resolve_contract_settings_for_table(table_name) + ) # this is a new table so allow evolve once - is_new_table = self.schema.is_new_table(table["name"]) - if is_new_table: - table["x-normalizer"] = {"evolve_once": True} # type: ignore[typeddict-unknown-key] + if schema_contract["columns"] != "evolve" and self.schema.is_new_table(table_name): + computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: ignore[typeddict-unknown-key] + existing_table = self.schema._schema_tables.get(table_name, None) + if existing_table: + diff_table = utils.merge_tables(existing_table, computed_table) + else: + diff_table = computed_table - # apply schema contract and apply on pipeline schema - # here we only check that table may be created - schema_contract = resolve_contract_settings_for_table(None, table["name"], self.schema) - _, checked_table = Schema.apply_schema_contract(self.schema, schema_contract, None, table) + # apply contracts + diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table) - if not 
checked_table: - self.disallowed_tables.add(table["name"]) - return False + # merge with schema table + if diff_table: + self.schema.update_table(diff_table) - self.dynamic_tables[checked_table["name"]] = [checked_table] - return True + # process filters + if filters: + for entity, name, mode in filters: + if entity == "tables": + self._filtered_tables.add(name) + elif entity == "columns": + filtered_columns = self._filtered_columns.setdefault(table_name, {}) + filtered_columns[name] = mode class JsonLExtractor(Extractor): @@ -236,54 +239,57 @@ class JsonLExtractor(Extractor): class ArrowExtractor(Extractor): file_format = "arrow" - def _rename_columns(self, items: List[TDataItem], new_column_names: List[str]) -> List[TDataItem]: - """Rename arrow columns to normalized schema column names""" - if not items: - return items - if items[0].schema.names == new_column_names: - # No need to rename - return items - if isinstance(items[0], pyarrow.pyarrow.Table): - return [item.rename_columns(new_column_names) for item in items] - elif isinstance(items[0], pyarrow.pyarrow.RecordBatch): - # Convert the batches to table -> rename -> then back to batches - return pa.Table.from_batches(items).rename_columns(new_column_names).to_batches() # type: ignore[no-any-return] - else: - raise TypeError(f"Unsupported data item type {type(items[0])}") - def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: items = [ + # 3. remove columns and rows in data contract filters # 2. Remove null-type columns from the table(s) as they can't be loaded - pyarrow.remove_null_columns(tbl) for tbl in ( + self._apply_contract_filters(pyarrow.remove_null_columns(tbl)) for tbl in ( # 1. Convert pandas frame(s) to arrow Table - pyarrow.pyarrow.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item + pa.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item for item in (items if isinstance(items, list) else [items]) ) ] - super().write_table(resource, items, meta) + super().write_items(resource, items, meta) + + def _apply_contract_filters(self, item: TAnyArrowItem) -> TAnyArrowItem: + # convert arrow schema names into normalized names + # find matching columns and delete by original name + return item + + def _get_normalized_arrow_fields(self, resource_name: str, item: TAnyArrowItem) -> StrStr: + """Normalizes schema field names and returns mapping from original to normalized name. 
Raises on name clashes""" + norm_f = self.schema.naming.normalize_identifier + name_mapping = {n.name: norm_f(n.name) for n in item.schema} + # verify if names uniquely normalize + normalized_names = set(name_mapping.values()) + if len(name_mapping) != len(normalized_names): + raise NameNormalizationClash(resource_name, f"Arrow schema fields normalized from {list(name_mapping.keys())} to {list(normalized_names)}") + return name_mapping def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: # Note: `items` is always a list here due to the conversion in `write_table` - new_columns = list(self.dynamic_tables[table_name][0]["columns"].keys()) - super()._write_item(table_name, resource_name, self._rename_columns(items, new_columns), self.dynamic_tables[table_name][0]["columns"]) - - def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: - existing_table = self.dynamic_tables.get(table_name) - if existing_table is None: - static_table = resource.compute_table_schema() - if isinstance(items, list): - item = items[0] - else: - item = items - # Merge the columns to include primary_key and other hints that may be set on the resource - arrow_columns = pyarrow.py_arrow_to_table_schema_columns(item.schema) - for key, value in static_table["columns"].items(): - arrow_columns[key] = utils.merge_columns(value, arrow_columns.get(key, {})) - static_table["columns"] = arrow_columns - static_table["name"] = table_name - self.dynamic_tables[table_name] = [self.schema.normalize_table_identifiers(static_table)] - self._write_item(table_name, resource.name, items) + items = [pyarrow.rename_columns( + item, + list(self._get_normalized_arrow_fields(resource_name, item).values()) + ) + for item in items] + super()._write_item(table_name, resource_name, items, self.schema.tables[table_name]["columns"]) + + def _compute_table(self, resource: DltResource, data_item: TDataItem) -> TPartialTableSchema: + data_item = data_item[0] + computed_table = super()._compute_table(resource, data_item) + # Merge the columns to include primary_key and other hints that may be set on the resource + arrow_table = copy(computed_table) + arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(data_item.schema) + # normalize arrow table before merging + arrow_table = self.schema.normalize_table_identifiers(arrow_table) + # we must override the columns to preserve the order in arrow table + arrow_table["columns"] = update_dict_nested(arrow_table["columns"], computed_table["columns"]) + + return arrow_table def extract( extract_id: str, @@ -294,16 +300,15 @@ def extract( max_parallel_items: int = None, workers: int = None, futures_poll_interval: float = None -) -> TSchemaUpdate: - dynamic_tables: TSchemaUpdate = {} +) -> None: schema = source.schema resources_with_items: Set[str] = set() extractors: Dict[TLoaderFileFormat, Extractor] = { "puae-jsonl": JsonLExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, collector=collector ), "arrow": ArrowExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, collector=collector ) } last_item_format: Optional[TLoaderFileFormat] = None @@ -326,7 +331,7 @@ def extract( resource = source.resources[pipe_item.pipe.name] # Fallback to last item's format or default (puae-jsonl) if the current 
item is an empty list item_format = Extractor.item_format(pipe_item.item) or last_item_format or "puae-jsonl" - extractors[item_format].write_table(resource, pipe_item.item, pipe_item.meta) + extractors[item_format].write_items(resource, pipe_item.item, pipe_item.meta) last_item_format = item_format # find defined resources that did not yield any pipeitems and create empty jobs for them @@ -349,9 +354,6 @@ def extract( # flush all buffered writers storage.close_writers(extract_id) - # returns set of partial tables - return dynamic_tables - def extract_with_schema( storage: ExtractorStorage, @@ -370,10 +372,6 @@ def extract_with_schema( with contextlib.suppress(DataItemRequiredForDynamicTableHints): if resource.write_disposition == "replace": reset_resource_state(resource.name) - extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) - # iterate over all items in the pipeline and update the schema if dynamic table hints were present - for _, partials in extractor.items(): - for partial in partials: - source.schema.update_table(source.schema.normalize_table_identifiers(partial)) + extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) return extract_id diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 652adc19d2..ed374aeacb 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -1,8 +1,7 @@ import os -from typing import Generic, TypeVar, Any, Optional, Callable, List, TypedDict, get_args, get_origin, Sequence, Type, Dict +from typing import Generic, ClassVar, Any, Optional, get_args, get_origin, Type, Dict import inspect -from functools import wraps, partial -from datetime import datetime # noqa: I251 +from functools import wraps try: import pandas as pd @@ -12,8 +11,7 @@ import dlt from dlt.common.exceptions import MissingDependencyException from dlt.common import pendulum, logger -from dlt.common.json import json -from dlt.common.jsonpath import compile_path, find_values, JSONPath +from dlt.common.jsonpath import compile_path from dlt.common.typing import TDataItem, TDataItems, TFun, extract_inner_type, get_generic_type_argument_from_instance, is_optional_type from dlt.common.schema.typing import TColumnNames from dlt.common.configuration import configspec, ConfigurationValueError @@ -22,12 +20,12 @@ from dlt.common.utils import digest128 from dlt.common.data_types.type_helpers import coerce_from_date_types, coerce_value, py_type_to_sc_type -from dlt.extract.exceptions import IncrementalUnboundError, PipeException -from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing +from dlt.extract.exceptions import IncrementalUnboundError +from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing from dlt.extract.incremental.typing import IncrementalColumnState, TCursorValue, LastValueFunc from dlt.extract.pipe import Pipe from dlt.extract.utils import resolve_column_value -from dlt.extract.typing import SupportsPipe, TTableHintTemplate, MapItem, YieldMapItem, FilterItem, ItemTransform +from dlt.extract.typing import SupportsPipe, TTableHintTemplate, ItemTransform from dlt.extract.incremental.transform import JsonIncremental, ArrowIncremental, IncrementalTransform try: from dlt.common.libs.pyarrow import is_arrow_item, pyarrow as pa, TAnyArrowItem @@ -73,11 +71,15 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, 
Generic[TCursorVa The values passed explicitly to Incremental will be ignored. Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded """ + # this is config/dataclass so declare members cursor_path: str = None # TODO: Support typevar here initial_value: Optional[Any] = None end_value: Optional[Any] = None + # incremental acting as empty + EMPTY: ClassVar["Incremental[Any]"] = None + def __init__( self, cursor_path: str = dlt.config.value, @@ -340,6 +342,8 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: return [item for item in (self._transform_item(transformer, row) for row in rows) if item is not None] return self._transform_item(transformer, rows) +Incremental.EMPTY = Incremental[Any]("") + class IncrementalResourceWrapper(ItemTransform[TDataItem]): _incremental: Optional[Incremental[Any]] = None diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index e84ca8b30f..8000cddbdb 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -3,7 +3,7 @@ from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat, TSchemaContract +from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat, TSchemaContract from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -70,7 +70,7 @@ def columns(self) -> TTableHintTemplate[TTableSchemaColumns]: return None return self._table_schema_template.get("columns") - def compute_table_schema(self, item: TDataItem = None) -> TPartialTableSchema: + def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: """Computes the table schema based on hints and column definitions passed during resource creation. 
`item` parameter is used to resolve table hints based on data""" if not self._table_schema_template: return new_table(self.name, resource=self.name) @@ -85,13 +85,11 @@ def compute_table_schema(self, item: TDataItem = None) -> TPartialTableSchema: if self._table_name_hint_fun and item is None: raise DataItemRequiredForDynamicTableHints(self.name) # resolve - resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items()} # type: ignore - resolved_template.pop("incremental", None) - resolved_template.pop("validator", None) + resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items() if k not in ["incremental", "validator"]} # type: ignore table_schema = self._merge_keys(resolved_template) table_schema["resource"] = self.name validate_dict_ignoring_xkeys( - spec=TPartialTableSchema, + spec=TTableSchema, doc=table_schema, path=f"new_table/{self.name}", ) @@ -139,8 +137,6 @@ def apply_hints( t.pop("parent", None) if write_disposition: t["write_disposition"] = write_disposition - if schema_contract: - t["schema_contract"] = schema_contract if columns is not None: t['validator'] = get_column_validator(columns) # if callable then override existing @@ -154,7 +150,6 @@ def apply_hints( else: # set to empty columns t["columns"] = ensure_table_schema_columns(columns) - if primary_key is not None: if primary_key: t["primary_key"] = primary_key @@ -165,9 +160,15 @@ def apply_hints( t["merge_key"] = merge_key else: t.pop("merge_key", None) + if schema_contract is not None: + t["schema_contract"] = schema_contract # set properties that cannot be passed to new_table_template - t["incremental"] = incremental + if incremental is not None: + if incremental is Incremental.EMPTY: + t["incremental"] = None + else: + t["incremental"] = incremental self.set_template(t) def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index ad3b3f4678..864424dad9 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -18,13 +18,11 @@ def expect_tables(resource: DltResource) -> dlt.Schema: storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - # odd and even tables - assert len(schema_update) == 2 - assert "odd_table" in schema_update - assert "even_table" in schema_update - for partials in schema_update.values(): - assert len(partials) == 1 + extract(extract_id, source, storage) + # odd and even tables must be in the source schema + assert len(source.schema.data_tables(include_incomplete=True)) == 2 + assert "odd_table" in source.schema._schema_tables + assert "even_table" in source.schema._schema_tables # you must commit the files assert len(storage.list_files_to_normalize_sorted()) == 0 storage.commit_extract_files(extract_id) @@ -42,11 +40,9 @@ def expect_tables(resource: DltResource) -> dlt.Schema: source = source.with_resources(resource.name) source.selected_resources[resource.name].bind(10).select_tables("odd_table") extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - assert len(schema_update) == 1 - assert "odd_table" in schema_update - for partials in schema_update.values(): - assert len(partials) == 1 + extract(extract_id, source, storage) + assert len(source.schema.data_tables(include_incomplete=True)) == 1 + assert "odd_table" in 
source.schema._schema_tables storage.commit_extract_files(extract_id) assert len(storage.list_files_to_normalize_sorted()) == 1 expect_extracted_file(storage, "selectables", "odd_table", json.dumps([1,3,5,7,9])) @@ -86,10 +82,10 @@ def input_gen(): source = DltSource("selectables", "module", dlt.Schema("selectables"), [input_r, input_r.with_name("gen_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) + extract(extract_id, source, storage) # both tables got generated - assert "input_gen" in schema_update - assert "gen_clone" in schema_update + assert "input_gen" in source.schema._schema_tables + assert "gen_clone" in source.schema._schema_tables def test_extract_renamed_clone_and_parent(): @@ -105,8 +101,8 @@ def tx_step(item): source = DltSource("selectables", "module", dlt.Schema("selectables"), [input_r, (input_r | input_tx).with_name("tx_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - assert "input_gen" in schema_update - assert "tx_clone" in schema_update + extract(extract_id, source, storage) + assert "input_gen" in source.schema._schema_tables + assert "tx_clone" in source.schema._schema_tables # mind that pipe name of the evaluated parent will have different name than the resource assert source.tx_clone._pipe.parent.name == "input_gen_tx_clone" diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 9d5b37f472..ac3061ca60 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -19,7 +19,7 @@ from dlt.extract.source import DltSource from dlt.sources.helpers.transform import take_first -from dlt.extract.incremental import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing +from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing from dlt.pipeline.exceptions import PipelineStepFailed from tests.extract.utils import AssertItems, data_to_item_format, TItemFormat, ALL_ITEM_FORMATS, data_item_to_list @@ -127,9 +127,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items2 p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - - p.run(some_data()) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: @@ -166,8 +165,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items2 p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - p.run(some_data()) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: @@ -360,7 +359,7 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, isrc, market FROM some_data order by created_at, isrc, market") as cur: @@ -688,10 +687,12 
@@ def child(item): info = p.run(child, write_disposition="replace") # print(info.load_packages[0]) assert len(info.loads_ids) == 1 - # pipeline applied hints to the child resource - assert child.write_disposition == "replace" + # pipeline applied hints to the child resource but it was placed into source first + # so the original is still "append" + assert child.write_disposition == "append" # create a source where we place only child + child.write_disposition = "replace" s = DltSource("comp", "section", Schema("comp"), [child]) # but extracted resources will include its parent where it derives write disposition from child extracted = s.resources.extracted diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index d8223f2ee8..05366aaa94 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -10,6 +10,7 @@ from dlt.common.schema import Schema from dlt.common.typing import TDataItems from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, ResourcesNotFoundError +from dlt.extract.incremental import Incremental from dlt.extract.pipe import Pipe from dlt.extract.typing import FilterItem, MapItem from dlt.extract.source import DltResource, DltResourceDict, DltSource @@ -1170,7 +1171,7 @@ def empty_gen(): assert empty_r.table_name == "table" # reset - empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}) + empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}, incremental=Incremental.EMPTY) assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append'} table = empty_r.compute_table_schema() assert table["name"] == "empty_gen" From d74242a7d612875eeaae83d2c74372070e787c5f Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:51:03 +0100 Subject: [PATCH 58/73] always uses pipeline schema when extracting --- dlt/pipeline/pipeline.py | 41 ++++++++------- tests/pipeline/test_pipeline.py | 9 +++- .../test_schema_contracts.py | 50 ++++++++++++------- tests/pipeline/test_schema_updates.py | 4 +- 4 files changed, 65 insertions(+), 39 deletions(-) rename tests/{load => pipeline}/test_schema_contracts.py (93%) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index c5ebd5a619..d69b673408 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -537,7 +537,6 @@ def sync_destination(self, destination: TDestinationReferenceArg = None, staging # on merge schemas are replaced so we delete all old versions self._schema_storage.clear_storage() for schema in restored_schemas: - print("RESTORE SCHEMA?") self._schema_storage.save_schema(schema) # if the remote state is present then unset first run if remote_state is not None: @@ -804,9 +803,19 @@ def _data_to_sources(self, ) -> List[DltSource]: def apply_hint_args(resource: DltResource) -> None: - # apply hints only if any of the hints is present, table_name must be always present - if table_name or parent_table_name or write_disposition or columns or primary_key: - resource.apply_hints(table_name or resource.table_name or resource.name, parent_table_name, write_disposition, columns, primary_key) + resource.apply_hints( + table_name, + parent_table_name, + 
write_disposition, + columns, + primary_key, + schema_contract=schema_contract + ) + + def apply_settings(source_: DltSource) -> None: + # apply schema contract settings + if schema_contract: + source_.schema_contract = schema_contract def choose_schema() -> Schema: """Except of explicitly passed schema, use a clone that will get discarded if extraction fails""" @@ -816,9 +825,6 @@ def choose_schema() -> Schema: schema_ = self.default_schema.clone() else: schema_ = self._make_schema_with_default_name() - # apply schema contract settings - if schema_contract: - schema_.set_schema_contract(schema_contract, update_table_settings=True) return schema_ effective_schema = choose_schema() @@ -832,14 +838,8 @@ def append_data(data_item: Any) -> None: # if schema is explicit then override source schema if schema: data_item.schema = schema - # try to apply hints to resources - _resources = data_item.resources.values() - for r in _resources: - apply_hint_args(r) sources.append(data_item) elif isinstance(data_item, DltResource): - # apply hints - apply_hint_args(data_item) # do not set section to prevent source that represent a standalone resource # to overwrite other standalone resources (ie. parents) in that source sources.append( @@ -848,10 +848,9 @@ def append_data(data_item: Any) -> None: else: # iterator/iterable/generator # create resource first without table template - resource = DltResource.from_data(data_item, name=table_name, section=self.pipeline_name) - # apply hints - apply_hint_args(resource) - resources.append(resource) + resources.append( + DltResource.from_data(data_item, name=table_name, section=self.pipeline_name) + ) if isinstance(data, C_Sequence) and len(data) > 0: # if first element is source or resource @@ -863,10 +862,16 @@ def append_data(data_item: Any) -> None: else: append_data(data) + # add all the appended resources in one source if resources: - # add all the appended resources in one source sources.append(DltSource(effective_schema.name, self.pipeline_name, effective_schema, resources)) + # apply hints and settings + for source in sources: + apply_settings(source) + for resource in source.selected_resources.values(): + apply_hint_args(resource) + return sources def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int) -> str: diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 42de866962..3d95af806c 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -445,13 +445,16 @@ def data_piece_2(): # first run didn't really happen assert p.first_run is True assert p.has_data is False - assert p._schema_storage.list_schemas() == [] assert p.default_schema_name is None + # one of the schemas is in memory + # TODO: we may want to fix that + assert len(p._schema_storage.list_schemas()) == 1 # restore the pipeline p = dlt.attach(pipeline_name) assert p.first_run is True assert p.has_data is False + # no schema was saved to storage, the one above was only in memory assert p._schema_storage.list_schemas() == [] assert p.default_schema_name is None @@ -479,7 +482,9 @@ def data_schema_3(): # first run didn't really happen assert p.first_run is True assert p.has_data is False - assert p._schema_storage.list_schemas() == [] + # schemas from two sources are in memory + # TODO: we may want to fix that + assert len(p._schema_storage.list_schemas()) == 2 assert p.default_schema_name is None os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately diff --git 
a/tests/load/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py similarity index 93% rename from tests/load/test_schema_contracts.py rename to tests/pipeline/test_schema_contracts.py index da2e1b2568..55d77ee050 100644 --- a/tests/load/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -1,16 +1,16 @@ import dlt, os, pytest +import contextlib +from typing import Any, Union, Optional + from dlt.common.schema.typing import TSchemaContract from dlt.common.utils import uniq_id -from typing import Any, Union, Optional from dlt.extract.source import DltSource, DltResource -import contextlib - -from tests.load.pipeline.utils import load_table_counts -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.exceptions import PipelineStepFailed from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.schema import utils +from tests.load.pipeline.utils import load_table_counts from tests.utils import skip_if_not_active skip_if_not_active("duckdb") @@ -19,6 +19,7 @@ LOCATIONS = ["source", "resource", "override"] SCHEMA_ELEMENTS = ["tables", "columns", "data_type"] + @contextlib.contextmanager def raises_frozen_exception(check_raise: bool = True) -> Any: if not check_raise: @@ -28,6 +29,7 @@ def raises_frozen_exception(check_raise: bool = True) -> Any: yield assert isinstance(py_exc.value.__context__, SchemaFrozenException) + def items(settings: TSchemaContract) -> Any: @dlt.resource(name="items", write_disposition="append", schema_contract=settings) @@ -41,6 +43,7 @@ def load_items(): return load_items + def items_with_variant(settings: TSchemaContract) -> Any: @dlt.resource(name="items", write_disposition="append", schema_contract=settings) @@ -54,6 +57,7 @@ def load_items(): return load_items + def items_with_new_column(settings: TSchemaContract) -> Any: @dlt.resource(name="items", write_disposition="append", schema_contract=settings) @@ -104,7 +108,7 @@ def load_items(): NEW_ITEMS_TABLE = "new_items" -def run_resource(pipeline, resource_fun, settings) -> None: +def run_resource(pipeline: Pipeline, resource_fun, settings) -> None: for item in settings.keys(): assert item in LOCATIONS @@ -122,11 +126,14 @@ def source() -> DltResource: # run pipeline pipeline.run(source(), schema_contract=settings.get("override")) - # check updated schema + # check global settings assert pipeline.default_schema._settings.get("schema_contract", None) == (settings.get("override") or settings.get("source")) # check items table settings - assert pipeline.default_schema.tables["items"].get("schema_contract", {}) == (settings.get("override") or settings.get("resource") or {}) + # assert pipeline.default_schema.tables["items"].get("schema_contract", {}) == (settings.get("resource") or {}) + + # check effective table settings + # assert resolve_contract_settings_for_table(None, "items", pipeline.default_schema) == expand_schema_contract_settings(settings.get("resource") or settings.get("override") or "evolve") def get_pipeline(): import duckdb @@ -161,6 +168,8 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None # test adding new subtable with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, items_with_subtable, full_settings) + # delete extracted files if left after exception + pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources()) table_counts = load_table_counts(pipeline, 
*[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 if contract_setting in ["freeze"] else 40 @@ -203,6 +212,8 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non # test adding new column with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, items_with_new_column, full_settings) + # delete extracted files if left after exception + pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources()) if contract_setting == "evolve": assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -212,15 +223,16 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non assert table_counts["items"] == (30 if contract_setting in ["evolve", "discard_value"] else 20) # test adding variant column - with raises_frozen_exception(contract_setting == "freeze"): - run_resource(pipeline, items_with_variant, full_settings) - - if contract_setting == "evolve": - assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - else: - assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] + # with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, items_with_variant, full_settings) + # variants are not new columns and should be able to always evolve + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + # if contract_setting == "evolve": + # assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + # else: + # assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 20) + assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 30) @pytest.mark.parametrize("contract_setting", schema_contract) @@ -319,6 +331,7 @@ def test_settings_precedence_2() -> None: table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 30 + @pytest.mark.parametrize("setting_location", LOCATIONS) def test_change_mode(setting_location: str) -> None: pipeline = get_pipeline() @@ -342,6 +355,7 @@ def test_change_mode(setting_location: str) -> None: table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 + @pytest.mark.parametrize("setting_location", LOCATIONS) def test_single_settings_value(setting_location: str) -> None: pipeline = get_pipeline() @@ -551,7 +565,7 @@ def test_dynamic_new_columns(column_mode: str) -> None: # 1. schema.dlt_tables() - everything evolve # 2. 
is_dlt_column (I hope we have helper) - column evolve, data_type freeze - def columns(item): + def dynamic_columns(item): if item["id"] == 1: return [{"name": "key", "data_type": "text", "nullable": True}] if item["id"] == 2: @@ -569,7 +583,7 @@ def get_items(): } items = get_items() - items.apply_hints(columns=columns) + items.apply_hints(columns=dynamic_columns) # apply_hints applies to `items`, not the original resource, so calling get_items() below would remove them completely pipeline.run(items) assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 2 diff --git a/tests/pipeline/test_schema_updates.py b/tests/pipeline/test_schema_updates.py index 97345061e3..86c0c04be2 100644 --- a/tests/pipeline/test_schema_updates.py +++ b/tests/pipeline/test_schema_updates.py @@ -1,8 +1,10 @@ +import os import dlt def test_schema_updates() -> None: + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p = dlt.pipeline(pipeline_name="test_schema_updates", full_refresh=True, destination="dummy") @dlt.source() @@ -15,7 +17,7 @@ def resource(): # test without normalizer attributes s = source() p.run(s, table_name="items", write_disposition="append") - assert p.default_schema._normalizers_config["json"]["config"] == {} + assert "config" not in p.default_schema._normalizers_config["json"] # add table propagation s = source() From e9344ee4e09c82ebdecb5b4b7f7157afc6a4106c Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:51:48 +0100 Subject: [PATCH 59/73] returns new items count from buffered write --- dlt/common/data_writers/buffered.py | 14 +++++++++----- dlt/common/storages/data_item_storage.py | 4 ++-- dlt/common/utils.py | 1 + 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 5c93e22bc6..783a3501d2 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -68,7 +68,7 @@ def __init__( except TypeError: raise InvalidFileNameTemplateException(file_name_template) - def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> None: + def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> int: self._ensure_open() # rotate file if columns changed and writer does not allow for that # as the only allowed change is to add new column (no updates/deletes), we detect the change by comparing lengths @@ -78,21 +78,24 @@ def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> Non # until the first chunk is written we can change the columns schema freely if columns is not None: self._current_columns = dict(columns) + + new_rows_count: int if isinstance(item, List): # items coming in single list will be written together, no matter how many there are self._buffered_items.extend(item) # update row count, if item supports "num_rows" it will be used to count items if len(item) > 0 and hasattr(item[0], "num_rows"): - self._buffered_items_count += sum(tbl.num_rows for tbl in item) + new_rows_count = sum(tbl.num_rows for tbl in item) else: - self._buffered_items_count += len(item) + new_rows_count = len(item) else: self._buffered_items.append(item) # update row count, if item supports "num_rows" it will be used to count items if hasattr(item, "num_rows"): - self._buffered_items_count += item.num_rows + new_rows_count = item.num_rows else: - self._buffered_items_count += 1 + new_rows_count = 1 + self._buffered_items_count += new_rows_count # flush if max buffer exceeded if 
self._buffered_items_count >= self.buffer_max_items: self._flush_items() @@ -104,6 +107,7 @@ def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> Non # rotate on max items elif self.file_max_items and self._writer.items_count >= self.file_max_items: self._rotate_file() + return new_rows_count def write_empty_file(self, columns: TTableSchemaColumns) -> None: if columns is not None: diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 8de95a6f60..6621f07e26 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -24,10 +24,10 @@ def get_writer(self, load_id: str, schema_name: str, table_name: str) -> Buffere self.buffered_writers[writer_id] = writer return writer - def write_data_item(self, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: + def write_data_item(self, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> int: writer = self.get_writer(load_id, schema_name, table_name) # write item(s) - writer.write_data_item(item, columns) + return writer.write_data_item(item, columns) def write_empty_file(self, load_id: str, schema_name: str, table_name: str, columns: TTableSchemaColumns) -> None: writer = self.get_writer(load_id, schema_name, table_name) diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 692f8452c9..5d22601681 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -245,6 +245,7 @@ def update_dict_with_prune(dest: DictStrAny, update: StrAny) -> None: def update_dict_nested(dst: TDict, src: StrAny) -> TDict: + """Merges `src` into `dst` key wise. Does not recur into lists. Values in `src` overwrite `dst` if both keys exit.""" # based on https://github.com/clarketm/mergedeep/blob/master/mergedeep/mergedeep.py def _is_recursive_merge(a: StrAny, b: StrAny) -> bool: From e980396b97e6a5f9067d8b1a770adb0b4a2f6b7b Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Sun, 12 Nov 2023 23:57:48 +0100 Subject: [PATCH 60/73] bumps pyarrow to 12, temporary removes snowflake extra --- poetry.lock | 72 ++++++++++++++++++++++++++++---------------------- pyproject.toml | 6 ++--- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git a/poetry.lock b/poetry.lock index 15c257b607..5ae75a7757 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3303,7 +3303,7 @@ python-versions = ">=3.7" name = "pandas" version = "1.5.3" description = "Powerful data structures for data analysis, time series, and statistics" -category = "main" +category = "dev" optional = false python-versions = ">=3.8" @@ -3523,11 +3523,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "pyarrow" -version = "10.0.1" +version = "14.0.1" description = "Python library for Apache Arrow" category = "main" optional = true -python-versions = ">=3.7" +python-versions = ">=3.8" [package.dependencies] numpy = ">=1.16.6" @@ -4265,9 +4265,7 @@ idna = ">=2.5,<4" keyring = {version = "<16.1.0 || >16.1.0,<25.0.0", optional = true, markers = "extra == \"secure-local-storage\""} oscrypto = "<2.0.0" packaging = "*" -pandas = {version = ">=1.0.0,<2.1.0", optional = true, markers = "extra == \"pandas\""} platformdirs = ">=2.6.0,<3.9.0" -pyarrow = {version = ">=10.0.1,<10.1.0", optional = true, markers = "extra == \"pandas\""} pycryptodomex = ">=3.2,<3.5.0 || >3.5.0,<4.0.0" pyjwt = "<3.0.0" pyOpenSSL = ">=16.2.0,<24.0.0" @@ -4855,13 +4853,12 @@ postgres = 
["psycopg2-binary", "psycopg2cffi"] pydantic = ["pydantic"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["s3fs", "botocore"] -snowflake = ["snowflake-connector-python"] weaviate = ["weaviate-client"] [metadata] lock-version = "1.1" python-versions = ">=3.8.1,<4.0" -content-hash = "7d5b9bfb96bfd08e2b6843df885a3ff605abe603250db78e35350e18bc933a64" +content-hash = "09402c3f7c1e14326de5f5ae9637d966b7aba4acc01b2b5f05810ca421d4af48" [metadata.files] about-time = [ @@ -6917,31 +6914,42 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyarrow = [ - {file = "pyarrow-10.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:e00174764a8b4e9d8d5909b6d19ee0c217a6cf0232c5682e31fdfbd5a9f0ae52"}, - {file = "pyarrow-10.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6f7a7dbe2f7f65ac1d0bd3163f756deb478a9e9afc2269557ed75b1b25ab3610"}, - {file = "pyarrow-10.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb627673cb98708ef00864e2e243f51ba7b4c1b9f07a1d821f98043eccd3f585"}, - {file = "pyarrow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba71e6fc348c92477586424566110d332f60d9a35cb85278f42e3473bc1373da"}, - {file = "pyarrow-10.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7b4ede715c004b6fc535de63ef79fa29740b4080639a5ff1ea9ca84e9282f349"}, - {file = "pyarrow-10.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:e3fe5049d2e9ca661d8e43fab6ad5a4c571af12d20a57dffc392a014caebef65"}, - {file = "pyarrow-10.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:254017ca43c45c5098b7f2a00e995e1f8346b0fb0be225f042838323bb55283c"}, - {file = "pyarrow-10.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70acca1ece4322705652f48db65145b5028f2c01c7e426c5d16a30ba5d739c24"}, - {file = "pyarrow-10.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abb57334f2c57979a49b7be2792c31c23430ca02d24becd0b511cbe7b6b08649"}, - {file = "pyarrow-10.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:1765a18205eb1e02ccdedb66049b0ec148c2a0cb52ed1fb3aac322dfc086a6ee"}, - {file = "pyarrow-10.0.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:61f4c37d82fe00d855d0ab522c685262bdeafd3fbcb5fe596fe15025fbc7341b"}, - {file = "pyarrow-10.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e141a65705ac98fa52a9113fe574fdaf87fe0316cde2dffe6b94841d3c61544c"}, - {file = "pyarrow-10.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf26f809926a9d74e02d76593026f0aaeac48a65b64f1bb17eed9964bfe7ae1a"}, - {file = "pyarrow-10.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:443eb9409b0cf78df10ced326490e1a300205a458fbeb0767b6b31ab3ebae6b2"}, - {file = "pyarrow-10.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f2d00aa481becf57098e85d99e34a25dba5a9ade2f44eb0b7d80c80f2984fc03"}, - {file = "pyarrow-10.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b1fc226d28c7783b52a84d03a66573d5a22e63f8a24b841d5fc68caeed6784d4"}, - {file = "pyarrow-10.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efa59933b20183c1c13efc34bd91efc6b2997377c4c6ad9272da92d224e3beb1"}, - {file = "pyarrow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:668e00e3b19f183394388a687d29c443eb000fb3fe25599c9b4762a0afd37775"}, - {file = "pyarrow-10.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1bc6e4d5d6f69e0861d5d7f6cf4d061cf1069cb9d490040129877acf16d4c2a"}, - 
{file = "pyarrow-10.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:42ba7c5347ce665338f2bc64685d74855900200dac81a972d49fe127e8132f75"}, - {file = "pyarrow-10.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b069602eb1fc09f1adec0a7bdd7897f4d25575611dfa43543c8b8a75d99d6874"}, - {file = "pyarrow-10.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94fb4a0c12a2ac1ed8e7e2aa52aade833772cf2d3de9dde685401b22cec30002"}, - {file = "pyarrow-10.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db0c5986bf0808927f49640582d2032a07aa49828f14e51f362075f03747d198"}, - {file = "pyarrow-10.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:0ec7587d759153f452d5263dbc8b1af318c4609b607be2bd5127dcda6708cdb1"}, - {file = "pyarrow-10.0.1.tar.gz", hash = "sha256:1a14f57a5f472ce8234f2964cd5184cccaa8df7e04568c64edc33b23eb285dd5"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"}, + {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"}, + {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"}, + {file = 
"pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"}, + {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"}, + {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"}, + {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, + {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, ] pyasn1 = [ {file = "pyasn1-0.5.0-py2.py3-none-any.whl", hash = "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57"}, diff --git a/pyproject.toml b/pyproject.toml index 3795f0096b..5c50625ab2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ psycopg2-binary = {version = ">=2.9.1", optional = true} psycopg2cffi = {version = ">=2.9.0", optional = true, markers="platform_python_implementation == 'PyPy'"} grpcio = {version = ">=1.50.0", optional = true} google-cloud-bigquery = {version = ">=2.26.0", optional = true} -pyarrow = {version = ">=8.0.0", optional = true} +pyarrow = {version = ">=12.0.0", optional = true} duckdb = {version = ">=0.6.1,<0.10.0", optional = true} dbt-core = 
{version = ">=1.2.0", optional = true} dbt-redshift = {version = ">=1.2.0", optional = true} @@ -68,7 +68,7 @@ dbt-athena-community = {version = ">=1.2.0", optional = true} s3fs = {version = ">=2022.4.0", optional = true} gcsfs = {version = ">=2022.4.0", optional = true} botocore = {version = ">=1.28", optional = true} -snowflake-connector-python = {version = ">=3.1.1", optional = true, extras = ["pandas"]} +# snowflake-connector-python = {version = ">=3.1.1", optional = true, extras = ["pandas"]} cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} @@ -91,7 +91,7 @@ filesystem = ["s3fs", "botocore"] s3 = ["s3fs", "botocore"] gs = ["gcsfs"] az = ["adlfs"] -snowflake = ["snowflake-connector-python"] +# snowflake = ["snowflake-connector-python"] motherduck = ["duckdb", "pyarrow"] cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] From 3dc4fa5d9f0231eadffd53493585495a2600e18e Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 13 Nov 2023 16:10:14 +0100 Subject: [PATCH 61/73] fixes arrow imports and normalizer config --- dlt/common/libs/pyarrow.py | 3 +-- dlt/common/schema/schema.py | 5 ++--- dlt/extract/extract.py | 8 +++----- tests/load/pipeline/test_write_disposition_changes.py | 2 +- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 7dce062c54..5051635e67 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -4,8 +4,7 @@ from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.destination.capabilities import DestinationCapabilitiesContext -from dlt.common.schema.typing import TColumnType, TColumnSchemaBase -from dlt.common.data_types import TDataType +from dlt.common.schema.typing import TColumnType from dlt.common.typing import TFileOrPath try: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 73b10a6996..942b49838e 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -325,9 +325,8 @@ def update_schema(self, schema: "Schema") -> None: # update all tables for table in schema.tables.values(): self.update_table(table) - # update normalizer config nondestructively - self.data_item_normalizer.update_normalizer_config(self, self.data_item_normalizer.get_normalizer_config(schema)) - self.update_normalizers() + # pass normalizer config + self._configure_normalizers(schema._normalizers_config) # update and compile settings self._settings = deepcopy(schema.settings) self._compile_settings() diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 778499dcd5..176d3ca0ff 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -6,7 +6,6 @@ from dlt.common.configuration.container import Container from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.libs.pyarrow import TAnyArrowItem from dlt.common.pipeline import reset_resource_state from dlt.common.data_writers import TLoaderFileFormat from dlt.common.exceptions import MissingDependencyException @@ -14,7 +13,7 @@ from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.utils import uniq_id, update_dict_nested -from dlt.common.typing import StrStr, TDataItems, TDataItem +from dlt.common.typing import StrStr, TDataItems, TDataItem, NoneType 
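=1.2.0", optional = true}">
# A minimal illustrative sketch (assumed names, not part of this patch) of the optional-import
# guard pattern used a few lines below: pyarrow stays an optional extra, so modules import it
# inside try/except and fall back to placeholders, letting annotations and isinstance checks
# keep working when the extra is not installed.
from typing import Any

try:
    import pyarrow as pa
    arrow_available = True
except ImportError:
    pa = None  # placeholder so later references still resolve
    arrow_available = False

def is_arrow_item(item: Any) -> bool:
    # every use of the optional dependency is guarded by the availability flag
    return arrow_available and isinstance(item, (pa.Table, pa.RecordBatch))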
from dlt.common.schema import Schema, utils from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage @@ -28,9 +27,10 @@ from dlt.extract.typing import TableNameMeta try: from dlt.common.libs import pyarrow - from dlt.common.libs.pyarrow import pyarrow as pa + from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem except MissingDependencyException: pyarrow = None + TAnyArrowItem = Any # type: ignore[misc] try: import pandas as pd except ModuleNotFoundError: @@ -239,8 +239,6 @@ class JsonLExtractor(Extractor): class ArrowExtractor(Extractor): file_format = "arrow" - - def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: items = [ # 3. remove columns and rows in data contract filters diff --git a/tests/load/pipeline/test_write_disposition_changes.py b/tests/load/pipeline/test_write_disposition_changes.py index 158993b7c8..c88fd79588 100644 --- a/tests/load/pipeline/test_write_disposition_changes.py +++ b/tests/load/pipeline/test_write_disposition_changes.py @@ -77,7 +77,7 @@ def source(): if with_root_key: assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["root"] == {'_dlt_id': '_dlt_root_id'} else: - assert "propagation" not in pipeline.default_schema._normalizers_config["json"]["config"] + assert "propagation" not in pipeline.default_schema._normalizers_config["json"].get("config", {}) # without a root key this will fail, it is expected if not with_root_key and destination_config.supports_merge: From c76788ac652eb092ab3e6397b52b318ab19c8d3c Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 13 Nov 2023 19:41:00 +0100 Subject: [PATCH 62/73] fixes normalizer config tests and pipeline state serialization --- dlt/pipeline/pipeline.py | 15 ++++++--------- docs/website/docs/reference/tracing.md | 6 ++++++ tests/pipeline/test_schema_updates.py | 10 ++++------ 3 files changed, 16 insertions(+), 15 deletions(-) create mode 100644 docs/website/docs/reference/tracing.md diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index d69b673408..e1530e4242 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -87,7 +87,7 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: for name in self._schema_storage.live_schemas: self._schema_storage.commit_live_schema(name) # refresh list of schemas if any new schemas are added - self.schema_names = self._schema_storage.list_schemas() + self.schema_names = self._list_schemas_sorted() return rv return _wrap # type: ignore @@ -1210,13 +1210,6 @@ def managed_state(self, *, extract_state: bool = False) -> Iterator[TPipelineSta backup_state = self._get_state() # restore original pipeline props self._state_to_props(backup_state) - # synchronize schema storage with initial list of schemas, note that we'll not be able to synchronize the schema content - # NOTE: not needed - schemas are not saved and are kept as live until with_schema_sync ends - # if self._schema_storage: - # # TODO: we should restore schemas backup here - # for existing_schema_name in self._schema_storage.list_schemas(): - # if existing_schema_name not in self.schema_names: - # self._schema_storage.remove_schema(existing_schema_name) # raise original exception raise else: @@ -1267,7 +1260,11 @@ def _props_to_state(self, state: TPipelineState) -> None: state["destination"] = self.destination.__name__ if self.staging: 
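# A minimal sketch (illustrative, assumed names) of why schema names are sorted just below:
# directory listings carry no guaranteed order, so an unsorted list would make the serialized
# pipeline state, and any hash computed over it, differ between otherwise identical runs.
import json

def serialize_state(schema_names: list) -> str:
    # sorting yields a stable representation regardless of the storage listing order
    return json.dumps({"schema_names": sorted(schema_names)}, sort_keys=True)

assert serialize_state(["beta", "alpha"]) == serialize_state(["alpha", "beta"])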
state["staging"] = self.staging.__name__ - state["schema_names"] = self._schema_storage.list_schemas() + state["schema_names"] = self._list_schemas_sorted() + + def _list_schemas_sorted(self) -> List[str]: + """Lists schema names sorted to have deterministic state""" + return sorted(self._schema_storage.list_schemas()) def _save_state(self, state: TPipelineState) -> None: self._pipeline_storage.save(Pipeline.STATE_FILE, json_encode_state(state)) diff --git a/docs/website/docs/reference/tracing.md b/docs/website/docs/reference/tracing.md new file mode 100644 index 0000000000..0ad0a59912 --- /dev/null +++ b/docs/website/docs/reference/tracing.md @@ -0,0 +1,6 @@ +1. Identifiers + +2. Data Lineage + +3. Schema Lineage + diff --git a/tests/pipeline/test_schema_updates.py b/tests/pipeline/test_schema_updates.py index 86c0c04be2..b88c1a7773 100644 --- a/tests/pipeline/test_schema_updates.py +++ b/tests/pipeline/test_schema_updates.py @@ -47,12 +47,12 @@ def resource(): s = source() s.root_key = False p.run(s, table_name="items", write_disposition="merge") + # source schema overwrites normalizer settings so `root` propagation is gone assert p.default_schema._normalizers_config["json"]["config"] == { "propagation": { "tables": { "items": {'_dlt_id': '_dlt_root_id'} - }, - "root": {'_dlt_id': '_dlt_root_id'} + } } } @@ -64,8 +64,7 @@ def resource(): "propagation": { "tables": { "items": {'_dlt_id': '_dlt_root_id'} - }, - "root": {'_dlt_id': '_dlt_root_id'} + } }, "max_nesting": 5 } @@ -79,8 +78,7 @@ def resource(): "tables": { "items": {'_dlt_id': '_dlt_root_id'}, "items2": {'_dlt_id': '_dlt_root_id'}, - }, - "root": {'_dlt_id': '_dlt_root_id'} + } }, "max_nesting": 50 } \ No newline at end of file From 423a1636d8e60dbc6dc46de300179d3ebfbe0070 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 13 Nov 2023 23:20:30 +0100 Subject: [PATCH 63/73] normalizes arrow tables before saving --- dlt/common/data_writers/writers.py | 2 +- dlt/common/libs/pyarrow.py | 97 +++++++++++++++++++++++++--- dlt/common/schema/schema.py | 4 +- dlt/common/schema/typing.py | 1 + dlt/extract/exceptions.py | 6 -- dlt/extract/extract.py | 42 ++++++------ dlt/normalize/items_normalizers.py | 3 +- dlt/normalize/normalize.py | 7 +- tests/cases.py | 16 ++++- tests/pipeline/test_arrow_sources.py | 75 +++++++++++++++++++-- 10 files changed, 202 insertions(+), 51 deletions(-) diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 401f6aafd2..412e732e97 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -220,7 +220,7 @@ def __init__(self, self.parquet_row_group_size = row_group_size def _create_writer(self, schema: "pa.Schema") -> "pa.parquet.ParquetWriter": - from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype + from dlt.common.libs.pyarrow import pyarrow return pyarrow.parquet.ParquetWriter(self._f, schema, flavor=self.parquet_flavor, version=self.parquet_version, data_page_size=self.parquet_data_page_size) def write_header(self, columns_schema: TTableSchemaColumns) -> None: diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 5051635e67..f585971ee8 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -1,11 +1,14 @@ from typing import Any, Tuple, Optional, Union, Callable, Iterable, Iterator, Sequence, Tuple +from copy import copy + from dlt import version from dlt.common.exceptions import MissingDependencyException -from dlt.common.schema.typing import TTableSchemaColumns +from 
dlt.common.schema.typing import DLT_NAME_PREFIX, TTableSchemaColumns from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.schema.typing import TColumnType -from dlt.common.typing import TFileOrPath +from dlt.common.typing import StrStr, TFileOrPath +from dlt.common.normalizers.naming import NamingConvention try: import pyarrow @@ -173,6 +176,76 @@ def rename_columns(item: TAnyArrowItem, new_column_names: Sequence[str]) -> TAny raise TypeError(f"Unsupported data item type {type(item)}") +def normalize_py_arrow_schema( + item: TAnyArrowItem, + columns: TTableSchemaColumns, + naming: NamingConvention, + caps: DestinationCapabilitiesContext +) -> TAnyArrowItem: + """Normalize arrow `item` schema according to the `columns`. + + 1. arrow schema field names will be normalized according to `naming` + 2. arrows columns will be reordered according to `columns` + 3. empty columns will be inserted if they are missing, types will be generated using `caps` + """ + rename_mapping = get_normalized_arrow_fields_mapping(item, naming) + rev_mapping = {v: k for k, v in rename_mapping.items()} + dlt_table_prefix = naming.normalize_table_identifier(DLT_NAME_PREFIX) + + # remove all columns that are dlt columns but are not present in arrow schema. we do not want to add such columns + # that should happen in the normalizer + columns = {name:column for name, column in columns.items() if not name.startswith(dlt_table_prefix) or name in rev_mapping} + + # check if nothing to rename + if list(rename_mapping.keys()) == list(rename_mapping.values()): + # check if nothing to reorder + if list(rename_mapping.keys())[:len(columns)]== list(columns.keys()): + return item + + schema = item.schema + new_fields = [] + new_columns = [] + + for column_name, column in columns.items(): + # get original field name + field_name = rev_mapping.pop(column_name, column_name) + if field_name in rename_mapping: + idx = schema.get_field_index(field_name) + # use renamed field + new_fields.append(schema.field(idx).with_name(column_name)) + new_columns.append(item.column(idx)) + else: + # column does not exist in pyarrow. create empty field and column + new_field = pyarrow.field( + column_name, + get_py_arrow_datatype(column, caps, "UTC"), + nullable=column.get("nullable", True) + ) + new_fields.append(new_field) + new_columns.append(pyarrow.nulls(item.num_rows, type=new_field.type)) + + # add the remaining columns + for column_name, field_name in rev_mapping.items(): + idx = schema.get_field_index(field_name) + # use renamed field + new_fields.append(schema.field(idx).with_name(column_name)) + new_columns.append(item.column(idx)) + + # create desired type + return item.__class__.from_arrays(new_columns, schema=pyarrow.schema(new_fields)) + + +def get_normalized_arrow_fields_mapping(item: TAnyArrowItem, naming: NamingConvention) -> StrStr: + """Normalizes schema field names and returns mapping from original to normalized name. Raises on name clashes""" + norm_f = naming.normalize_identifier + name_mapping = {n.name: norm_f(n.name) for n in item.schema} + # verify if names uniquely normalize + normalized_names = set(name_mapping.values()) + if len(name_mapping) != len(normalized_names): + raise NameNormalizationClash(f"Arrow schema fields normalized from {list(name_mapping.keys())} to {list(normalized_names)}") + return name_mapping + + def py_arrow_to_table_schema_columns(schema: pyarrow.Schema) -> TTableSchemaColumns: """Convert a PyArrow schema to a table schema columns dict. 
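The two helpers added in the hunk above, normalize_py_arrow_schema and get_normalized_arrow_fields_mapping, rename arrow fields to the schema's naming convention, reorder them to match the dlt table schema, and back-fill columns that exist only in the dlt schema with nulls. A minimal standalone sketch of that rename/reorder/back-fill behaviour, written against plain pyarrow rather than the patched helpers; the column names and the assumed target layout below are illustrative only:

import pyarrow as pa

# input table: one field name is not normalized and the column order
# differs from the assumed dlt table schema
tbl = pa.table({"Pre Normalized Column": ["a", "b"], "value": [1, 2]})

# assumed target layout: "value" first, the renamed column second, plus a
# nullable "comment" column that exists only in the dlt schema
fields = [
    tbl.schema.field("value"),
    tbl.schema.field("Pre Normalized Column").with_name("pre_normalized_column"),
    pa.field("comment", pa.string(), nullable=True),
]
arrays = [
    tbl.column("value"),
    tbl.column("Pre Normalized Column"),
    pa.nulls(tbl.num_rows, type=pa.string()),  # back-fill the missing column with nulls
]
normalized = pa.Table.from_arrays(arrays, schema=pa.schema(fields))
print(normalized.schema.names)  # ['value', 'pre_normalized_column', 'comment']

Note that the patched helper short-circuits when there is nothing to rename and nothing to reorder, returning the input table unchanged, so already-normalized arrow data is not rebuilt.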
@@ -208,9 +281,8 @@ def get_row_count(parquet_file: TFileOrPath) -> int: def is_arrow_item(item: Any) -> bool: return isinstance(item, (pyarrow.Table, pyarrow.RecordBatch)) - -TNewColumns = Sequence[Tuple[pyarrow.Field, Callable[[pyarrow.Table], Iterable[Any]]]] - +TNewColumns = Sequence[Tuple[int, pyarrow.Field, Callable[[pyarrow.Table], Iterable[Any]]]] +"""Sequence of tuples: (field index, field, generating function)""" def pq_stream_with_new_columns( parquet_file: TFileOrPath, columns: TNewColumns, row_groups_per_read: int = 1 @@ -221,7 +293,7 @@ def pq_stream_with_new_columns( Args: parquet_file: path or file object to parquet file - columns: list of columns to add in the form of (`pyarrow.Field`, column_value_callback) + columns: list of columns to add in the form of (insertion index, `pyarrow.Field`, column_value_callback) The callback should accept a `pyarrow.Table` and return an array of values for the column. row_groups_per_read: number of row groups to read at a time. Defaults to 1. @@ -233,6 +305,15 @@ def pq_stream_with_new_columns( # Iterate through n row groups at a time for i in range(0, n_groups, row_groups_per_read): tbl: pyarrow.Table = reader.read_row_groups(range(i, min(i + row_groups_per_read, n_groups))) - for col in columns: - tbl = tbl.append_column(col[0], col[1](tbl)) + for idx, field, gen_ in columns: + if idx == -1: + tbl = tbl.append_column(field, gen_(tbl)) + else: + tbl = tbl.add_column(idx, field, gen_(tbl)) yield tbl + + +class NameNormalizationClash(ValueError): + def __init__(self, reason: str) -> None: + msg = f"Arrow column name clash after input data normalization. {reason}" + super().__init__(msg) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 942b49838e..c6093333b5 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -10,7 +10,7 @@ from dlt.common.normalizers.json import DataItemNormalizer, TNormalizedRowIterator from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType -from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaContractEntities, TSchemaEvolutionMode, TSchemaSettings, TSimpleRegex, TStoredSchema, +from dlt.common.schema.typing import (COLUMN_HINTS, DLT_NAME_PREFIX, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaContractEntities, TSchemaEvolutionMode, TSchemaSettings, TSimpleRegex, TStoredSchema, TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractDict, TSchemaContract) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) @@ -635,7 +635,7 @@ def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: # name normalization functions self.naming = naming_module - self._dlt_tables_prefix = self.naming.normalize_table_identifier("_dlt") + self._dlt_tables_prefix = self.naming.normalize_table_identifier(DLT_NAME_PREFIX) self.version_table_name = self.naming.normalize_table_identifier(VERSION_TABLE_NAME) self.loads_table_name = self.naming.normalize_table_identifier(LOADS_TABLE_NAME) self.state_table_name = self.naming.normalize_table_identifier(STATE_TABLE_NAME) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index f896032e6f..b6a0357793 100644 --- 
a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -17,6 +17,7 @@ VERSION_TABLE_NAME = "_dlt_version" LOADS_TABLE_NAME = "_dlt_loads" STATE_TABLE_NAME = "_dlt_pipeline_state" +DLT_NAME_PREFIX = "_dlt" TColumnProp = Literal["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique", "merge_key", "root_key"] """Known properties and hints of the column""" diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index 1ca876a1f0..e540a2468f 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -212,12 +212,6 @@ def __init__(self, resource_name: str) -> None: One of table hints for that resource (typically table name) is a function and hint is computed separately for each instance of data extracted from that resource.""") -class NameNormalizationClash(DltResourceException): - def __init__(self, resource_name: str, reason: str) -> None: - msg = f"Column name clash after input data normalization. {reason}" - super().__init__(resource_name, msg) - - class SourceDataIsNone(DltSourceException): def __init__(self, source_name: str) -> None: self.source_name = source_name diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 176d3ca0ff..061fd98e66 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -5,7 +5,9 @@ from dlt.common.configuration.container import Container from dlt.common.configuration.resolve import inject_section -from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import ConfigSectionContext, BaseConfiguration, configspec, known_sections +from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.pipeline import reset_resource_state from dlt.common.data_writers import TLoaderFileFormat from dlt.common.exceptions import MissingDependencyException @@ -13,15 +15,13 @@ from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.utils import uniq_id, update_dict_nested -from dlt.common.typing import StrStr, TDataItems, TDataItem, NoneType +from dlt.common.typing import TDataItems, TDataItem from dlt.common.schema import Schema, utils -from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns +from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns, TPartialTableSchema from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage -from dlt.common.configuration.specs import known_sections -from dlt.common.schema.typing import TPartialTableSchema from dlt.extract.decorators import SourceSchemaInjectableContext -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, NameNormalizationClash +from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints from dlt.extract.pipe import PipeIterator from dlt.extract.source import DltResource, DltSource from dlt.extract.typing import TableNameMeta @@ -109,13 +109,21 @@ def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_n class Extractor: file_format: TLoaderFileFormat + + @configspec + class ExtractorConfiguration(BaseConfiguration): + _caps: Optional[DestinationCapabilitiesContext] = None + + @with_config(spec=ExtractorConfiguration) def __init__( self, extract_id: str, storage: ExtractorStorage, 
schema: Schema, resources_with_items: Set[str], - collector: Collector = NULL_COLLECTOR + collector: Collector = NULL_COLLECTOR, + *, + _caps: DestinationCapabilitiesContext = None ) -> None: self.schema = schema self.collector = collector @@ -125,6 +133,7 @@ def __init__( self._filtered_tables: Set[str] = set() self._filtered_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] self._storage = storage + self._caps = _caps or DestinationCapabilitiesContext.generic_capabilities() @property def storage(self) -> ExtractorItemStorage: @@ -256,24 +265,11 @@ def _apply_contract_filters(self, item: TAnyArrowItem) -> TAnyArrowItem: # find matching columns and delete by original name return item - def _get_normalized_arrow_fields(self, resource_name: str, item: TAnyArrowItem) -> StrStr: - """Normalizes schema field names and returns mapping from original to normalized name. Raises on name clashes""" - norm_f = self.schema.naming.normalize_identifier - name_mapping = {n.name: norm_f(n.name) for n in item.schema} - # verify if names uniquely normalize - normalized_names = set(name_mapping.values()) - if len(name_mapping) != len(normalized_names): - raise NameNormalizationClash(resource_name, f"Arrow schema fields normalized from {list(name_mapping.keys())} to {list(normalized_names)}") - return name_mapping - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: + columns = columns or self.schema.tables[table_name]["columns"] # Note: `items` is always a list here due to the conversion in `write_table` - items = [pyarrow.rename_columns( - item, - list(self._get_normalized_arrow_fields(resource_name, item).values()) - ) - for item in items] - super()._write_item(table_name, resource_name, items, self.schema.tables[table_name]["columns"]) + items = [pyarrow.normalize_py_arrow_schema(item, columns, self.schema.naming, self._caps) for item in items] + super()._write_item(table_name, resource_name, items, columns) def _compute_table(self, resource: DltResource, data_item: TDataItem) -> TPartialTableSchema: data_item = data_item[0] diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index ce2a14477a..f096afbe71 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -217,6 +217,7 @@ def _write_with_dlt_columns( table_updates.append(table_update) load_id_type = pa.dictionary(pa.int8(), pa.string()) new_columns.append(( + -1, pa.field("_dlt_load_id", load_id_type, nullable=False), lambda batch: pa.array([load_id] * batch.num_rows, type=load_id_type) )) @@ -226,6 +227,7 @@ def _write_with_dlt_columns( table_updates = schema_update.setdefault(root_table_name, []) table_updates.append(table_update) new_columns.append(( + -1, pa.field("_dlt_id", pyarrow.pyarrow.string(), nullable=False), lambda batch: pa.array(generate_dlt_ids(batch.num_rows)) )) @@ -269,7 +271,6 @@ def __call__( self, extracted_items_file: str, root_table_name: str ) -> Tuple[List[TSchemaUpdate], int, TRowCount]: base_schema_update = self._fix_schema_precisions(root_table_name) - import pyarrow as pa add_dlt_id = self.config.parquet_normalizer.add_dlt_id add_dlt_load_id = self.config.parquet_normalizer.add_dlt_load_id diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 87bbac651c..ab87a5a2a1 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -51,7 +51,12 @@ def create_storages(self) -> None: # pass initial normalize storage config embedded in normalize config 
self.normalize_storage = NormalizeStorage(True, config=self.config._normalize_storage_config) # normalize saves in preferred format but can read all supported formats - self.load_storage = LoadStorage(True, self.config.destination_capabilities.preferred_loader_file_format, LoadStorage.ALL_SUPPORTED_FILE_FORMATS, config=self.config._load_storage_config) + self.load_storage = LoadStorage( + True, + self.config.destination_capabilities.preferred_loader_file_format, + LoadStorage.ALL_SUPPORTED_FILE_FORMATS, + config=self.config._load_storage_config + ) @staticmethod def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Schema: diff --git a/tests/cases.py b/tests/cases.py index ca8a97082e..70c20d74af 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -333,7 +333,14 @@ def assert_all_data_types_row( assert db_mapping == expected_rows -def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = True, include_time: bool = True, num_rows: int = 3) -> Tuple[Any, List[Dict[str, Any]]]: +def arrow_table_all_data_types( + object_format: TArrowFormat, + include_json: bool = True, + include_time: bool = True, + include_not_normalized_name: bool = True, + include_name_clash: bool = False, + num_rows: int = 3 +) -> Tuple[Any, List[Dict[str, Any]]]: """Create an arrow object or pandas dataframe with all supported data types. Returns the table and its records in python format @@ -342,7 +349,6 @@ def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = from dlt.common.libs.pyarrow import pyarrow as pa data = { - "Pre Normalized Column": [random.choice(ascii_lowercase) for _ in range(num_rows)], "string": [random.choice(ascii_lowercase) for _ in range(num_rows)], "float": [round(random.uniform(0, 100), 4) for _ in range(num_rows)], "int": [random.randrange(0, 100) for _ in range(num_rows)], @@ -355,6 +361,12 @@ def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = "null": pd.Series( [None for _ in range(num_rows)]) } + if include_name_clash: + data["pre Normalized Column"] = [random.choice(ascii_lowercase) for _ in range(num_rows)] + include_not_normalized_name = True + if include_not_normalized_name: + data["Pre Normalized Column"] = [random.choice(ascii_lowercase) for _ in range(num_rows)] + if include_json: data["json"] = [{"a": random.randrange(0, 100)} for _ in range(num_rows)] diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index 31d5d001df..1b32596786 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -6,15 +6,17 @@ import os import io import pyarrow as pa -from typing import List import dlt +from dlt.common import json, Decimal from dlt.common.utils import uniq_id +from dlt.common.libs.pyarrow import NameNormalizationClash + from dlt.pipeline.exceptions import PipelineStepFailed + from tests.cases import arrow_table_all_data_types, TArrowFormat from tests.utils import preserve_environ -from dlt.common import json -from dlt.common import Decimal + @pytest.mark.parametrize( @@ -87,7 +89,6 @@ def some_data(): assert schema_columns['json']['data_type'] == 'complex' - @pytest.mark.parametrize( ("item_type", "is_list"), [("pandas", False), ("table", False), ("record_batch", False), ("pandas", True), ("table", True), ("record_batch", True)] ) @@ -181,6 +182,44 @@ def data_frames(): assert len(pipeline.get_load_package_info(load_id).jobs["new_jobs"]) == 10 +@pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) 
+def test_arrow_clashing_names(item_type: TArrowFormat) -> None: + # # use parquet for dummy + os.environ["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" + pipeline_name = "arrow_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy") + + item, _ = arrow_table_all_data_types(item_type, include_name_clash=True) + + @dlt.resource + def data_frames(): + for _ in range(10): + yield item + + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.extract(data_frames()) + assert isinstance(py_ex.value.__context__, NameNormalizationClash) + + +@pytest.mark.parametrize("item_type", ["table", "record_batch"]) +def test_load_arrow_vary_schema(item_type: TArrowFormat) -> None: + pipeline_name = "arrow_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb") + + item, _ = arrow_table_all_data_types(item_type, include_not_normalized_name=False) + pipeline.run(item, table_name="data").raise_on_failed_jobs() + + item, _ = arrow_table_all_data_types(item_type, include_not_normalized_name=False) + # remove int column + try: + item = item.drop("int") + except AttributeError: + names = item.schema.names + names.remove("int") + item = item.select(names) + pipeline.run(item, table_name="data").raise_on_failed_jobs() + + @pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" @@ -199,7 +238,7 @@ def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: assert info.row_counts["items"] == len(rows) -@pytest.mark.parametrize("item_type", ["table", "pandas", "record_batch"]) +@pytest.mark.parametrize("item_type", ["table"]) # , "pandas", "record_batch" def test_normalize_with_dlt_columns(item_type: TArrowFormat): item, records = arrow_table_all_data_types(item_type, num_rows=5432) os.environ['NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID'] = "True" @@ -212,10 +251,10 @@ def test_normalize_with_dlt_columns(item_type: TArrowFormat): def some_data(): yield item - pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="filesystem") + pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="duckdb") pipeline.extract(some_data()) - pipeline.normalize() + pipeline.normalize(loader_file_format="parquet") load_id = pipeline.list_normalized_load_packages()[0] storage = pipeline._get_load_storage() @@ -241,3 +280,25 @@ def some_data(): schema = pipeline.default_schema assert schema.tables['some_data']['columns']['_dlt_id']['data_type'] == 'text' assert schema.tables['some_data']['columns']['_dlt_load_id']['data_type'] == 'text' + + pipeline.load().raise_on_failed_jobs() + + # should be able to load again + pipeline.run(some_data()).raise_on_failed_jobs() + + # should be able to load arrow without a column + try: + item = item.drop("int") + except AttributeError: + names = item.schema.names + names.remove("int") + item = item.select(names) + pipeline.run(item, table_name="some_data").raise_on_failed_jobs() + + # should be able to load arrow with a new column + item, records = arrow_table_all_data_types(item_type, num_rows=200) + item = item.append_column("static_int", [[0] * 200]) + pipeline.run(item, table_name="some_data").raise_on_failed_jobs() + + schema = pipeline.default_schema + assert schema.tables['some_data']['columns']['static_int']['data_type'] == 'bigint' From c1c32d683bddcacf6c846880c47f8e8a1d3ae010 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 16 Nov 2023 23:52:13 +0100 Subject: [PATCH 
64/73] adds validation and model synth for contracts to pydantic helper --- dlt/common/libs/pydantic.py | 182 +++++++++++++++++++++++++++-- tests/libs/test_pydantic.py | 225 ++++++++++++++++++++++++++++++++++++ 2 files changed, 395 insertions(+), 12 deletions(-) create mode 100644 tests/libs/test_pydantic.py diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index c66d67f1f7..18fafa0138 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -1,30 +1,64 @@ -from typing import Type, Union, get_type_hints, get_args, Any +from typing import Generic, Sequence, TypedDict, List, Type, Union, TypeVar, get_type_hints, get_args, Any -from dlt.common.exceptions import MissingDependencyException -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.common.data_types import py_type_to_sc_type, TDataType -from dlt.common.typing import is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union +from dlt.common.exceptions import MissingDependencyException, DltException +from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns +from dlt.common.data_types import py_type_to_sc_type +from dlt.common.typing import TDataItem, TDataItems, is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union try: - from pydantic import BaseModel, Field, Json + from pydantic import BaseModel, ValidationError, Json, create_model except ImportError: - raise MissingDependencyException("DLT pydantic Helpers", ["pydantic"], "DLT Helpers for for pydantic.") + raise MissingDependencyException("dlt Pydantic helpers", ["pydantic"], "Both Pydantic 1.x and 2.x are supported") +_PYDANTIC_2 = False +try: + from pydantic import PydanticDeprecatedSince20 + _PYDANTIC_2 = True + # hide deprecation warning + import warnings + warnings.simplefilter("ignore", category=PydanticDeprecatedSince20) +except ImportError: + pass + +_TPydanticModel = TypeVar("_TPydanticModel", bound=BaseModel) + + +class ListModel(BaseModel, Generic[_TPydanticModel]): + items: List[_TPydanticModel] + + +class DltConfig(TypedDict, total=False): + """dlt configuration that can be attached to Pydantic model + + Example below removes `nested` field from the resulting dlt schema. + >>> class ItemModel(BaseModel): + >>> b: bool + >>> nested: Dict[str, Any] + >>> dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + """ + skip_complex_types: bool + """If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from dlt schema generated from the model""" -def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], skip_complex_types: bool = False) -> TTableSchemaColumns: + +def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]]) -> TTableSchemaColumns: """Convert a pydantic model to a table schema columns dict + See also DltConfig for more control over how the schema is created + Args: model: The pydantic model to convert. Can be a class or an instance. - skip_complex_types: If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from the result. 
+ Returns: TTableSchemaColumns: table schema columns dict """ + skip_complex_types = False + if hasattr(model, "dlt_config"): + skip_complex_types = model.dlt_config.get("skip_complex_types", False) + result: TTableSchemaColumns = {} - fields = model.__fields__ - for field_name, field in fields.items(): + for field_name, field in model.__fields__.items(): # type: ignore[union-attr] annotation = field.annotation if inner_annotation := getattr(annotation, 'inner_type', None): # This applies to pydantic.Json fields, the inner type is the type after json parsing @@ -49,7 +83,12 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], s inner_type = dict name = field.alias or field_name - data_type = py_type_to_sc_type(inner_type) + try: + data_type = py_type_to_sc_type(inner_type) + except TypeError: + # try to coerce unknown type to text + data_type = "text" + if data_type == 'complex' and skip_complex_types: continue @@ -60,3 +99,122 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], s } return result + +def apply_schema_contract_to_model( + model: Type[_TPydanticModel], + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode = "freeze" +) -> Type[_TPydanticModel]: + """Configures or re-creates `model` so it behaves according to `column_mode` and `data_mode` settings. + + `column_mode` sets the model behavior when unknown field is found. + `data_mode` sets model behavior when known field does not validate. currently `evolve` and `freeze` are supported here. + + `discard_row` is implemented in `validate_item`. + """ + if data_mode == "evolve": + # create a lenient model that accepts any data + model = create_model(model.__name__ + "Any", **{n:(Any, None) for n in model.__fields__}) # type: ignore[call-overload, attr-defined] + elif data_mode == "discard_value": + raise ValueError("data_mode is discard_value. Cannot discard defined fields with validation errors using Pydantic models. Not yet implemented.") + + extra = "forbid" + if column_mode == "evolve": + extra = "allow" + elif column_mode == "discard_value": + extra = "ignore" + + if _PYDANTIC_2: + config = model.model_config + config["extra"] = extra # type: ignore[typeddict-item] + else: + config = model.Config # type: ignore[attr-defined] + config.extra = extra # type: ignore[attr-defined] + + return create_model( # type: ignore[no-any-return, call-overload] + model.__name__ + "Extra" + extra.title(), + __config__ = config, + **{n:(f.annotation, f) for n, f in model.__fields__.items()} # type: ignore[attr-defined] + ) + + +def create_list_model(model: Type[_TPydanticModel], data_mode: TSchemaEvolutionMode = "freeze") -> Type[ListModel[_TPydanticModel]]: + """Creates a model from `model` for validating list of items in batch according to `data_mode` + + Currently only freeze is supported. See comments in the code + """ + # TODO: use LenientList to create list model that automatically discards invalid items + # https://github.com/pydantic/pydantic/issues/2274 and https://gist.github.com/dmontagu/7f0cef76e5e0e04198dd608ad7219573 + return create_model( + "List" + __name__, + items=(List[model], ...) 
# type: ignore[return-value,valid-type] + ) + + +def validate_items( + list_model: Type[ListModel[_TPydanticModel]], + items: List[TDataItem], + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode +) -> List[_TPydanticModel]: + """Validates list of `item` with `list_model` and returns parsed Pydantic models + + `list_model` should be created with `create_list_model` and have `items` field which this function returns. + """ + try: + return list_model(items=items).items + except ValidationError as e: + delta_idx = 0 + for err in e.errors(): + if len(err["loc"]) >= 2: + err_idx = int(err["loc"][1]) - delta_idx + err_item = items[err_idx] + else: + # top level error which means misalignment of list model and items + raise FullValidationError(list_model, items, e) from e + # raise on freeze + if err["type"] == 'extra_forbidden': + if column_mode == "freeze": + raise FullValidationError(list_model, err_item, e) from e + elif column_mode == "discard_row": + items.pop(err_idx) + delta_idx += 1 + + else: + if data_mode == "freeze": + raise FullValidationError(list_model, err_item, e) from e + elif data_mode == "discard_row": + items.pop(err_idx) + delta_idx += 1 + + # validate again with error items removed + return validate_items(list_model, items, column_mode, data_mode) + + +def validate_item(model: Type[_TPydanticModel], item: TDataItems, column_mode: TSchemaEvolutionMode, data_mode: TSchemaEvolutionMode) -> _TPydanticModel: + """Validates `item` against model `model` and returns an instance of it""" + try: + return model.parse_obj(item) + except ValidationError as e: + for err in e.errors(include_url=False, include_context=False): + # raise on freeze + if err["type"] == 'extra_forbidden': + if column_mode == "freeze": + raise FullValidationError(model, item, e) from e + elif column_mode == "discard_row": + return None + else: + if data_mode == "freeze": + raise FullValidationError(model, item, e) from e + elif data_mode == "discard_row": + return None + # validate again with error items removed + return validate_item(model, item, column_mode, data_mode) + + +class FullValidationError(ValueError, DltException): + def __init__(self, validator: Type[BaseModel], data_item: TDataItems, original_exception: Exception) ->None: + self.original_exception = original_exception + self.validator = validator + self.data_item = data_item + super().__init__(f"Extracted data item could not be validated with {validator}. 
Original message: {original_exception}") diff --git a/tests/libs/test_pydantic.py b/tests/libs/test_pydantic.py new file mode 100644 index 0000000000..274ce20fcd --- /dev/null +++ b/tests/libs/test_pydantic.py @@ -0,0 +1,225 @@ +from copy import copy +import pytest +from typing import ClassVar, Union, Optional, List, Dict, Any +from enum import Enum + +from datetime import datetime, date, time # noqa: I251 +from dlt.common import Decimal +from dlt.common import json + +from dlt.common.libs.pydantic import DltConfig, pydantic_to_table_schema_columns, apply_schema_contract_to_model, validate_item, validate_items, create_list_model +from pydantic import BaseModel, Json, AnyHttpUrl, ConfigDict, ValidationError + + +class StrEnum(str, Enum): + a = "a_value" + b = "b_value" + c = "c_value" + + +class IntEnum(int, Enum): + a = 0 + b = 1 + c = 2 + + +class MixedEnum(Enum): + a_int = 0 + b_str = "b_value" + c_int = 2 + + +class NestedModel(BaseModel): + nested_field: str + + +class Model(BaseModel): + bigint_field: int + text_field: str + timestamp_field: datetime + date_field: date + decimal_field: Decimal + double_field: float + time_field: time + + nested_field: NestedModel + list_field: List[str] + + union_field: Union[int, str] + + optional_field: Optional[float] + + blank_dict_field: dict # type: ignore[type-arg] + parametrized_dict_field: Dict[str, int] + + str_enum_field: StrEnum + int_enum_field: IntEnum + # Both of these shouold coerce to str + mixed_enum_int_field: MixedEnum + mixed_enum_str_field: MixedEnum + + json_field: Json[List[str]] + + url_field: AnyHttpUrl + + any_field: Any + json_any_field: Json[Any] + + +class ModelWithConfig(Model): + model_config = ConfigDict(frozen=True, extra="allow") + + +TEST_MODEL_INSTANCE = Model( + bigint_field=1, text_field="text", timestamp_field=datetime.now(), + date_field=date.today(), decimal_field=Decimal(1.1), double_field=1.1, + time_field=time(1, 2, 3, 12345), + nested_field=NestedModel(nested_field="nested"), + list_field=["a", "b", "c"], + union_field=1, + optional_field=None, + blank_dict_field={}, + parametrized_dict_field={"a": 1, "b": 2, "c": 3}, + str_enum_field=StrEnum.a, + int_enum_field=IntEnum.a, + mixed_enum_int_field=MixedEnum.a_int, + mixed_enum_str_field=MixedEnum.b_str, + json_field=json.dumps(["a", "b", "c"]), # type: ignore[arg-type] + url_field="https://example.com", # type: ignore[arg-type] + any_field="any_string", + json_any_field=json.dumps("any_string"), +) + + +@pytest.mark.parametrize('instance', [True, False]) +def test_pydantic_model_to_columns(instance: bool) -> None: + if instance: + model = TEST_MODEL_INSTANCE + else: + model = Model # type: ignore[assignment] + + result = pydantic_to_table_schema_columns(model) + + assert result["bigint_field"]["data_type"] == "bigint" + assert result["text_field"]["data_type"] == "text" + assert result["timestamp_field"]["data_type"] == "timestamp" + assert result["date_field"]["data_type"] == "date" + assert result["decimal_field"]["data_type"] == "decimal" + assert result["double_field"]["data_type"] == "double" + assert result["time_field"]["data_type"] == "time" + assert result["nested_field"]["data_type"] == "complex" + assert result['list_field']['data_type'] == 'complex' + assert result['union_field']['data_type'] == 'bigint' + assert result['optional_field']['data_type'] == 'double' + assert result['optional_field']['nullable'] is True + assert result['blank_dict_field']['data_type'] == 'complex' + assert result['parametrized_dict_field']['data_type'] == 'complex' + 
assert result['str_enum_field']['data_type'] == 'text' + assert result['int_enum_field']['data_type'] == 'bigint' + assert result['mixed_enum_int_field']['data_type'] == 'text' + assert result['mixed_enum_str_field']['data_type'] == 'text' + assert result['json_field']['data_type'] == 'complex' + assert result['url_field']['data_type'] == 'text' + + # Any type fields are excluded from schema + assert 'any_field' not in result + assert 'json_any_field' not in result + + +def test_pydantic_model_skip_complex_types() -> None: + class SkipNestedModel(Model): + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + result = pydantic_to_table_schema_columns(SkipNestedModel) + + assert result["bigint_field"]["data_type"] == "bigint" + assert "nested_field" not in result + assert "list_field" not in result + assert "blank_dict_field" not in result + assert "parametrized_dict_field" not in result + assert "json_field" not in result + assert result["bigint_field"]["data_type"] == "bigint" + assert result["text_field"]["data_type"] == "text" + assert result["timestamp_field"]["data_type"] == "timestamp" + + +def test_model_for_column_mode() -> None: + # extra prop + instance_extra = TEST_MODEL_INSTANCE.dict() + instance_extra["extra_prop"] = "EXTRA" + # back to string + instance_extra["json_field"] = json.dumps(["a", "b", "c"]) + instance_extra["json_any_field"] = json.dumps("any_string") + + # evolve - allow extra fields + model_evolve = apply_schema_contract_to_model(ModelWithConfig, "evolve") + # assert "frozen" in model_evolve.model_config + extra_instance = model_evolve.parse_obj(instance_extra) + assert hasattr(extra_instance, "extra_prop") + assert extra_instance.extra_prop == "EXTRA" + model_evolve = apply_schema_contract_to_model(Model, "evolve") # type: ignore[arg-type] + extra_instance = model_evolve.parse_obj(instance_extra) + assert extra_instance.extra_prop == "EXTRA" # type: ignore[attr-defined] + + # freeze - validation error on extra fields + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "freeze") + # assert "frozen" in model_freeze.model_config + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + model_freeze = apply_schema_contract_to_model(Model, "freeze") # type: ignore[arg-type] + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + + # discard row - same as freeze + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "discard_row") + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + + # discard value - ignore extra fields + model_discard = apply_schema_contract_to_model(ModelWithConfig, "discard_value") + extra_instance = model_discard.parse_obj(instance_extra) + assert not hasattr(extra_instance, "extra_prop") + model_evolve = apply_schema_contract_to_model(Model, "evolve") # type: ignore[arg-type] + extra_instance = model_discard.parse_obj(instance_extra) + assert not hasattr(extra_instance, "extra_prop") + + # evolve data but freeze new columns + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "evolve", "freeze") + instance_extra_2 = copy(instance_extra) + # should parse ok + model_discard.parse_obj(instance_extra_2) + # this must fail validation + instance_extra_2["bigint_field"] = "NOT INT" + with pytest.raises(ValidationError): + 
model_discard.parse_obj(instance_extra_2) + # let the datatypes evolve + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "evolve", "evolve") + print(model_freeze.parse_obj(instance_extra_2).dict()) + + with pytest.raises(ValueError): + apply_schema_contract_to_model(ModelWithConfig, "evolve", "discard_value") + + +def test_items_validation() -> None: + + class ItemModel(BaseModel): + b: bool + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + + item = ItemModel(b=True) + print(ItemModel.dlt_config) + print(item.dlt_config) + + #ItemRootModel = RootModel(bool) + + list_model = create_list_model(ItemModel) + list_model = apply_schema_contract_to_model(list_model, "freeze", "discard_row") + + items = validate_items(list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "freeze", "discard_row") + assert len(items) == 2 + assert items[0].b is True + assert items[1].b is False \ No newline at end of file From c24b643e8f089ee546618fe649926ad4b6c30af0 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 16 Nov 2023 23:56:08 +0100 Subject: [PATCH 65/73] splits extractor into files, improves pydantic validator --- dlt/extract/exceptions.py | 8 - dlt/extract/extract.py | 278 +-------------------------- dlt/extract/extractors.py | 246 ++++++++++++++++++++++++ dlt/extract/incremental/transform.py | 25 +-- dlt/extract/schema.py | 59 +++--- dlt/extract/storage.py | 78 ++++++++ dlt/extract/validation.py | 58 ++++-- 7 files changed, 411 insertions(+), 341 deletions(-) create mode 100644 dlt/extract/extractors.py create mode 100644 dlt/extract/storage.py diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index e540a2468f..351b85a9d8 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -264,11 +264,3 @@ def __init__(self, source_name: str, schema_name: str) -> None: class IncrementalUnboundError(DltResourceException): def __init__(self, cursor_path: str) -> None: super().__init__("", f"The incremental definition with cursor path {cursor_path} is used without being bound to the resource. This most often happens when you create dynamic resource from a generator function that uses incremental. See https://dlthub.com/docs/general-usage/incremental-loading#incremental-loading-with-last-value for an example.") - - -class ValidationError(ValueError, DltException): - def __init__(self, validator: ValidateItem, data_item: TDataItems, original_exception: Exception) ->None: - self.original_exception = original_exception - self.validator = validator - self.data_item = data_item - super().__init__(f"Extracted data item could not be validated with {validator}. 
Original message: {original_exception}") diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 061fd98e66..1276f1b1f5 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -1,290 +1,24 @@ import contextlib -from copy import copy -import os -from typing import ClassVar, Set, Dict, Any, Optional, Set +from typing import Set, Dict, Optional, Set from dlt.common.configuration.container import Container from dlt.common.configuration.resolve import inject_section -from dlt.common.configuration.inject import with_config -from dlt.common.configuration.specs import ConfigSectionContext, BaseConfiguration, configspec, known_sections -from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.configuration.specs import ConfigSectionContext, known_sections from dlt.common.pipeline import reset_resource_state from dlt.common.data_writers import TLoaderFileFormat -from dlt.common.exceptions import MissingDependencyException from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.utils import uniq_id, update_dict_nested -from dlt.common.typing import TDataItems, TDataItem -from dlt.common.schema import Schema, utils -from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns, TPartialTableSchema -from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage +from dlt.common.schema import utils from dlt.extract.decorators import SourceSchemaInjectableContext from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints from dlt.extract.pipe import PipeIterator -from dlt.extract.source import DltResource, DltSource -from dlt.extract.typing import TableNameMeta -try: - from dlt.common.libs import pyarrow - from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem -except MissingDependencyException: - pyarrow = None - TAnyArrowItem = Any # type: ignore[misc] -try: - import pandas as pd -except ModuleNotFoundError: - pd = None +from dlt.extract.source import DltSource +from dlt.extract.storage import ExtractorStorage +from dlt.extract.extractors import JsonLExtractor, ArrowExtractor, Extractor -class ExtractorItemStorage(DataItemStorage): - load_file_type: TLoaderFileFormat - - def __init__(self, storage: FileStorage, extract_folder: str="extract") -> None: - # data item storage with jsonl with pua encoding - super().__init__(self.load_file_type) - self.extract_folder = extract_folder - self.storage = storage - - - def _get_data_item_path_template(self, load_id: str, schema_name: str, table_name: str) -> str: - template = NormalizeStorage.build_extracted_file_stem(schema_name, table_name, "%s") - return self.storage.make_full_path(os.path.join(self._get_extract_path(load_id), template)) - - def _get_extract_path(self, extract_id: str) -> str: - return os.path.join(self.extract_folder, extract_id) - - -class JsonLExtractorStorage(ExtractorItemStorage): - load_file_type: TLoaderFileFormat = "puae-jsonl" - - -class ArrowExtractorStorage(ExtractorItemStorage): - load_file_type: TLoaderFileFormat = "arrow" - - -class ExtractorStorage(NormalizeStorage): - EXTRACT_FOLDER: ClassVar[str] = "extract" - - """Wrapper around multiple extractor storages with different file formats""" - def __init__(self, C: NormalizeStorageConfiguration) -> None: - super().__init__(True, C) - self._item_storages: Dict[TLoaderFileFormat, ExtractorItemStorage] = { - "puae-jsonl": 
JsonLExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), - "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER) - } - - def _get_extract_path(self, extract_id: str) -> str: - return os.path.join(self.EXTRACT_FOLDER, extract_id) - - def create_extract_id(self) -> str: - extract_id = uniq_id() - self.storage.create_folder(self._get_extract_path(extract_id)) - return extract_id - - def get_storage(self, loader_file_format: TLoaderFileFormat) -> ExtractorItemStorage: - return self._item_storages[loader_file_format] - - def close_writers(self, extract_id: str) -> None: - for storage in self._item_storages.values(): - storage.close_writers(extract_id) - - def commit_extract_files(self, extract_id: str, with_delete: bool = True) -> None: - extract_path = self._get_extract_path(extract_id) - for file in self.storage.list_folder_files(extract_path, to_root=False): - from_file = os.path.join(extract_path, file) - to_file = os.path.join(NormalizeStorage.EXTRACTED_FOLDER, file) - if with_delete: - self.storage.atomic_rename(from_file, to_file) - else: - # create hardlink which will act as a copy - self.storage.link_hard(from_file, to_file) - if with_delete: - self.storage.delete_folder(extract_path, recursively=True) - - def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: - self.get_storage(file_format).write_data_item(load_id, schema_name, table_name, item, columns) - - - -class Extractor: - file_format: TLoaderFileFormat - - @configspec - class ExtractorConfiguration(BaseConfiguration): - _caps: Optional[DestinationCapabilitiesContext] = None - - @with_config(spec=ExtractorConfiguration) - def __init__( - self, - extract_id: str, - storage: ExtractorStorage, - schema: Schema, - resources_with_items: Set[str], - collector: Collector = NULL_COLLECTOR, - *, - _caps: DestinationCapabilitiesContext = None - ) -> None: - self.schema = schema - self.collector = collector - self.resources_with_items = resources_with_items - self.extract_id = extract_id - self._table_contracts: Dict[str, TSchemaContractDict] = {} - self._filtered_tables: Set[str] = set() - self._filtered_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] - self._storage = storage - self._caps = _caps or DestinationCapabilitiesContext.generic_capabilities() - - @property - def storage(self) -> ExtractorItemStorage: - return self._storage.get_storage(self.file_format) - - @staticmethod - def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: - """Detect the loader file format of the data items based on type. - Currently this is either 'arrow' or 'puae-jsonl' - - Returns: - The loader file format or `None` if if can't be detected. 
- """ - for item in items if isinstance(items, list) else [items]: - # Assume all items in list are the same type - if (pyarrow and pyarrow.is_arrow_item(item)) or (pd and isinstance(item, pd.DataFrame)): - return "arrow" - return "puae-jsonl" - return None # Empty list is unknown format - - def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: - """Write `items` to `resource` optionally computing table schemas and revalidating/filtering data""" - if isinstance(meta, TableNameMeta): - # write item belonging to table with static name - self._write_to_static_table(resource, meta.table_name, items) - else: - if resource._table_name_hint_fun: - # table has name or other hints depending on data items - self._write_to_dynamic_table(resource, items) - else: - # write item belonging to table with static name - self._write_to_static_table(resource, resource.table_name, items) # type: ignore[arg-type] - - def write_empty_file(self, table_name: str) -> None: - table_name = self.schema.naming.normalize_table_identifier(table_name) - self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) - - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - new_rows_count = self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) - self.collector.update(table_name, inc=new_rows_count) - self.resources_with_items.add(resource_name) - - def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> None: - if not isinstance(items, list): - items = [items] - for item in items: - table_name = self.schema.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) - if table_name in self._filtered_tables: - continue - if table_name not in self._table_contracts or resource._table_has_other_dynamic_hints: - self._compute_and_update_table(resource, table_name, item) - # write to storage with inferred table name - if table_name not in self._filtered_tables: - self._write_item(table_name, resource.name, item) - - def _write_to_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: - table_name = self.schema.naming.normalize_table_identifier(table_name) - if table_name not in self._table_contracts: - self._compute_and_update_table(resource, table_name, items) - if table_name not in self._filtered_tables: - self._write_item(table_name, resource.name, items) - - def _compute_table(self, resource: DltResource, data_item: TDataItem) -> TTableSchema: - """Computes a schema for a new or dynamic table and normalizes identifiers""" - return self.schema.normalize_table_identifiers( - resource.compute_table_schema(data_item) - ) - - def _compute_and_update_table(self, resource: DltResource, table_name: str, data_item: TDataItem) -> None: - """ - Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written - """ - computed_table = self._compute_table(resource, data_item) - # overwrite table name (if coming from meta) - computed_table["name"] = table_name - # get or compute contract - schema_contract = self._table_contracts.setdefault( - table_name, - self.schema.resolve_contract_settings_for_table(table_name) - ) - - # this is a new table so allow evolve once - if schema_contract["columns"] != "evolve" and self.schema.is_new_table(table_name): - computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: ignore[typeddict-unknown-key] - 
existing_table = self.schema._schema_tables.get(table_name, None) - if existing_table: - diff_table = utils.merge_tables(existing_table, computed_table) - else: - diff_table = computed_table - - # apply contracts - diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table) - - # merge with schema table - if diff_table: - self.schema.update_table(diff_table) - - # process filters - if filters: - for entity, name, mode in filters: - if entity == "tables": - self._filtered_tables.add(name) - elif entity == "columns": - filtered_columns = self._filtered_columns.setdefault(table_name, {}) - filtered_columns[name] = mode - - -class JsonLExtractor(Extractor): - file_format = "puae-jsonl" - - -class ArrowExtractor(Extractor): - file_format = "arrow" - - def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: - items = [ - # 3. remove columns and rows in data contract filters - # 2. Remove null-type columns from the table(s) as they can't be loaded - self._apply_contract_filters(pyarrow.remove_null_columns(tbl)) for tbl in ( - # 1. Convert pandas frame(s) to arrow Table - pa.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item - for item in (items if isinstance(items, list) else [items]) - ) - ] - super().write_items(resource, items, meta) - - def _apply_contract_filters(self, item: TAnyArrowItem) -> TAnyArrowItem: - # convert arrow schema names into normalized names - # find matching columns and delete by original name - return item - - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - columns = columns or self.schema.tables[table_name]["columns"] - # Note: `items` is always a list here due to the conversion in `write_table` - items = [pyarrow.normalize_py_arrow_schema(item, columns, self.schema.naming, self._caps) for item in items] - super()._write_item(table_name, resource_name, items, columns) - - def _compute_table(self, resource: DltResource, data_item: TDataItem) -> TPartialTableSchema: - data_item = data_item[0] - computed_table = super()._compute_table(resource, data_item) - - # Merge the columns to include primary_key and other hints that may be set on the resource - arrow_table = copy(computed_table) - arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(data_item.schema) - # normalize arrow table before merging - arrow_table = self.schema.normalize_table_identifiers(arrow_table) - # we must override the columns to preserve the order in arrow table - arrow_table["columns"] = update_dict_nested(arrow_table["columns"], computed_table["columns"]) - - return arrow_table - def extract( extract_id: str, source: DltSource, diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py new file mode 100644 index 0000000000..d1b636b6d3 --- /dev/null +++ b/dlt/extract/extractors.py @@ -0,0 +1,246 @@ +from copy import copy +from typing import Set, Dict, Any, Optional, Set + +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import BaseConfiguration, configspec +from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.data_writers import TLoaderFileFormat +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.runtime.collector import Collector, NULL_COLLECTOR +from dlt.common.utils import update_dict_nested +from dlt.common.typing import TDataItems, TDataItem +from dlt.common.schema import Schema, utils +from 
dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns, TPartialTableSchema + +from dlt.extract.source import DltResource +from dlt.extract.typing import TableNameMeta +from dlt.extract.storage import ExtractorStorage, ExtractorItemStorage +try: + from dlt.common.libs import pyarrow + from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem +except MissingDependencyException: + pyarrow = None + +try: + import pandas as pd +except ModuleNotFoundError: + pd = None + + +class Extractor: + file_format: TLoaderFileFormat + + @configspec + class ExtractorConfiguration(BaseConfiguration): + _caps: Optional[DestinationCapabilitiesContext] = None + + @with_config(spec=ExtractorConfiguration) + def __init__( + self, + extract_id: str, + storage: ExtractorStorage, + schema: Schema, + resources_with_items: Set[str], + collector: Collector = NULL_COLLECTOR, + *, + _caps: DestinationCapabilitiesContext = None + ) -> None: + self.schema = schema + self.naming = schema.naming + self.collector = collector + self.resources_with_items = resources_with_items + self.extract_id = extract_id + self._table_contracts: Dict[str, TSchemaContractDict] = {} + self._filtered_tables: Set[str] = set() + self._filtered_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] = {} + self._storage = storage + self._caps = _caps or DestinationCapabilitiesContext.generic_capabilities() + + @property + def storage(self) -> ExtractorItemStorage: + return self._storage.get_storage(self.file_format) + + @staticmethod + def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: + """Detect the loader file format of the data items based on type. + Currently this is either 'arrow' or 'puae-jsonl' + + Returns: + The loader file format or `None` if if can't be detected. 
+        """
+        for item in items if isinstance(items, list) else [items]:
+            # Assume all items in list are the same type
+            if (pyarrow and pyarrow.is_arrow_item(item)) or (pd and isinstance(item, pd.DataFrame)):
+                return "arrow"
+            return "puae-jsonl"
+        return None  # Empty list is unknown format
+
+    def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None:
+        """Write `items` to `resource`, optionally computing table schemas and revalidating/filtering the data"""
+        if table_name := self._get_static_table_name(resource, meta):
+            # write item belonging to table with static name
+            self._write_to_static_table(resource, table_name, items)
+        else:
+            # table has name or other hints depending on data items
+            self._write_to_dynamic_table(resource, items)
+
+    def write_empty_file(self, table_name: str) -> None:
+        table_name = self.naming.normalize_table_identifier(table_name)
+        self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None)
+
+    def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[str]:
+        if resource._table_name_hint_fun:
+            return None
+        if isinstance(meta, TableNameMeta):
+            table_name = meta.table_name
+        else:
+            table_name = resource.table_name  # type: ignore[assignment]
+        return self.naming.normalize_table_identifier(table_name)
+
+    def _get_dynamic_table_name(self, resource: DltResource, item: TDataItem) -> str:
+        return self.naming.normalize_table_identifier(resource._table_name_hint_fun(item))
+
+    def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None:
+        new_rows_count = self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns)
+        self.collector.update(table_name, inc=new_rows_count)
+        self.resources_with_items.add(resource_name)
+
+    def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> None:
+        if not isinstance(items, list):
+            items = [items]
+
+        for item in items:
+            table_name = self._get_dynamic_table_name(resource, item)
+            if table_name in self._filtered_tables:
+                continue
+            if table_name not in self._table_contracts or resource._table_has_other_dynamic_hints:
+                item = self._compute_and_update_table(resource, table_name, item)
+            # write to storage with inferred table name
+            if table_name not in self._filtered_tables:
+                self._write_item(table_name, resource.name, item)
+
+    def _write_to_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None:
+        if table_name not in self._table_contracts:
+            items = self._compute_and_update_table(resource, table_name, items)
+        if table_name not in self._filtered_tables:
+            self._write_item(table_name, resource.name, items)
+
+    def _compute_table(self, resource: DltResource, items: TDataItems) -> TTableSchema:
+        """Computes a schema for a new or dynamic table and normalizes identifiers"""
+        return self.schema.normalize_table_identifiers(
+            resource.compute_table_schema(items)
+        )
+
+    def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems:
+        """
+        Computes a new table schema and applies schema contract checks. Tables or columns rejected by the
+        contract are added to the filters so that the table is not created and/or no items are written for it.
+        """
+        computed_table = self._compute_table(resource, items)
+        # overwrite table name (if coming from meta)
+        computed_table["name"] = table_name
+        # get or compute contract
+        schema_contract = self._table_contracts.setdefault(
+            table_name,
+            self.schema.resolve_contract_settings_for_table(table_name, 
computed_table) + ) + + # this is a new table so allow evolve once + if schema_contract["columns"] != "evolve" and self.schema.is_new_table(table_name): + computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: ignore[typeddict-unknown-key] + existing_table = self.schema._schema_tables.get(table_name, None) + if existing_table: + diff_table = utils.diff_tables(existing_table, computed_table) + else: + diff_table = computed_table + + # apply contracts + diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table) + + # merge with schema table + if diff_table: + self.schema.update_table(diff_table) + + # process filters + if filters: + for entity, name, mode in filters: + if entity == "tables": + self._filtered_tables.add(name) + elif entity == "columns": + filtered_columns = self._filtered_columns.setdefault(table_name, {}) + filtered_columns[name] = mode + return items + + +class JsonLExtractor(Extractor): + file_format = "puae-jsonl" + + +class ArrowExtractor(Extractor): + file_format = "arrow" + + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + static_table_name = self._get_static_table_name(resource, meta) + items = [ + # 3. remove columns and rows in data contract filters + # 2. Remove null-type columns from the table(s) as they can't be loaded + self._apply_contract_filters(pyarrow.remove_null_columns(tbl), resource, static_table_name) for tbl in ( + # 1. Convert pandas frame(s) to arrow Table + pa.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item + for item in (items if isinstance(items, list) else [items]) + ) + ] + super().write_items(resource, items, meta) + + def _apply_contract_filters(self, item: "TAnyArrowItem", resource: DltResource, static_table_name: Optional[str]) -> "TAnyArrowItem": + """Removes the columns (discard value) or rows (discard rows) as indicated by contract filters.""" + # convert arrow schema names into normalized names + rename_mapping = pyarrow.get_normalized_arrow_fields_mapping(item, self.naming) + # find matching columns and delete by original name + table_name = static_table_name or self._get_dynamic_table_name(resource, item) + filtered_columns = self._filtered_columns.get(table_name) + if filtered_columns: + # remove rows where columns have non null values + # create a mask where rows will be False if any of the specified columns are non-null + mask = None + rev_mapping = {v: k for k, v in rename_mapping.items()} + for column in [name for name, mode in filtered_columns.items() if mode == "discard_row"]: + is_null = pyarrow.pyarrow.compute.is_null(item[rev_mapping[column]]) + mask = is_null if mask is None else pyarrow.pyarrow.compute.and_(mask, is_null) + # filter the table using the mask + if mask is not None: + item = item.filter(mask) + + # remove value actually removes the whole columns from the table + # NOTE: filtered columns has normalized column names so we need to go through mapping + removed_columns = [name for name in rename_mapping if filtered_columns.get(rename_mapping[name]) is not None] + if removed_columns: + item = pyarrow.remove_columns(item, removed_columns) + + return item + + def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: + columns = columns or self.schema.tables[table_name]["columns"] + # Note: `items` is always a list here due to the conversion in `write_table` + items = [pyarrow.normalize_py_arrow_schema(item, columns, self.naming, self._caps) for 
item in items] + super()._write_item(table_name, resource_name, items, columns) + + def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTableSchema: + items = items[0] + computed_table = super()._compute_table(resource, items) + + # Merge the columns to include primary_key and other hints that may be set on the resource + arrow_table = copy(computed_table) + arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(items.schema) + # normalize arrow table before merging + arrow_table = self.schema.normalize_table_identifiers(arrow_table) + # we must override the columns to preserve the order in arrow table + arrow_table["columns"] = update_dict_nested(arrow_table["columns"], computed_table["columns"]) + + return arrow_table + + def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems: + items = super()._compute_and_update_table(resource, table_name, items) + # filter data item as filters could be updated in compute table + items = [self._apply_contract_filters(item, resource, table_name) for item in items] + return items diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index af45736da4..44538aa3f5 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -23,9 +23,11 @@ from dlt.extract.typing import TTableHintTemplate from dlt.common.schema.typing import TColumnNames try: + from dlt.common.libs import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem except MissingDependencyException: pa = None + pyarrow = None class IncrementalTransform: @@ -182,24 +184,7 @@ def _deduplicate(self, tbl: "pa.Table", unique_columns: Optional[List[str]], agg """Creates unique index if necessary.""" # create unique index if necessary if self._dlt_index not in tbl.schema.names: - tbl = tbl.append_column(self._dlt_index, pa.array(np.arange(tbl.num_rows))) - # code below deduplicates groups that include the cursor column in the group id. 
that was just artifact of - # json incremental and there's no need to duplicate it here - - # if unique_columns is None: - # return tbl - # group_cols = unique_columns + [cursor_path] - # try: - # tbl = tbl.filter( - # pa.compute.is_in( - # tbl[self._dlt_index], - # tbl.group_by(group_cols).aggregate( - # [(self._dlt_index, "one"), (cursor_path, aggregate)] - # )[f'{self._dlt_index}_one'] - # ) - # ) - # except KeyError as e: - # raise IncrementalPrimaryKeyMissing(self.resource_name, unique_columns[0], tbl) from e + tbl = pyarrow.append_column(tbl, self._dlt_index, pa.array(np.arange(tbl.num_rows))) return tbl def __call__( @@ -225,7 +210,7 @@ def __call__( if isinstance(primary_key, str): self._dlt_index = primary_key elif primary_key is None: - unique_columns = tbl.column_names + unique_columns = tbl.schema.names else: # deduplicating is disabled unique_columns = None @@ -312,7 +297,7 @@ def __call__( if len(tbl) == 0: return None, start_out_of_range, end_out_of_range try: - tbl = tbl.drop(["_dlt_index"]) + tbl = pyarrow.remove_columns(tbl, ["_dlt_index"]) except KeyError: pass if is_pandas: diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index 8000cddbdb..675c6b0f47 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -1,5 +1,4 @@ from copy import copy, deepcopy -from collections.abc import Mapping as C_Mapping from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table @@ -12,7 +11,7 @@ from dlt.extract.typing import TFunHintTemplate, TTableHintTemplate, ValidateItem from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, TableNameMissing from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint -from dlt.extract.validation import get_column_validator +from dlt.extract.validation import create_item_validator class TTableSchemaTemplate(TypedDict, total=False): @@ -34,6 +33,7 @@ def __init__(self, table_schema_template: TTableSchemaTemplate = None): self._table_name_hint_fun: TFunHintTemplate[str] = None self._table_has_other_dynamic_hints: bool = False self._table_schema_template: TTableSchemaTemplate = None + self._original_columns: TTableHintTemplate[TAnySchemaColumns] = None if table_schema_template: self.set_template(table_schema_template) @@ -118,13 +118,17 @@ def apply_hints( Please note that for efficient incremental loading, the resource must be aware of the Incremental by accepting it as one if its arguments and then using is to skip already loaded data. In non-aware resources, `dlt` will filter out the loaded values, however the resource will yield all the values again. 
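        When `columns` or `schema_contract` is passed, the item validator is recreated (e.g. for Pydantic models) and the resolved schema contract is stored on the table template.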
""" + # keep original columns: ie in case it is a Pydantic model + if columns is not None: + self._original_columns = columns + t = None if not self._table_schema_template: # if there's no template yet, create and set new one t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract) else: # set single hints - t = deepcopy(self._table_schema_template) + t = self._clone_table_template(self._table_schema_template) if table_name is not None: if table_name: t["name"] = table_name @@ -138,7 +142,6 @@ def apply_hints( if write_disposition: t["write_disposition"] = write_disposition if columns is not None: - t['validator'] = get_column_validator(columns) # if callable then override existing if callable(columns) or callable(t["columns"]): t["columns"] = ensure_table_schema_columns_hint(columns) @@ -161,7 +164,15 @@ def apply_hints( else: t.pop("merge_key", None) if schema_contract is not None: - t["schema_contract"] = schema_contract + if schema_contract: + t["schema_contract"] = schema_contract + else: + t.pop("schema_contract", None) + # recreate validator if columns definition or contract changed + if schema_contract is not None or columns is not None: + t["validator"], schema_contract = create_item_validator(self._original_columns, t.get("schema_contract")) + if schema_contract is not None: + t["schema_contract"] = schema_contract # set properties that cannot be passed to new_table_template if incremental is not None: @@ -183,13 +194,21 @@ def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: self._table_has_other_dynamic_hints = any(callable(v) for k, v in table_schema_template.items() if k != "name") self._table_schema_template = table_schema_template + @staticmethod + def _clone_table_template(template: TTableSchemaTemplate) -> TTableSchemaTemplate: + t_ = copy(template) + t_["columns"] = deepcopy(template["columns"]) + if "schema_contract" in template: + t_["schema_contract"] = deepcopy(template["schema_contract"]) + return t_ + @staticmethod def _resolve_hint(item: TDataItem, hint: TTableHintTemplate[Any]) -> Any: - """Calls each dynamic hint passing a data item""" - if callable(hint): - return hint(item) - else: - return hint + """Calls each dynamic hint passing a data item""" + if callable(hint): + return hint(item) + else: + return hint @staticmethod def _merge_key(hint: TColumnProp, keys: TColumnNames, partial: TPartialTableSchema) -> None: @@ -225,24 +244,20 @@ def new_table_template( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None - ) -> TTableSchemaTemplate: + ) -> TTableSchemaTemplate: + validator, schema_contract = create_item_validator(columns, schema_contract) if columns is not None: - validator = get_column_validator(columns) columns = ensure_table_schema_columns_hint(columns) if not callable(columns): columns = columns.values() # type: ignore - else: - validator = None - - # freeze the columns if we have a fully defined table and no other explicit contract - if not schema_contract and validator: - schema_contract = { - "columns": "freeze" - } # create a table schema template where hints can be functions taking TDataItem new_template: TTableSchemaTemplate = new_table( - table_name, parent_table_name, write_disposition=write_disposition, columns=columns, schema_contract=schema_contract, table_format=table_format # type: ignore - + table_name, # type: ignore + 
parent_table_name, # type: ignore + write_disposition=write_disposition, # type: ignore + columns=columns, # type: ignore + schema_contract=schema_contract, # type: ignore + table_format=table_format # type: ignore ) if not table_name: new_template.pop("name") diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py new file mode 100644 index 0000000000..ddda064aa4 --- /dev/null +++ b/dlt/extract/storage.py @@ -0,0 +1,78 @@ +import os +from typing import ClassVar, Dict + +from dlt.common.data_writers import TLoaderFileFormat + +from dlt.common.utils import uniq_id +from dlt.common.typing import TDataItems +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage + + +class ExtractorItemStorage(DataItemStorage): + load_file_type: TLoaderFileFormat + + def __init__(self, storage: FileStorage, extract_folder: str="extract") -> None: + # data item storage with jsonl with pua encoding + super().__init__(self.load_file_type) + self.extract_folder = extract_folder + self.storage = storage + + + def _get_data_item_path_template(self, load_id: str, schema_name: str, table_name: str) -> str: + template = NormalizeStorage.build_extracted_file_stem(schema_name, table_name, "%s") + return self.storage.make_full_path(os.path.join(self._get_extract_path(load_id), template)) + + def _get_extract_path(self, extract_id: str) -> str: + return os.path.join(self.extract_folder, extract_id) + + +class JsonLExtractorStorage(ExtractorItemStorage): + load_file_type: TLoaderFileFormat = "puae-jsonl" + + +class ArrowExtractorStorage(ExtractorItemStorage): + load_file_type: TLoaderFileFormat = "arrow" + + +class ExtractorStorage(NormalizeStorage): + EXTRACT_FOLDER: ClassVar[str] = "extract" + + """Wrapper around multiple extractor storages with different file formats""" + def __init__(self, C: NormalizeStorageConfiguration) -> None: + super().__init__(True, C) + self._item_storages: Dict[TLoaderFileFormat, ExtractorItemStorage] = { + "puae-jsonl": JsonLExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), + "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER) + } + + def _get_extract_path(self, extract_id: str) -> str: + return os.path.join(self.EXTRACT_FOLDER, extract_id) + + def create_extract_id(self) -> str: + extract_id = uniq_id() + self.storage.create_folder(self._get_extract_path(extract_id)) + return extract_id + + def get_storage(self, loader_file_format: TLoaderFileFormat) -> ExtractorItemStorage: + return self._item_storages[loader_file_format] + + def close_writers(self, extract_id: str) -> None: + for storage in self._item_storages.values(): + storage.close_writers(extract_id) + + def commit_extract_files(self, extract_id: str, with_delete: bool = True) -> None: + extract_path = self._get_extract_path(extract_id) + for file in self.storage.list_folder_files(extract_path, to_root=False): + from_file = os.path.join(extract_path, file) + to_file = os.path.join(NormalizeStorage.EXTRACTED_FOLDER, file) + if with_delete: + self.storage.atomic_rename(from_file, to_file) + else: + # create hardlink which will act as a copy + self.storage.link_hard(from_file, to_file) + if with_delete: + self.storage.delete_folder(extract_path, recursively=True) + + def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: + 
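"""Writes `item` belonging to `table_name` in schema `schema_name` to the item storage matching `file_format`"""
+        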
self.get_storage(file_format).write_data_item(load_id, schema_name, table_name, item, columns) diff --git a/dlt/extract/validation.py b/dlt/extract/validation.py index c8e30d0eb2..e345904337 100644 --- a/dlt/extract/validation.py +++ b/dlt/extract/validation.py @@ -1,13 +1,14 @@ -from typing import Optional, Protocol, TypeVar, Generic, Type, Union, Any, List +from typing import Optional, Tuple, TypeVar, Generic, Type, Union, Any, List +from dlt.common.schema.schema import Schema try: from pydantic import BaseModel as PydanticBaseModel, ValidationError as PydanticValidationError, create_model except ModuleNotFoundError: - PydanticBaseModel = None # type: ignore[misc] + PydanticBaseModel = Any # type: ignore[misc, assignment] -from dlt.extract.exceptions import ValidationError +# from dlt.extract.exceptions import ValidationError from dlt.common.typing import TDataItems -from dlt.common.schema.typing import TAnySchemaColumns +from dlt.common.schema.typing import TAnySchemaColumns, TSchemaContract, TSchemaEvolutionMode from dlt.extract.typing import TTableHintTemplate, ValidateItem @@ -16,31 +17,50 @@ class PydanticValidator(ValidateItem, Generic[_TPydanticModel]): model: Type[_TPydanticModel] - def __init__(self, model: Type[_TPydanticModel]) -> None: - self.model = model - # Create a model for validating list of items in batch - self.list_model = create_model( - "List" + model.__name__, - items=(List[model], ...) # type: ignore[valid-type] + def __init__(self, model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMode, data_mode: TSchemaEvolutionMode) -> None: + from dlt.common.libs.pydantic import apply_schema_contract_to_model, create_list_model + + self.column_mode: TSchemaEvolutionMode = column_mode + self.data_mode: TSchemaEvolutionMode = data_mode + self.list_model = apply_schema_contract_to_model( + create_list_model(model, data_mode), + column_mode, + data_mode ) + self.model = apply_schema_contract_to_model(model, column_mode, data_mode) def __call__(self, item: TDataItems, meta: Any = None) -> Union[_TPydanticModel, List[_TPydanticModel]]: """Validate a data item against the pydantic model""" if item is None: return None - try: - if isinstance(item, list): - return self.list_model(items=item).items # type: ignore[attr-defined, no-any-return] - return self.model.parse_obj(item) - except PydanticValidationError as e: - raise ValidationError(self, item, e) from e + + from dlt.common.libs.pydantic import validate_item, validate_items + + if isinstance(item, list): + return validate_items(self.list_model, item, self.column_mode, self.data_mode) + return validate_item(self.model, item, self.column_mode, self.data_mode) def __str__(self, *args: Any, **kwargs: Any) -> str: return f"PydanticValidator(model={self.model.__qualname__})" -def get_column_validator(columns: TTableHintTemplate[TAnySchemaColumns]) -> Optional[ValidateItem]: +def create_item_validator( + columns: TTableHintTemplate[TAnySchemaColumns], + schema_contract: TTableHintTemplate[TSchemaContract] = None +) -> Tuple[Optional[ValidateItem], TTableHintTemplate[TSchemaContract]]: + """Creates item validator for a `columns` definition and a `schema_contract` + + Returns a tuple (validator, schema contract). If validator could not be created, returns None at first position. 
+ If schema_contract was not specified a default schema contract for given validator will be returned + """ if PydanticBaseModel is not None and isinstance(columns, type) and issubclass(columns, PydanticBaseModel): - return PydanticValidator(columns) - return None + assert not callable(schema_contract), "schema_contract cannot be dynamic for Pydantic item validator" + if schema_contract is not None: + expanded_schema_contract = Schema.expand_schema_contract_settings(schema_contract) + else: + # freeze the columns if we have a fully defined table and no other explicit contract + expanded_schema_contract = {"tables": "evolve", "columns": "freeze", "data_type": "freeze"} + + return PydanticValidator(columns, expanded_schema_contract["columns"], expanded_schema_contract["data_type"]), schema_contract or expanded_schema_contract + return None, schema_contract From 2b97a4f848d18acd79a889e4e43eb3d2ccd601f7 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Thu, 16 Nov 2023 23:56:58 +0100 Subject: [PATCH 66/73] runs tests on ci with minimal dependencies --- .github/workflows/lint.yml | 2 +- .github/workflows/test_airflow.yml | 2 +- .github/workflows/test_common.yml | 78 +++-- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- Makefile | 2 +- dlt/common/libs/pyarrow.py | 11 + dlt/common/schema/schema.py | 22 +- dlt/common/schema/utils.py | 2 +- dlt/pipeline/pipeline.py | 2 +- docs/website/docs/general-usage/resource.md | 16 +- poetry.lock | 308 ++++++++++++------ pyproject.toml | 21 +- pytest.ini | 3 +- .../data_writers/test_buffered_writer.py | 62 +--- tests/common/data_writers/utils.py | 17 + tests/common/storages/utils.py | 15 +- tests/common/test_pydantic.py | 134 -------- tests/conftest.py | 2 +- tests/extract/test_incremental.py | 157 ++++----- tests/extract/test_sources.py | 7 +- tests/extract/test_validation.py | 35 +- tests/extract/utils.py | 30 +- tests/libs/__init__.py | 0 tests/libs/test_buffered_writer_arrow,py | 50 +++ .../test_parquet_writer.py | 0 tests/{common => libs}/test_pyarrow.py | 0 tests/load/pipeline/test_arrow_loading.py | 2 +- tests/pipeline/test_pipeline.py | 122 +------ tests/pipeline/test_pipeline_extra.py | 105 ++++++ tests/pipeline/test_schema_contracts.py | 120 ++++--- tests/pipeline/utils.py | 22 +- tests/utils.py | 33 +- 33 files changed, 743 insertions(+), 643 deletions(-) create mode 100644 tests/common/data_writers/utils.py delete mode 100644 tests/common/test_pydantic.py create mode 100644 tests/libs/__init__.py create mode 100644 tests/libs/test_buffered_writer_arrow,py rename tests/{common/data_writers => libs}/test_parquet_writer.py (100%) rename tests/{common => libs}/test_pyarrow.py (100%) create mode 100644 tests/pipeline/test_pipeline_extra.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b597d49e6b..c3e546ecbd 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -45,7 +45,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --all-extras --with airflow + run: poetry install --no-interaction --all-extras --with airflow --with pipeline --with docs --with providers # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_airflow.yml b/.github/workflows/test_airflow.yml index d78a48e8f7..f1806321b3 100644 --- a/.github/workflows/test_airflow.yml +++ b/.github/workflows/test_airflow.yml @@ -41,7 +41,7 @@ jobs: key: venv-${{ 
runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-airflow-runner - name: Install dependencies - run: poetry install --no-interaction --with airflow -E duckdb -E parquet + run: poetry install --no-interaction --with airflow --with pipeline -E duckdb -E parquet - run: | poetry run pytest tests/helpers/airflow_tests diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 23b6eb9fdd..417a184ae7 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -55,40 +55,80 @@ jobs: virtualenvs-in-project: true installer-parallel: true - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v3 - with: - # path: ${{ steps.pip-cache.outputs.dir }} - path: .venv - key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} + # NOTE: do not cache. we want to have a clean state each run and we upgrade depdendencies later + # - name: Load cached venv + # id: cached-poetry-dependencies + # uses: actions/cache@v3 + # with: + # # path: ${{ steps.pip-cache.outputs.dir }} + # path: .venv + # key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} - - name: Install dependencies + sentry - run: poetry install --no-interaction -E parquet -E pydantic && pip install sentry-sdk + - name: Install dependencies + run: poetry install --no-interaction --with sentry-sdk - run: | - poetry run pytest tests/common tests/normalize tests/reflection tests/sources + poetry run pytest tests/common tests/normalize tests/reflection tests/sources tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py if: runner.os != 'Windows' - name: Run tests Linux/MAC + name: Run common tests with minimum dependencies Linux/MAC - run: | - poetry run pytest tests/common tests/normalize tests/reflection tests/sources -m "not forked" + poetry run pytest tests/common tests/normalize tests/reflection tests/sources tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py -m "not forked" if: runner.os == 'Windows' - name: Run tests Windows + name: Run common tests with minimum dependencies Windows shell: cmd - - name: Install extra dependencies - run: poetry install --no-interaction -E duckdb -E cli -E parquet -E pydantic + - name: Install duckdb dependencies + run: poetry install --no-interaction -E duckdb --with sentry-sdk - run: | - poetry run pytest tests/extract tests/pipeline tests/cli/common + poetry run pytest tests/pipeline/test_pipeline.py if: runner.os != 'Windows' - name: Run extra tests Linux/MAC + name: Run pipeline smoke tests with minimum deps Linux/MAC - run: | - poetry run pytest tests/extract tests/pipeline tests/cli/common + poetry run pytest tests/pipeline/test_pipeline.py if: runner.os == 'Windows' - name: Run extra tests Windows + name: Run smoke tests with minimum deps Windows shell: cmd + - name: Install pipeline dependencies + run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk --with pipeline + + - run: | + poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common + if: runner.os != 'Windows' + name: Run extract and pipeline tests Linux/MAC + - run: | + poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common + if: runner.os == 'Windows' + name: Run extract tests Windows + shell: cmd + + - name: Install Pydantic 1.0 + 
run: pip install "pydantic<2" + + - run: | + poetry run pytest tests/libs + if: runner.os != 'Windows' + name: Run extract and pipeline tests Linux/MAC + - run: | + poetry run pytest tests/libs + if: runner.os == 'Windows' + name: Run extract tests Windows + shell: cmd + + # - name: Install pipeline dependencies + # run: poetry install --no-interaction -E duckdb -E parquet --with pipeline + + # - run: | + # poetry run pytest tests/pipeline tests/libs + # if: runner.os != 'Windows' + # name: Run extra tests Linux/MAC + # - run: | + # poetry run pytest tests/pipeline tests/libs + # if: runner.os == 'Windows' + # name: Run extra tests Windows + # shell: cmd + matrix_job_required_check: name: Common tests needs: run_common diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index c7f34a0b20..64b8031e5e 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -87,7 +87,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E pydantic + run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 5c32743125..65d0af1139 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -84,7 +84,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E pydantic + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with pipeline - run: poetry run pytest tests/load && poetry run pytest tests/cli name: Run tests Linux diff --git a/Makefile b/Makefile index bd522c9ba3..c4098050f4 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with airflow --with docs --with providers + poetry install --all-extras --with airflow --with docs --with providers --with pipeline --with sentry-sdk lint: ./check-package.sh diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index f585971ee8..585bee0d2f 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -160,6 +160,17 @@ def remove_columns(item: TAnyArrowItem, columns: Sequence[str]) -> TAnyArrowItem raise ValueError(item) +def append_column(item: TAnyArrowItem, name: str, data: Any) -> TAnyArrowItem: + """Appends new column to Table or RecordBatch""" + if isinstance(item, pyarrow.Table): + return item.append_column(name, data) + elif isinstance(item, pyarrow.RecordBatch): + new_field = pyarrow.field(name, data.type) + return pyarrow.RecordBatch.from_arrays(item.columns + [data], schema=item.schema.append(new_field)) + else: + raise ValueError(item) + + def rename_columns(item: TAnyArrowItem, new_column_names: Sequence[str]) -> TAnyArrowItem: """Rename arrow columns on Table or RecordBatch, returns same data but with renamed schema""" diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index c6093333b5..df7424e563 100644 --- a/dlt/common/schema/schema.py +++ 
b/dlt/common/schema/schema.py @@ -284,16 +284,22 @@ def expand_schema_contract_settings(settings: TSchemaContract) -> TSchemaContrac settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) return cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings}) - def resolve_contract_settings_for_table(self, table_name: str) -> TSchemaContractDict: - """Resolve the exact applicable schema contract settings for the table `table_name`.""" + def resolve_contract_settings_for_table(self, table_name: str, new_table_schema: TTableSchema = None) -> TSchemaContractDict: + """Resolve the exact applicable schema contract settings for the table `table_name`. `new_table_schema` is added to the tree during the resolution.""" settings: TSchemaContract = {} - # find root table - try: - table = utils.get_top_level_table(self._schema_tables, table_name) - settings = table["schema_contract"] - except KeyError: - settings = self._settings.get("schema_contract", {}) + if not table_name.startswith(self._dlt_tables_prefix): + if new_table_schema: + tables = copy(self._schema_tables) + tables[table_name] = new_table_schema + else: + tables = self._schema_tables + # find root table + try: + table = utils.get_top_level_table(tables, table_name) + settings = table["schema_contract"] + except KeyError: + settings = self._settings.get("schema_contract", {}) # expand settings, empty settings will expand into default settings return Schema.expand_schema_contract_settings(settings) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 38a857144d..9b4e8fb047 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -674,7 +674,7 @@ def new_table( # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name - if schema_contract: + if schema_contract is not None: table["schema_contract"] = schema_contract if table_format: table["table_format"] = table_format diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index e1530e4242..bfb94b5d92 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -810,7 +810,7 @@ def apply_hint_args(resource: DltResource) -> None: columns, primary_key, schema_contract=schema_contract - ) + ) def apply_settings(source_: DltSource) -> None: # apply schema contract settings diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index e203b3d93a..a7f68fadd1 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -110,21 +110,27 @@ Things to note: - Fields with an `Optional` type are marked as `nullable` - Fields with a `Union` type are converted to the first (not `None`) type listed in the union. E.g. `status: Union[int, str]` results in a `bigint` column. -- `list`, `dict` and nested pydantic model fields will use the `complex` type which means they'll be stored as a JSON object in the database instead of creating child tables. You can override this by manually calling the pydantic helper with `skip_complex_types=True`, see below: +- `list`, `dict` and nested pydantic model fields will use the `complex` type which means they'll be stored as a JSON object in the database instead of creating child tables. 
+ +You can override this by configuring the Pydantic model ```python -from dlt.common.lib.pydantic import pydantic_to_table_schema_columns +from typing import ClassVar +from dlt.common.libs.pydantic import DltConfig -... +class UserWithNesting(User): + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} -@dlt.resource(name="user", columns=pydantic_to_table_schema_columns(User, skip_complex_types=True)) +@dlt.resource(name="user", columns=UserWithNesting) def get_users(): ... ``` -This omits any `dict`/`list`/`BaseModel` type fields from the schema, so dlt will fall back on the default +`"skip_complex_types"` omits any `dict`/`list`/`BaseModel` type fields from the schema, so dlt will fall back on the default behaviour of creating child tables for these fields. +We do not support `RootModel` that validate simple types. You can add such validator yourself, see [data filtering section](#filter-transform-and-pivot-data). + ### Dispatch data to many tables You can load data to many tables from a single resource. The most common case is a stream of events diff --git a/poetry.lock b/poetry.lock index e5caba2f07..fca923cfee 100644 --- a/poetry.lock +++ b/poetry.lock @@ -136,6 +136,17 @@ python-versions = ">=3.7, <4" about-time = "4.2.1" grapheme = "0.6.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} + [[package]] name = "ansicon" version = "1.89.0" @@ -164,7 +175,7 @@ trio = ["trio (>=0.22)"] [[package]] name = "apache-airflow" -version = "2.7.0" +version = "2.7.2" description = "Programmatically author, schedule and monitor data pipelines" category = "dev" optional = false @@ -191,7 +202,7 @@ cryptography = ">=0.9.3" deprecated = ">=1.2.13" dill = ">=0.2.2" flask = ">=2.2,<2.3" -flask-appbuilder = "4.3.3" +flask-appbuilder = "4.3.6" flask-caching = ">=1.5.0" flask-login = ">=0.6.2" flask-session = ">=0.4.0" @@ -200,7 +211,7 @@ google-re2 = ">=1.0" graphviz = ">=0.12" gunicorn = ">=20.1.0" httpx = "*" -importlib-metadata = {version = ">=1.7,<5.0.0", markers = "python_version < \"3.9\""} +importlib-metadata = {version = ">=1.7", markers = "python_version < \"3.9\""} importlib-resources = {version = ">=5.2", markers = "python_version < \"3.9\""} itsdangerous = ">=2.0" jinja2 = ">=3.0.0" @@ -213,14 +224,14 @@ markdown-it-py = ">=2.1.0" markupsafe = ">=1.1.1" marshmallow-oneofschema = ">=2.0.1" mdit-py-plugins = ">=0.3.0" -opentelemetry-api = "1.15.0" +opentelemetry-api = ">=1.15.0" opentelemetry-exporter-otlp = "*" packaging = ">=14.0" pathspec = ">=0.9.0" pendulum = ">=2.0" pluggy = ">=1.0" psutil = ">=4.2.0" -pydantic = ">=1.10.0,<2.0.0" +pydantic = ">=1.10.0" pygments = ">=2.0.1" pyjwt = ">=2.0.0" python-daemon = ">=3.0.0" @@ -231,7 +242,7 @@ rfc3339-validator = ">=0.1.4" rich = ">=12.4.4" rich-argparse = ">=1.0.0" setproctitle = ">=1.1.8" -sqlalchemy = ">=1.4,<2.0" +sqlalchemy = ">=1.4.28,<2.0" sqlalchemy-jsonfield = ">=1.0" tabulate = ">=0.7.5" tenacity = ">=6.2.0,<8.2.0 || >8.2.0" @@ -244,8 +255,8 @@ werkzeug = ">=2.0" aiobotocore = ["aiobotocore (>=2.1.1)"] airbyte = ["apache-airflow-providers-airbyte"] alibaba = ["apache-airflow-providers-alibaba"] -all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi 
(>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data 
(>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.24.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>7,<7.15.0)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", 
"pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "sasl (>=0.3.1)", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "zenpy (>=2.0.24)"] -all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.5.0)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.2)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive] (>=0.6.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sasl (>=0.3.1)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] +all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", 
"apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", 
"cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>8,<9)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", 
"spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "zenpy (>=2.0.24)"] +all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.5.0)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.2)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] amazon = ["apache-airflow-providers-amazon"] apache-atlas = ["atlasclient (>=0.1.2)"] apache-beam = ["apache-airflow-providers-apache-beam"] @@ -273,7 +284,7 @@ atlassian-jira = ["apache-airflow-providers-atlassian-jira"] aws = ["apache-airflow-providers-amazon"] azure = ["apache-airflow-providers-microsoft-azure"] cassandra = ["apache-airflow-providers-apache-cassandra"] -celery = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-celery", "celery (>=5.2.3,<6)", "flower (>=1.0.0)"] +celery = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-celery", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "flower (>=1.0.0)"] cgroups = ["cgroupspy (>=0.2.2)"] cloudant = ["apache-airflow-providers-cloudant"] cncf-kubernetes = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-cncf-kubernetes", "asgiref (>=3.5.2)", "cryptography (>=2.0.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)"] @@ -284,13 +295,13 @@ databricks = ["apache-airflow-providers-databricks"] datadog = ["apache-airflow-providers-datadog"] dbt-cloud = ["apache-airflow-providers-dbt-cloud"] deprecated-api = ["requests (>=2.26.0)"] -devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3)", "aws-xray-sdk", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "bowler", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt 
(>=0.7.1)", "gitpython", "ipdb", "jira", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "paramiko", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pygithub", "pypsrp", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-jose", "pywinrm", "qds-sdk (>=1.9.6)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] -devel-all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", 
"apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.24.0)", "bowler", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>7,<7.15.0)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", 
"google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "jira", "json-merge-patch (>=0.2)", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-jose", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "qds-sdk (>=1.9.6)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", 
"types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", 
"apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.24.0)", "bowler", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>7,<7.15.0)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", 
"httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "jira", "json-merge-patch (>=0.2)", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-jose", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "qds-sdk (>=1.9.6)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3)", "aws-xray-sdk", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", 
"bowler", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "jira", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "paramiko", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.2)", "pyarrow (>=9.0.0)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pypsrp", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-jose", "pywinrm", "qds-sdk (>=1.9.6)", "requests-kerberos (>=0.10.0)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "ipdb", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "openapi-spec-validator (>=0.2.8)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pygithub", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel-all 
= ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", 
"authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", 
"mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openapi-spec-validator (>=0.2.8)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] +devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", 
"apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", 
"click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openapi-spec-validator (>=0.2.8)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", 
"pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] +devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "openapi-spec-validator (>=0.2.8)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.2)", "pyarrow (>=9.0.0)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-kerberos (>=0.10.0)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", 
"sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] dingding = ["apache-airflow-providers-dingding"] discord = ["apache-airflow-providers-discord"] -doc = ["astroid (>=2.12.3)", "checksumdir", "click (>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] +doc = ["astroid (>=2.12.3,<3.0)", "checksumdir", "click (>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] doc-gen = ["eralchemy2"] docker = ["apache-airflow-providers-docker"] druid = ["apache-airflow-providers-apache-druid"] @@ -301,9 +312,9 @@ ftp = ["apache-airflow-providers-ftp"] gcp = ["apache-airflow-providers-google"] gcp-api = ["apache-airflow-providers-google"] github = ["apache-airflow-providers-github"] -github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.3)"] +github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] google = ["apache-airflow-providers-google"] -google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.3)"] +google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] grpc = ["apache-airflow-providers-grpc"] hashicorp = ["apache-airflow-providers-hashicorp"] hdfs = ["apache-airflow-providers-apache-hdfs"] @@ -340,7 +351,6 @@ plexus = ["apache-airflow-providers-plexus"] postgres = ["apache-airflow-providers-postgres"] presto = ["apache-airflow-providers-presto"] qds = ["apache-airflow-providers-qubole"] -qubole = ["apache-airflow-providers-qubole"] rabbitmq = ["amqp"] redis = ["apache-airflow-providers-redis"] s3 = ["apache-airflow-providers-amazon"] @@ -1846,7 +1856,7 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-appbuilder" -version = "4.3.3" +version = "4.3.6" description = "Simple and rapid application development framework, built on top of Flask. includes detailed security, auto CRUD generation for your models, google charts and much more." 
 category = "dev"
 optional = false
@@ -1878,6 +1888,7 @@ WTForms = "<4"
 jmespath = ["jmespath (>=0.9.5)"]
 oauth = ["Authlib (>=0.14,<2.0.0)"]
 openid = ["Flask-OpenID (>=1.2.5,<2)"]
+talisman = ["flask-talisman (>=1.0.0,<2.0)"]
 
 [[package]]
 name = "flask-babel"
@@ -3436,7 +3447,7 @@ python-versions = ">=3.7"
 
 [[package]]
 name = "pandas"
-version = "1.5.3"
+version = "2.0.3"
 description = "Powerful data structures for data analysis, time series, and statistics"
 category = "dev"
 optional = false
@@ -3448,11 +3459,32 @@ numpy = [
 {version = ">=1.21.0", markers = "python_version >= \"3.10\""},
 {version = ">=1.23.2", markers = "python_version >= \"3.11\""},
 ]
-python-dateutil = ">=2.8.1"
+python-dateutil = ">=2.8.2"
 pytz = ">=2020.1"
-
-[package.extras]
-test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
+tzdata = ">=2022.1"
+
+[package.extras]
+all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"]
+aws = ["s3fs (>=2021.08.0)"]
+clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"]
+compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"]
+computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"]
+excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"]
+feather = ["pyarrow (>=7.0.0)"]
+fss = ["fsspec (>=2021.07.0)"]
+gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"]
+hdf5 = ["tables (>=3.6.1)"]
+html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"]
+mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"]
+output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"]
+parquet = ["pyarrow (>=7.0.0)"]
+performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"]
+plot = ["matplotlib (>=3.6.1)"]
+postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"]
+spss = ["pyreadstat (>=1.1.2)"]
+sql-other = ["SQLAlchemy (>=1.4.16)"]
+test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"]
+xml = ["lxml (>=4.6.3)"]
 
 [[package]]
 name = "parsedatetime"
@@ -3732,18 +3764,30 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 
 [[package]]
 name = "pydantic"
-version = "1.10.12"
-description = "Data validation and settings management using python type hints"
+version = "2.5.0"
+description = "Data validation using Python type hints"
 category = "main"
 optional = false
 python-versions = ">=3.7"
 
 [package.dependencies]
-typing-extensions = ">=4.2.0"
+annotated-types = ">=0.4.0"
+pydantic-core = "2.14.1"
+typing-extensions = ">=4.6.1"
 
 [package.extras]
-dotenv = ["python-dotenv (>=0.10.4)"]
-email = ["email-validator (>=1.0.3)"]
+email = ["email-validator (>=2.0.0)"]
+
+[[package]]
+name = "pydantic-core"
+version = "2.14.1"
+description = ""
+category = "main"
+optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydoc-markdown" @@ -5061,7 +5105,6 @@ motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] -pydantic = ["pydantic"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["s3fs", "botocore"] @@ -5070,7 +5113,7 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "1.1" python-versions = ">=3.8.1,<3.13" -content-hash = "cf91ba47f51a43be0f944b01722e0bdce307a56318ced130495f3d086ababb38" +content-hash = "a29e5968dd174fabfd38815e470b4ad8c76a7aacb047644ce7eff073531796f9" [metadata.files] about-time = [ @@ -5194,6 +5237,10 @@ alive-progress = [ {file = "alive-progress-3.1.4.tar.gz", hash = "sha256:74a95d8d0d42bc99d3a3725dbd06ebb852245f1b64e301a7c375b92b22663f7b"}, {file = "alive_progress-3.1.4-py3-none-any.whl", hash = "sha256:c80ad87ce9c1054b01135a87fae69ecebbfc2107497ae87cbe6aec7e534903db"}, ] +annotated-types = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] ansicon = [ {file = "ansicon-1.89.0-py2.py3-none-any.whl", hash = "sha256:f1def52d17f65c2c9682cf8370c03f541f410c1752d6a14029f97318e4b9dfec"}, {file = "ansicon-1.89.0.tar.gz", hash = "sha256:e4d039def5768a47e4afec8e89e83ec3ae5a26bf00ad851f914d1240b444d2b1"}, @@ -5203,8 +5250,8 @@ anyio = [ {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, ] apache-airflow = [ - {file = "apache-airflow-2.7.0.tar.gz", hash = "sha256:06fba3df5943b6eda5e2f033e7e45b6ea557d89909ca36e61614ea61075f9722"}, - {file = "apache_airflow-2.7.0-py3-none-any.whl", hash = "sha256:8e3cf4b3cd8583a2e76bd04827af8d34747e0cf30a28cf0e70f4f4f39ce61f6d"}, + {file = "apache-airflow-2.7.2.tar.gz", hash = "sha256:c6fab3449066867d9a7728f40b6b9e27f1ea68bca39b064a27f5c5ddc3262224"}, + {file = "apache_airflow-2.7.2-py3-none-any.whl", hash = "sha256:1bc2c022bcae24b911e49fafd5fb619b49efba87ed7bc8561a2065810d8fe899"}, ] apache-airflow-providers-common-sql = [ {file = "apache-airflow-providers-common-sql-1.7.1.tar.gz", hash = "sha256:ba37f795d9656a87cf4661edc381b8ecfe930272c59324b59f8a158fd0971aeb"}, @@ -5781,8 +5828,8 @@ flask = [ {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, ] flask-appbuilder = [ - {file = "Flask-AppBuilder-4.3.3.tar.gz", hash = "sha256:b420379f74788e431a2763f8d3749cc37712df682dc00a45538d85d989340768"}, - {file = "Flask_AppBuilder-4.3.3-py3-none-any.whl", hash = "sha256:7eb1904d8f61297778ebf0d0b83f1d74b154534c9e84af3bb9198cfc0f51ff05"}, + {file = "Flask-AppBuilder-4.3.6.tar.gz", hash = "sha256:8ca9710fa7d2704747d195e11b487d45a571f40559d8399d9d5dfa42ea1f3c78"}, + {file = "Flask_AppBuilder-4.3.6-py3-none-any.whl", hash = "sha256:840480dfd43134bebf78f3c7dc909e324c2689d2d9f27aeb1880a8a25466bc8d"}, ] flask-babel = [ {file = "Flask-Babel-2.0.0.tar.gz", hash = "sha256:f9faf45cdb2e1a32ea2ec14403587d4295108f35017a7821a2b1acb8cfd9257d"}, @@ -7061,33 +7108,31 @@ packaging = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] pandas = [ - {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = 
"sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406"}, - {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572"}, - {file = "pandas-1.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23"}, - {file = "pandas-1.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6"}, - {file = "pandas-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf"}, - {file = "pandas-1.5.3-cp38-cp38-win32.whl", hash = "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51"}, - {file = "pandas-1.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5"}, - {file = 
"pandas-1.5.3-cp39-cp39-win32.whl", hash = "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a"}, - {file = "pandas-1.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9"}, - {file = "pandas-1.5.3.tar.gz", hash = "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = 
"pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, ] parsedatetime = [ {file = "parsedatetime-2.4-py2-none-any.whl", hash = "sha256:9ee3529454bf35c40a77115f5a596771e59e1aee8c53306f346c461b8e913094"}, @@ -7357,42 +7402,111 @@ pycryptodomex = [ {file = "pycryptodomex-3.18.0.tar.gz", hash = "sha256:3e3ecb5fe979e7c1bb0027e518340acf7ee60415d79295e5251d13c68dde576e"}, ] pydantic = [ - {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"}, - {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"}, - {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"}, - {file = "pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"}, - {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"}, - {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"}, - {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"}, - {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = "sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"}, - {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"}, - {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"}, - {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"}, - {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"}, - {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"}, - {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"}, + {file = "pydantic-2.5.0-py3-none-any.whl", hash = "sha256:7ce6e766c456ad026fe5712f7bcf036efc34bd5d107b3e669ef7ea01b3a9050c"}, + {file = "pydantic-2.5.0.tar.gz", hash = "sha256:69bd6fb62d2d04b7055f59a396993486a2ee586c43a0b89231ce0000de07627c"}, +] +pydantic-core = [ + {file = "pydantic_core-2.14.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:812beca1dcb2b722cccc7e9c620bd972cbc323321194ec2725eab3222e6ac573"}, + {file = "pydantic_core-2.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2ccdc53cb88e51c7d47d74c59630d7be844428f6b8d463055ffad6f0392d8da"}, + {file = 
"pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd937733bf2fe7d6a8bf208c12741f1f730b7bf5636033877767a75093c29b8a"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:581bb606a31749a00796f5257947a0968182d7fe91e1dada41f06aeb6bfbc91a"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aadf74a40a7ae49c3c1aa7d32334fe94f4f968e21dd948e301bb4ed431fb2412"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b89821a2c77cc1b8f2c1fc3aacd6a3ecc5df8f7e518dc3f18aef8c4dcf66003d"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49ee28d65f506b2858a60745cc974ed005298ebab12693646b97641dd7c99c35"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97246f896b4df7fd84caa8a75a67abb95f94bc0b547665bf0889e3262b060399"}, + {file = "pydantic_core-2.14.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1185548665bc61bbab0dc78f10c8eafa0db0aa1e920fe9a451b77782b10a65cc"}, + {file = "pydantic_core-2.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2a7d08b39fac97540fba785fce3b21ee01a81f081a07a4d031efd791da6666f9"}, + {file = "pydantic_core-2.14.1-cp310-none-win32.whl", hash = "sha256:0a8c8daf4e3aa3aeb98e3638fc3d58a359738f3d12590b2474c6bb64031a0764"}, + {file = "pydantic_core-2.14.1-cp310-none-win_amd64.whl", hash = "sha256:4f0788699a92d604f348e9c1ac5e97e304e97127ba8325c7d0af88dcc7d35bd3"}, + {file = "pydantic_core-2.14.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:2be018a84995b6be1bbd40d6064395dbf71592a981169cf154c0885637f5f54a"}, + {file = "pydantic_core-2.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fc3227408808ba7df8e95eb1d8389f4ba2203bed8240b308de1d7ae66d828f24"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42d5d0e9bbb50481a049bd0203224b339d4db04006b78564df2b782e2fd16ebc"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bc6a4ea9f88a810cb65ccae14404da846e2a02dd5c0ad21dee712ff69d142638"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d312ad20e3c6d179cb97c42232b53111bcd8dcdd5c1136083db9d6bdd489bc73"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:679cc4e184f213c8227862e57340d12fd4d4d19dc0e3ddb0f653f86f01e90f94"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101df420e954966868b8bc992aefed5fa71dd1f2755104da62ee247abab28e2f"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c964c0cc443d6c08a2347c0e5c1fc2d85a272dc66c1a6f3cde4fc4843882ada4"}, + {file = "pydantic_core-2.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8276bbab68a9dbe721da92d19cbc061f76655248fe24fb63969d0c3e0e5755e7"}, + {file = "pydantic_core-2.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:12163197fec7c95751a3c71b36dcc1909eed9959f011ffc79cc8170a6a74c826"}, + {file = "pydantic_core-2.14.1-cp311-none-win32.whl", hash = "sha256:b8ff0302518dcd001bd722bbe342919c29e5066c7eda86828fe08cdc112668b8"}, + {file = "pydantic_core-2.14.1-cp311-none-win_amd64.whl", hash = 
"sha256:59fa83873223f856d898452c6162a390af4297756f6ba38493a67533387d85d9"}, + {file = "pydantic_core-2.14.1-cp311-none-win_arm64.whl", hash = "sha256:798590d38c9381f07c48d13af1f1ef337cebf76ee452fcec5deb04aceced51c7"}, + {file = "pydantic_core-2.14.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:587d75aec9ae50d0d63788cec38bf13c5128b3fc1411aa4b9398ebac884ab179"}, + {file = "pydantic_core-2.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26242e3593d4929123615bd9365dd86ef79b7b0592d64a96cd11fd83c69c9f34"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5879ac4791508d8f0eb7dec71ff8521855180688dac0c55f8c99fc4d1a939845"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad9ea86f5fc50f1b62c31184767fe0cacaa13b54fe57d38898c3776d30602411"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:102ac85a775e77821943ae38da9634ddd774b37a8d407181b4f7b05cdfb36b55"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2459cc06572730e079ec1e694e8f68c99d977b40d98748ae72ff11ef21a56b0b"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:217dcbfaf429a9b8f1d54eb380908b9c778e78f31378283b30ba463c21e89d5d"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9d59e0d7cdfe8ed1d4fcd28aad09625c715dc18976c7067e37d8a11b06f4be3e"}, + {file = "pydantic_core-2.14.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e2be646a5155d408e68b560c0553e8a83dc7b9f90ec6e5a2fc3ff216719385db"}, + {file = "pydantic_core-2.14.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ffba979801e3931a19cd30ed2049450820effe8f152aaa317e2fd93795d318d7"}, + {file = "pydantic_core-2.14.1-cp312-none-win32.whl", hash = "sha256:132b40e479cb5cebbbb681f77aaceabbc8355df16c9124cff1d4060ada83cde2"}, + {file = "pydantic_core-2.14.1-cp312-none-win_amd64.whl", hash = "sha256:744b807fe2733b6da3b53e8ad93e8b3ea3ee3dfc3abece4dd2824cc1f39aa343"}, + {file = "pydantic_core-2.14.1-cp312-none-win_arm64.whl", hash = "sha256:24ba48f9d0b8d64fc5e42e1600366c3d7db701201294989aebdaca23110c02ab"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:ba55d73a2df4771b211d0bcdea8b79454980a81ed34a1d77a19ddcc81f98c895"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e905014815687d88cbb14bbc0496420526cf20d49f20606537d87646b70f1046"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:443dc5eede7fa76b2370213e0abe881eb17c96f7d694501853c11d5d56916602"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:abae6fd5504e5e438e4f6f739f8364fd9ff5a5cdca897e68363e2318af90bc28"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9486e27bb3f137f33e2315be2baa0b0b983dae9e2f5f5395240178ad8e644728"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69df82892ff00491d673b1929538efb8c8d68f534fdc6cb7fd3ac8a5852b9034"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:184ff7b30c3f60e1b775378c060099285fd4b5249271046c9005f8b247b39377"}, + {file = 
"pydantic_core-2.14.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3d5b2a4b3c10cad0615670cab99059441ff42e92cf793a0336f4bc611e895204"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:871c641a83719caaa856a11dcc61c5e5b35b0db888e1a0d338fe67ce744575e2"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1e7208946ea9b27a8cef13822c339d4ae96e45952cc01fc4a91c7f1cb0ae2861"}, + {file = "pydantic_core-2.14.1-cp37-none-win32.whl", hash = "sha256:b4ff385a525017f5adf6066d7f9fb309f99ade725dcf17ed623dc7dce1f85d9f"}, + {file = "pydantic_core-2.14.1-cp37-none-win_amd64.whl", hash = "sha256:c7411cd06afeb263182e38c6ca5b4f5fe4f20d91466ad7db0cd6af453a02edec"}, + {file = "pydantic_core-2.14.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:2871daf5b2823bf77bf7d3d43825e5d904030c155affdf84b21a00a2e00821d2"}, + {file = "pydantic_core-2.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7977e261cac5f99873dc2c6f044315d09b19a71c4246560e1e67593889a90978"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5a111f9158555582deadd202a60bd7803b6c68f406391b7cf6905adf0af6811"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac417312bf6b7a0223ba73fb12e26b2854c93bf5b1911f7afef6d24c379b22aa"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c36987f5eb2a7856b5f5feacc3be206b4d1852a6ce799f6799dd9ffb0cba56ae"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6e98227eb02623d57e1fd061788837834b68bb995a869565211b9abf3de4bf4"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:023b6d7ec4e97890b28eb2ee24413e69a6d48de4e8b75123957edd5432f4eeb3"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6015beb28deb5306049ecf2519a59627e9e050892927850a884df6d5672f8c7d"}, + {file = "pydantic_core-2.14.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3f48d4afd973abbd65266ac24b24de1591116880efc7729caf6b6b94a9654c9e"}, + {file = "pydantic_core-2.14.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:28734bcfb8fc5b03293dec5eb5ea73b32ff767f6ef79a31f6e41dad2f5470270"}, + {file = "pydantic_core-2.14.1-cp38-none-win32.whl", hash = "sha256:3303113fdfaca927ef11e0c5f109e2ec196c404f9d7ba5f8ddb63cdf287ea159"}, + {file = "pydantic_core-2.14.1-cp38-none-win_amd64.whl", hash = "sha256:144f2c1d5579108b6ed1193fcc9926124bd4142b0f7020a7744980d1235c8a40"}, + {file = "pydantic_core-2.14.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:893bf4fb9bfb9c4639bc12f3de323325ada4c6d60e478d5cded65453e9364890"}, + {file = "pydantic_core-2.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:052d8731aaf844f91fe4cd3faf28983b109a5865b3a256ec550b80a5689ead87"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb1c6ecb53e4b907ee8486f453dd940b8cbb509946e2b671e3bf807d310a96fc"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:94cf6d0274eb899d39189144dcf52814c67f9b0fd196f211420d9aac793df2da"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36c3bf96f803e207a80dbcb633d82b98ff02a9faa76dd446e969424dec8e2b9f"}, + {file = 
"pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fb290491f1f0786a7da4585250f1feee200fc17ff64855bdd7c42fb54526fa29"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6590ed9d13eb51b28ea17ddcc6c8dbd6050b4eb589d497105f0e13339f223b72"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:69cd74e55a5326d920e7b46daa2d81c2bdb8bcf588eafb2330d981297b742ddc"}, + {file = "pydantic_core-2.14.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d965bdb50725a805b083f5f58d05669a85705f50a6a864e31b545c589290ee31"}, + {file = "pydantic_core-2.14.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ca942a2dc066ca5e04c27feaa8dfb9d353ddad14c6641660c565149186095343"}, + {file = "pydantic_core-2.14.1-cp39-none-win32.whl", hash = "sha256:72c2ef3787c3b577e5d6225d73a77167b942d12cef3c1fbd5e74e55b7f881c36"}, + {file = "pydantic_core-2.14.1-cp39-none-win_amd64.whl", hash = "sha256:55713d155da1e508083c4b08d0b1ad2c3054f68b8ef7eb3d3864822e456f0bb5"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:53efe03cc383a83660cfdda6a3cb40ee31372cedea0fde0b2a2e55e838873ab6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:f523e116879bc6714e61d447ce934676473b068069dce6563ea040381dc7a257"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85bb66d661be51b2cba9ca06759264b3469d2dbb53c3e6effb3f05fec6322be6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f53a3ccdc30234cb4342cec541e3e6ed87799c7ca552f0b5f44e3967a5fed526"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1bfb63821ada76719ffcd703fc40dd57962e0d8c253e3c565252e6de6d3e0bc6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e2c689439f262c29cf3fcd5364da1e64d8600facecf9eabea8643b8755d2f0de"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a15f6e5588f7afb7f6fc4b0f4ff064749e515d34f34c666ed6e37933873d8ad8"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:f1a30eef060e21af22c7d23349f1028de0611f522941c80efa51c05a63142c62"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16f4a7e1ec6b3ea98a1e108a2739710cd659d68b33fbbeaba066202cab69c7b6"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fd80a2d383940eec3db6a5b59d1820f947317acc5c75482ff8d79bf700f8ad6a"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:a68a36d71c7f638dda6c9e6b67f6aabf3fa1471b198d246457bfdc7c777cdeb7"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ebc79120e105e4bcd7865f369e3b9dbabb0d492d221e1a7f62a3e8e292550278"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:c8c466facec2ccdf025b0b1455b18f2c3d574d5f64d24df905d3d7b8f05d5f4e"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b91b5ec423e88caa16777094c4b2b97f11453283e7a837e5e5e1b886abba1251"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:130e49aa0cb316f743bc7792c36aefa39fc2221312f1d4b333b19edbdd71f2b1"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f483467c046f549572f8aca3b7128829e09ae3a9fe933ea421f7cb7c58120edb"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dee4682bd7947afc682d342a8d65ad1834583132383f8e801601a8698cb8d17a"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:8d927d042c0ef04607ee7822828b208ab045867d20477ec6593d612156798547"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5a1570875eb0d1479fb2270ed80c88c231aaaf68b0c3f114f35e7fb610435e4f"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cb2fd3ab67558eb16aecfb4f2db4febb4d37dc74e6b8613dc2e7160fb58158a9"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a7991f25b98038252363a03e6a9fe92e60fe390fda2631d238dc3b0e396632f8"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b45b7be9f99991405ecd6f6172fb6798908a8097106ae78d5cc5cc15121bad9"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:51506e7652a2ef1d1cf763c4b51b972ff4568d1dddc96ca83931a6941f5e6389"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:66dc0e63349ec39c1ea66622aa5c2c1f84382112afd3ab2fa0cca4fb01f7db39"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:8e17f0c3ba4cb07faa0038a59ce162de584ed48ba645c8d05a5de1e40d4c21e7"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d983222223f63e323a5f497f5b85e211557a5d8fb670dc88f343784502b466ba"}, + {file = "pydantic_core-2.14.1.tar.gz", hash = "sha256:0d82a6ee815388a362885186e431fac84c7a06623bc136f508e9f88261d8cadb"}, ] pydoc-markdown = [ {file = "pydoc_markdown-4.8.2-py3-none-any.whl", hash = "sha256:203f74119e6bb2f9deba43d452422de7c8ec31955b61e0620fa4dd8c2611715f"}, diff --git a/pyproject.toml b/pyproject.toml index 1c847a9a23..8fc34df0e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,7 +73,6 @@ cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -pydantic = {version = ">=1.10,<2.0", optional = true} adlfs = {version = ">=2022.4.0", optional = true} pyodbc = {version = "^4.0.39", optional = true} qdrant-client = {version = "^1.6.4", optional = true, extras = ["fastembed"]} @@ -97,7 +96,6 @@ motherduck = ["duckdb", "pyarrow"] cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] weaviate = ["weaviate-client"] -pydantic = ["pydantic"] mssql = ["pyodbc"] qdrant = ["qdrant-client"] @@ -107,13 +105,8 @@ dlt = "dlt.cli._dlt:_main" [tool.poetry.group.dev.dependencies] requests-mock = "^1.10.0" types-click = "^7.1.8" -pandas = "^1.5.3" sqlfluff = "^2.3.2" -google-auth-oauthlib = "^1.0.0" types-deprecated = "^1.2.9.2" -tqdm = "^4.65.0" -enlighten = "^1.11.2" -alive-progress = "^3.1.1" pytest-console-scripts = "^1.4.1" pytest = "^6.2.4" mypy = "^1.6.1" @@ -139,6 +132,17 @@ types-tqdm = "^4.66.0.2" types-psutil = "^5.9.5.16" types-psycopg2 = "^2.9.21.14" +[tool.poetry.group.pipeline] +optional=true + +[tool.poetry.group.pipeline.dependencies] 
+google-auth-oauthlib = "^1.0.0" +tqdm = "^4.65.0" +enlighten = "^1.11.2" +alive-progress = "^3.1.1" +pydantic = ">2" +pandas = ">2" + [tool.poetry.group.airflow] optional = true @@ -151,6 +155,9 @@ optional = true [tool.poetry.group.providers.dependencies] google-api-python-client = "^2.86.0" +[tool.poetry.group.sentry-sdk] +optional = true + [tool.poetry.group.sentry-sdk.dependencies] sentry-sdk = "^1.5.6" diff --git a/pytest.ini b/pytest.ini index fc7ce9119b..88c8353a69 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,4 +6,5 @@ xfail_strict= true log_cli= 1 log_cli_level= INFO python_files = test_*.py *_test.py *snippets.py *snippet.pytest -python_functions = *_test test_* *_snippet \ No newline at end of file +python_functions = *_test test_* *_snippet +filterwarnings= ignore::DeprecationWarning \ No newline at end of file diff --git a/tests/common/data_writers/test_buffered_writer.py b/tests/common/data_writers/test_buffered_writer.py index 85cfcb2d0c..c275f22b2b 100644 --- a/tests/common/data_writers/test_buffered_writer.py +++ b/tests/common/data_writers/test_buffered_writer.py @@ -1,28 +1,14 @@ -import os -from typing import Iterator, Set, Literal +from typing import Iterator import pytest -from dlt.common.data_writers.buffered import BufferedDataWriter, DataWriter from dlt.common.data_writers.exceptions import BufferedDataWriterClosed -from dlt.common.destination import TLoaderFileFormat, DestinationCapabilitiesContext from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage from dlt.common.typing import DictStrAny -from tests.utils import TEST_STORAGE_ROOT, write_version, autouse_test_storage -import datetime # noqa: 251 - - -ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = {"insert_values", "jsonl", "parquet", "arrow", "puae-jsonl"} - - -def get_writer(_format: TLoaderFileFormat = "insert_values", buffer_max_items: int = 10, disable_compression: bool = False) -> BufferedDataWriter[DataWriter]: - caps = DestinationCapabilitiesContext.generic_capabilities() - caps.preferred_loader_file_format = _format - file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") - return BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, disable_compression=disable_compression, _caps=caps) +from tests.common.data_writers.utils import ALL_WRITERS, get_writer def test_write_no_item() -> None: @@ -175,47 +161,3 @@ def test_writer_optional_schema(disable_compression: bool) -> None: with get_writer(_format="jsonl", disable_compression=disable_compression) as writer: writer.write_data_item([{"col1": 1}], None) writer.write_data_item([{"col1": 1}], None) - - -@pytest.mark.parametrize("writer_format", ALL_WRITERS - {"arrow"}) -def test_writer_items_count(writer_format: TLoaderFileFormat) -> None: - c1 = {"col1": new_column("col1", "bigint")} - with get_writer(_format=writer_format) as writer: - assert writer._buffered_items_count == 0 - # single item - writer.write_data_item({"col1": 1}, columns=c1) - assert writer._buffered_items_count == 1 - # list - writer.write_data_item([{"col1": 1}, {"col1": 2}], columns=c1) - assert writer._buffered_items_count == 3 - writer._flush_items() - assert writer._buffered_items_count == 0 - assert writer._writer.items_count == 3 - - -def test_writer_items_count_arrow() -> None: - import pyarrow as pa - c1 = {"col1": new_column("col1", "bigint")} - with get_writer(_format="arrow") as writer: - assert writer._buffered_items_count == 0 - # single item - 
writer.write_data_item(pa.Table.from_pylist([{"col1": 1}]), columns=c1) - assert writer._buffered_items_count == 1 - # single item with many rows - writer.write_data_item(pa.Table.from_pylist([{"col1": 1}, {"col1": 2}]), columns=c1) - assert writer._buffered_items_count == 3 - # empty list - writer.write_data_item([], columns=c1) - assert writer._buffered_items_count == 3 - # list with one item - writer.write_data_item([pa.Table.from_pylist([{"col1": 1}])], columns=c1) - assert writer._buffered_items_count == 4 - # list with many items - writer.write_data_item( - [pa.Table.from_pylist([{"col1": 1}]), pa.Table.from_pylist([{"col1": 1}, {"col1": 2}])], - columns=c1 - ) - assert writer._buffered_items_count == 7 - writer._flush_items() - assert writer._buffered_items_count == 0 - assert writer._writer.items_count == 7 diff --git a/tests/common/data_writers/utils.py b/tests/common/data_writers/utils.py new file mode 100644 index 0000000000..e1a071903f --- /dev/null +++ b/tests/common/data_writers/utils.py @@ -0,0 +1,17 @@ +import os +from typing import Set, Literal + + +from dlt.common.data_writers.buffered import BufferedDataWriter, DataWriter +from dlt.common.destination import TLoaderFileFormat, DestinationCapabilitiesContext + +from tests.utils import TEST_STORAGE_ROOT + +ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = {"insert_values", "jsonl", "parquet", "arrow", "puae-jsonl"} + + +def get_writer(_format: TLoaderFileFormat = "insert_values", buffer_max_items: int = 10, disable_compression: bool = False) -> BufferedDataWriter[DataWriter]: + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.preferred_loader_file_format = _format + file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") + return BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, disable_compression=disable_compression, _caps=caps) diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py index 6900c6fdcf..a4296279bf 100644 --- a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -1,7 +1,5 @@ from typing import List from fsspec import AbstractFileSystem -import pandas -from pyarrow import parquet from dlt.common import pendulum from dlt.common.storages import FilesystemConfiguration @@ -29,13 +27,16 @@ def assert_sample_files(all_file_items: List[FileItem], filesystem: AbstractFile assert content == f.read() # read via various readers if item["mime_type"] == "text/csv": - with file_dict.open() as f: - df = pandas.read_csv(f, header="infer") - assert len(df.to_dict(orient="records")) > 0 + # parse csv + with file_dict.open(mode="rt") as f: + from csv import DictReader + elements = list(DictReader(f)) + assert len(elements) > 0 if item["mime_type"] == "application/parquet": + # verify it is a real parquet with file_dict.open() as f: - table = parquet.ParquetFile(f).read() - assert len(table.to_pylist()) + parquet: bytes = f.read() + assert parquet.startswith(b"PAR1") if item["mime_type"].startswith("text"): with file_dict.open(mode="rt") as f_txt: lines = f_txt.readlines() diff --git a/tests/common/test_pydantic.py b/tests/common/test_pydantic.py deleted file mode 100644 index 770fcce6e5..0000000000 --- a/tests/common/test_pydantic.py +++ /dev/null @@ -1,134 +0,0 @@ -import pytest -from typing import Union, Optional, List, Dict, Any -from enum import Enum - -from datetime import datetime, date, time # noqa: I251 -from dlt.common import Decimal -from dlt.common import json - -from pydantic import BaseModel, Json, AnyHttpUrl -from 
dlt.common.libs.pydantic import pydantic_to_table_schema_columns - - -class StrEnum(str, Enum): - a = "a_value" - b = "b_value" - c = "c_value" - - -class IntEnum(int, Enum): - a = 0 - b = 1 - c = 2 - - -class MixedEnum(Enum): - a_int = 0 - b_str = "b_value" - c_int = 2 - - -class NestedModel(BaseModel): - nested_field: str - - -class Model(BaseModel): - bigint_field: int - text_field: str - timestamp_field: datetime - date_field: date - decimal_field: Decimal - double_field: float - time_field: time - - nested_field: NestedModel - list_field: List[str] - - union_field: Union[int, str] - - optional_field: Optional[float] - - blank_dict_field: dict # type: ignore[type-arg] - parametrized_dict_field: Dict[str, int] - - str_enum_field: StrEnum - int_enum_field: IntEnum - # Both of these shouold coerce to str - mixed_enum_int_field: MixedEnum - mixed_enum_str_field: MixedEnum - - json_field: Json[List[str]] - - url_field: AnyHttpUrl - - any_field: Any - json_any_field: Json[Any] - - - -@pytest.mark.parametrize('instance', [True, False]) -def test_pydantic_model_to_columns(instance: bool) -> None: - if instance: - model = Model( - bigint_field=1, text_field="text", timestamp_field=datetime.now(), - date_field=date.today(), decimal_field=Decimal(1.1), double_field=1.1, - time_field=time(1, 2, 3, 12345), - nested_field=NestedModel(nested_field="nested"), - list_field=["a", "b", "c"], - union_field=1, - optional_field=None, - blank_dict_field={}, - parametrized_dict_field={"a": 1, "b": 2, "c": 3}, - str_enum_field=StrEnum.a, - int_enum_field=IntEnum.a, - mixed_enum_int_field=MixedEnum.a_int, - mixed_enum_str_field=MixedEnum.b_str, - json_field=json.dumps(["a", "b", "c"]), # type: ignore[arg-type] - url_field="https://example.com", # type: ignore[arg-type] - any_field="any_string", - json_any_field=json.dumps("any_string"), - ) - else: - model = Model # type: ignore[assignment] - - result = pydantic_to_table_schema_columns(model) - - assert result["bigint_field"]["data_type"] == "bigint" - assert result["text_field"]["data_type"] == "text" - assert result["timestamp_field"]["data_type"] == "timestamp" - assert result["date_field"]["data_type"] == "date" - assert result["decimal_field"]["data_type"] == "decimal" - assert result["double_field"]["data_type"] == "double" - assert result["time_field"]["data_type"] == "time" - assert result["nested_field"]["data_type"] == "complex" - assert result['list_field']['data_type'] == 'complex' - assert result['union_field']['data_type'] == 'bigint' - assert result['optional_field']['data_type'] == 'double' - assert result['optional_field']['nullable'] is True - assert result['blank_dict_field']['data_type'] == 'complex' - assert result['parametrized_dict_field']['data_type'] == 'complex' - assert result['str_enum_field']['data_type'] == 'text' - assert result['int_enum_field']['data_type'] == 'bigint' - assert result['mixed_enum_int_field']['data_type'] == 'text' - assert result['mixed_enum_str_field']['data_type'] == 'text' - assert result['json_field']['data_type'] == 'complex' - assert result['url_field']['data_type'] == 'text' - - # Any type fields are excluded from schema - assert 'any_field' not in result - assert 'json_any_field' not in result - - -def test_pydantic_model_skip_complex_types() -> None: - result = pydantic_to_table_schema_columns(Model, skip_complex_types=True) - - assert result["bigint_field"]["data_type"] == "bigint" - - assert "nested_field" not in result - assert "list_field" not in result - assert "blank_dict_field" not in result - 
assert "parametrized_dict_field" not in result - assert "json_field" not in result - assert result["bigint_field"]["data_type"] == "bigint" - assert result["text_field"]["data_type"] == "text" - assert result["timestamp_field"]["data_type"] == "timestamp" diff --git a/tests/conftest.py b/tests/conftest.py index 56760508da..8a14fa1550 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,7 +59,7 @@ def _create_pipeline_instance_id(self) -> str: Pipeline._create_pipeline_instance_id = _create_pipeline_instance_id # type: ignore[method-assign] # push sentry to ci - os.environ["RUNTIME__SENTRY_DSN"] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + # os.environ["RUNTIME__SENTRY_DSN"] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" # disable sqlfluff logging for log in ["sqlfluff.parser", "sqlfluff.linter", "sqlfluff.templater", "sqlfluff.lexer"]: diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index ac3061ca60..af1d0a7107 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -22,11 +22,12 @@ from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing from dlt.pipeline.exceptions import PipelineStepFailed -from tests.extract.utils import AssertItems, data_to_item_format, TItemFormat, ALL_ITEM_FORMATS, data_item_to_list +from tests.extract.utils import AssertItems, data_item_to_list +from tests.utils import data_to_item_format, TDataItemFormat, ALL_DATA_ITEM_FORMATS -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_single_items_last_value_state_is_updated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_single_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: data = [ {'created_at': 425}, {'created_at': 426}, @@ -42,8 +43,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 426 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_single_items_last_value_state_is_updated_transformer(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_single_items_last_value_state_is_updated_transformer(item_type: TDataItemFormat) -> None: data = [ {'created_at': 425}, {'created_at': 426}, @@ -61,8 +62,8 @@ def some_data(item, created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 426 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_batch_items_last_value_state_is_updated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_batch_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: data1 = [{'created_at': i} for i in range(5)] data2 = [{'created_at': i} for i in range(5, 10)] @@ -81,8 +82,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 9 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_last_value_access_in_resource(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_last_value_access_in_resource(item_type: TDataItemFormat) -> None: values = [] data = [{'created_at': i} for i in range(6)] @@ -100,8 +101,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert values == [None, 5] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def 
test_unique_keys_are_deduplicated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_keys_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ {'created_at': 1, 'id': 'a'}, {'created_at': 2, 'id': 'b'}, @@ -137,8 +138,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert rows == [(1, 'a'), (2, 'b'), (3, 'c'), (3, 'd'), (3, 'e'), (3, 'f'), (4, 'g')] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_rows_by_hash_are_deduplicated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_rows_by_hash_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ {'created_at': 1, 'id': 'a'}, {'created_at': 2, 'id': 'b'}, @@ -188,7 +189,7 @@ def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): @pytest.mark.parametrize("item_type", ["arrow", "pandas"]) -def test_nested_cursor_path_arrow_fails(item_type: TItemFormat) -> None: +def test_nested_cursor_path_arrow_fails(item_type: TDataItemFormat) -> None: data = [ {'data': {'items': [{'created_at': 2}]}} ] @@ -207,8 +208,8 @@ def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): assert ex.exception.json_path == "data.items[0].created_at" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_explicit_initial_value(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_explicit_initial_value(item_type: TDataItemFormat) -> None: @dlt.resource def some_data(created_at=dlt.sources.incremental('created_at')): data = [{"created_at": created_at.last_value}] @@ -221,8 +222,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 4242 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_explicit_incremental_instance(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_explicit_incremental_instance(item_type: TDataItemFormat) -> None: data = [{'inserted_at': 242, 'some_uq': 444}] source_items = data_to_item_format(item_type, data) @@ -237,7 +238,7 @@ def some_data(incremental=dlt.sources.incremental('created_at', initial_value=0) @dlt.resource -def some_data_from_config(call_no: int, item_type: TItemFormat, created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value): +def some_data_from_config(call_no: int, item_type: TDataItemFormat, created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value): assert created_at.cursor_path == 'created_at' # start value will update to the last_value on next call if call_no == 1: @@ -251,8 +252,8 @@ def some_data_from_config(call_no: int, item_type: TItemFormat, created_at: Opti yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_incremental_from_config(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_incremental_from_config(item_type: TDataItemFormat) -> None: os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__CURSOR_PATH'] = 'created_at' os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2022-02-03T00:00:00Z' @@ -262,8 +263,8 @@ def test_optional_incremental_from_config(item_type: TItemFormat) -> None: p.extract(some_data_from_config(2, item_type)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def 
test_optional_incremental_not_passed(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_incremental_not_passed(item_type: TDataItemFormat) -> None: """Resource still runs when no incremental is passed""" data = [1,2,3] source_items = data_to_item_format(item_type, data) @@ -282,15 +283,15 @@ class OptionalIncrementalConfig(BaseConfiguration): @dlt.resource(spec=OptionalIncrementalConfig) -def optional_incremental_arg_resource(item_type: TItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None) -> Any: +def optional_incremental_arg_resource(item_type: TDataItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None) -> Any: data = [1,2,3] source_items = data_to_item_format(item_type, data) assert incremental is None yield source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_arg_from_spec_not_passed(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_arg_from_spec_not_passed(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(optional_incremental_arg_resource(item_type)) @@ -302,7 +303,7 @@ class SomeDataOverrideConfiguration(BaseConfiguration): # provide what to inject via spec. the spec contain the default @dlt.resource(spec=SomeDataOverrideConfiguration) -def some_data_override_config(item_type: TItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value): +def some_data_override_config(item_type: TDataItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value): assert created_at.cursor_path == 'created_at' assert created_at.initial_value == '2000-02-03T00:00:00Z' data = [{'created_at': '2023-03-03T00:00:00Z'}] @@ -310,8 +311,8 @@ def some_data_override_config(item_type: TItemFormat, created_at: dlt.sources.in yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_override_initial_value_from_config(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_override_initial_value_from_config(item_type: TDataItemFormat) -> None: # use the shortest possible config version # os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_OVERRIDE_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' os.environ['CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' @@ -320,8 +321,8 @@ def test_override_initial_value_from_config(item_type: TItemFormat) -> None: p.extract(some_data_override_config(item_type)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_override_primary_key_in_pipeline(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_override_primary_key_in_pipeline(item_type: TDataItemFormat) -> None: """Primary key hint passed to pipeline is propagated through apply_hints """ data = [ @@ -341,8 +342,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): p.extract(some_data, primary_key=['id', 'other_id']) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_composite_primary_key(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_composite_primary_key(item_type: TDataItemFormat) -> None: data = [ {'created_at': 1, 'isrc': 'AAA', 'market': 'DE'}, {'created_at': 2, 'isrc': 'BBB', 'market': 'DE'}, @@ -369,8 +370,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert set(rows) == expected 
-@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_last_value_func_min(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_last_value_func_min(item_type: TDataItemFormat) -> None: data = [ {'created_at': 10}, {'created_at': 11}, @@ -409,8 +410,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', last_value_func=l assert s['last_value'] == 11 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_cursor_datetime_type(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_cursor_datetime_type(item_type: TDataItemFormat) -> None: initial_value = pendulum.now() data = [ {'created_at': initial_value + timedelta(minutes=1)}, @@ -433,8 +434,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', initial_value)): assert s['last_value'] == initial_value + timedelta(minutes=4) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_descending_order_unique_hashes(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_descending_order_unique_hashes(item_type: TDataItemFormat) -> None: """Resource returns items in descending order but using `max` last value function. Only hash matching last_value are stored. """ @@ -458,8 +459,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', 20)): assert list(some_data()) == [] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_keys_json_identifiers(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_keys_json_identifiers(item_type: TDataItemFormat) -> None: """Uses primary key name that is matching the name of the JSON element in the original namespace but gets converted into destination namespace""" @dlt.resource(primary_key="DelTa") @@ -491,8 +492,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): assert rows2[-1][0] == 9 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_missing_primary_key(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_missing_primary_key(item_type: TDataItemFormat) -> None: @dlt.resource(primary_key="DELTA") def some_data(last_timestamp=dlt.sources.incremental("ts")): @@ -505,8 +506,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): assert py_ex.value.primary_key_column == "DELTA" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_missing_cursor_field(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_missing_cursor_field(item_type: TDataItemFormat) -> None: os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately @dlt.resource def some_data(last_timestamp=dlt.sources.incremental("item.timestamp")): @@ -565,12 +566,12 @@ def some_data(last_timestamp: dlt.sources.incremental[float] = dlt.sources.incre assert list(some_data(last_timestamp=None)) == [1] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_filter_processed_items(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_filter_processed_items(item_type: TDataItemFormat) -> None: """Checks if already processed items are filtered out""" @dlt.resource - def standalone_some_data(item_type: TItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): + def standalone_some_data(item_type: TDataItemFormat, now=None, 
last_timestamp=dlt.sources.incremental("timestamp")): data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] source_items = data_to_item_format(item_type, data) yield from source_items @@ -627,8 +628,8 @@ def some_data(step, last_timestamp=dlt.sources.incremental("ts")): p.run(r, destination="duckdb") -@pytest.mark.parametrize("item_type", set(ALL_ITEM_FORMATS) - {'json'}) -def test_start_value_set_to_last_value_arrow(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", set(ALL_DATA_ITEM_FORMATS) - {'json'}) +def test_start_value_set_to_last_value_arrow(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb') now = pendulum.now() @@ -654,13 +655,13 @@ def some_data(first: bool, last_timestamp=dlt.sources.incremental("ts")): p.run(some_data(False)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_replace_resets_state(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_replace_resets_state(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") now = pendulum.now() @dlt.resource - def standalone_some_data(item_type: TItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): + def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] source_items = data_to_item_format(item_type, data) yield from source_items @@ -727,8 +728,8 @@ def child(item): assert extracted[child._pipe.parent.name].write_disposition == "append" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_incremental_as_transform(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_incremental_as_transform(item_type: TDataItemFormat) -> None: now = pendulum.now().timestamp() @@ -750,8 +751,8 @@ def some_data(): assert len(info.loads_ids) == 1 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_incremental_explicit_disable_unique_check(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_incremental_explicit_disable_unique_check(item_type: TDataItemFormat) -> None: @dlt.resource(primary_key="delta") def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): data = [{"delta": i, "ts": pendulum.now().timestamp()} for i in range(-10, 10)] @@ -765,8 +766,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): assert s.state["incremental"]["ts"]["unique_hashes"] == [] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_apply_hints_incremental(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_apply_hints_incremental(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id()) data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] @@ -880,7 +881,7 @@ def some_data(updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sourc @dlt.resource def endless_sequence( - item_type: TItemFormat, + item_type: TDataItemFormat, updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=1) ) -> Any: max_values = 20 @@ -890,8 +891,8 @@ def endless_sequence( yield from source_items -@pytest.mark.parametrize("item_type", 
ALL_ITEM_FORMATS) -def test_chunked_ranges(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_chunked_ranges(item_type: TDataItemFormat) -> None: """Load chunked ranges with end value along with incremental""" pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') @@ -934,8 +935,8 @@ def test_chunked_ranges(item_type: TItemFormat) -> None: assert items == expected_range -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_end_value_with_batches(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_end_value_with_batches(item_type: TDataItemFormat) -> None: """Ensure incremental with end_value works correctly when resource yields lists instead of single items""" @dlt.resource def batched_sequence( @@ -970,8 +971,8 @@ def batched_sequence( assert items == list(range(1, 14)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_load_with_end_value_does_not_write_state(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_load_with_end_value_does_not_write_state(item_type: TDataItemFormat) -> None: """When loading chunk with initial/end value range. The resource state is untouched. """ pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') @@ -981,8 +982,8 @@ def test_load_with_end_value_does_not_write_state(item_type: TItemFormat) -> Non assert pipeline.state.get('sources') is None -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_end_value_initial_value_errors(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_end_value_initial_value_errors(item_type: TDataItemFormat) -> None: @dlt.resource def some_data( updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at') @@ -1017,8 +1018,8 @@ def custom_last_value(items): assert "The result of 'custom_last_value([end_value, initial_value])' must equal 'end_value'" in str(ex.value) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_out_of_range_flags(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_out_of_range_flags(item_type: TDataItemFormat) -> None: """Test incremental.start_out_of_range / end_out_of_range flags are set when items are filtered out""" @dlt.resource def descending( @@ -1086,8 +1087,8 @@ def ascending_single_item( pipeline.extract(ascending_single_item()) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_get_incremental_value_type(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_get_incremental_value_type(item_type: TDataItemFormat) -> None: assert dlt.sources.incremental("id").get_incremental_value_type() is Any assert dlt.sources.incremental("id", initial_value=0).get_incremental_value_type() is int assert dlt.sources.incremental("id", initial_value=None).get_incremental_value_type() is Any @@ -1147,8 +1148,8 @@ def test_type_5(updated_at = dlt.sources.incremental("updated_at", allow_externa assert r.incremental._incremental.get_incremental_value_type() is Any -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_join_env_scheduler(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_join_env_scheduler(item_type: TDataItemFormat) -> None: @dlt.resource def test_type_2(updated_at: 
dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): data = [{"updated_at": d} for d in [1, 2, 3]] @@ -1166,8 +1167,8 @@ def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.increment assert data_item_to_list(item_type, result) == [{'updated_at': 2}] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_join_env_scheduler_pipeline(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_join_env_scheduler_pipeline(item_type: TDataItemFormat) -> None: @dlt.resource def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): data = [{"updated_at": d} for d in [1, 2, 3]] @@ -1195,8 +1196,8 @@ def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.increment pipeline.extract(r) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_allow_external_schedulers(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_allow_external_schedulers(item_type: TDataItemFormat) -> None: @dlt.resource() def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at")): data = [{"updated_at": d} for d in [1, 2, 3]] diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 05366aaa94..ca95bded15 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -1148,7 +1148,7 @@ def empty_gen(): empty_r = empty() # check defaults assert empty_r.name == empty.name == empty_r.table_name == empty.table_name == "empty_gen" - assert empty_r._table_schema_template is None + # assert empty_r._table_schema_template is None assert empty_r.compute_table_schema() == empty_table_schema assert empty_r.write_disposition == "append" @@ -1161,7 +1161,7 @@ def empty_gen(): empty_r.write_disposition = "append" assert empty_r.compute_table_schema()["write_disposition"] == "append" - empty_r.apply_hints(table_name="table", parent_table_name="parent", primary_key=["a", "b"], merge_key=["c", "a"]) + empty_r.apply_hints(table_name="table", parent_table_name="parent", primary_key=["a", "b"], merge_key=["c", "a"], schema_contract="freeze") table = empty_r.compute_table_schema() assert table["columns"]["a"] == {'merge_key': True, 'name': 'a', 'nullable': False, 'primary_key': True} assert table["columns"]["b"] == {'name': 'b', 'nullable': False, 'primary_key': True} @@ -1169,9 +1169,10 @@ def empty_gen(): assert table["name"] == "table" assert table["parent"] == "parent" assert empty_r.table_name == "table" + assert table["schema_contract"] == "freeze" # reset - empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}, incremental=Incremental.EMPTY) + empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}, incremental=Incremental.EMPTY, schema_contract={}) assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append'} table = empty_r.compute_table_schema() assert table["name"] == "empty_gen" diff --git a/tests/extract/test_validation.py b/tests/extract/test_validation.py index 64e06bcecc..7826eb84ef 100644 --- a/tests/extract/test_validation.py +++ b/tests/extract/test_validation.py @@ -1,15 +1,16 @@ """Tests for resource validation with pydantic schema """ import typing as t - import pytest + import dlt -from dlt.extract.typing import ValidateItem 
+from dlt.common import json from dlt.common.typing import TDataItems -from dlt.extract.validation import PydanticValidator -from dlt.extract.exceptions import ValidationError, ResourceExtractionError +from dlt.common.libs.pydantic import BaseModel, FullValidationError, ValidationError -from pydantic import BaseModel +from dlt.extract.typing import ValidateItem +from dlt.extract.validation import PydanticValidator +from dlt.extract.exceptions import ResourceExtractionError class SimpleModel(BaseModel): @@ -30,7 +31,8 @@ def some_data() -> t.Iterator[TDataItems]: # Items are passed through model data = list(some_data()) - assert data == [SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")] + # compare content-wise. model names change due to extra settings on columns + assert json.dumpb(data) == json.dumpb([SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -50,7 +52,7 @@ def some_data() -> t.Iterator[TDataItems]: # Items are passed through model data = list(resource) - assert data == [SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")] + assert json.dumpb(data) == json.dumpb([SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -68,7 +70,7 @@ def some_data() -> t.Iterator[TDataItems]: resource.validator = None data = list(resource) - assert data == [{"a": 1, "b": "2"}, {"a": 2, "b": "3"}] + assert json.dumpb(data) == json.dumpb([{"a": 1, "b": "2"}, {"a": 2, "b": "3"}]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -94,14 +96,15 @@ class AnotherModel(BaseModel): data = list(resource) # Items are validated with the new model - assert data == [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) # Ensure only one validator is applied in steps steps = resource._pipe.steps assert len(steps) == 2 assert isinstance(steps[-1], ValidateItem) - assert steps[-1].model is AnotherModel # type: ignore[attr-defined] + # model name will change according to extra items handling + assert steps[-1].model.__name__.startswith(AnotherModel.__name__) # type: ignore[attr-defined] @pytest.mark.parametrize("yield_list", [True, False]) @@ -117,20 +120,20 @@ def some_data() -> t.Iterator[TDataItems]: resource = some_data() - assert isinstance(resource.validator, PydanticValidator) and resource.validator.model is SimpleModel + assert isinstance(resource.validator, PydanticValidator) and resource.validator.model.__name__.startswith(SimpleModel.__name__) class AnotherModel(BaseModel): a: int b: str c: float = 0.5 - resource.validator = PydanticValidator(AnotherModel) + resource.validator = PydanticValidator(AnotherModel, column_mode="freeze", data_mode="freeze") - assert resource.validator and resource.validator.model is AnotherModel + assert resource.validator and resource.validator.model.__name__.startswith(AnotherModel.__name__) data = list(resource) # Items are validated with the new model - assert data == [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -148,5 +151,5 @@ def some_data() -> t.Iterator[TDataItems]: with pytest.raises(ResourceExtractionError) as exinfo: list(some_data()) - assert isinstance(exinfo.value.__cause__, ValidationError) - assert str(PydanticValidator(SimpleModel)) in 
str(exinfo.value) + assert isinstance(exinfo.value.__cause__, FullValidationError) + # assert str(PydanticValidator(SimpleModel)) in str(exinfo.value) diff --git a/tests/extract/utils.py b/tests/extract/utils.py index b109cdbdd9..006816b5cd 100644 --- a/tests/extract/utils.py +++ b/tests/extract/utils.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, List, Literal, get_args +from typing import Any, Optional, List import pytest from itertools import zip_longest @@ -7,13 +7,7 @@ from dlt.extract.extract import ExtractorStorage from dlt.extract.typing import ItemTransform -import pandas as pd -from dlt.common.libs.pyarrow import pyarrow as pa - - -TItemFormat = Literal["json", "pandas", "arrow"] - -ALL_ITEM_FORMATS = get_args(TItemFormat) +from tests.utils import TDataItemFormat def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_name: str, content: str) -> None: @@ -35,7 +29,7 @@ def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_nam class AssertItems(ItemTransform[TDataItem]): - def __init__(self, expected_items: Any, item_type: TItemFormat = "json") -> None: + def __init__(self, expected_items: Any, item_type: TDataItemFormat = "json") -> None: self.expected_items = expected_items self.item_type = item_type @@ -44,22 +38,8 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: return item -def data_to_item_format(item_format: TItemFormat, data: List[TDataItem]): - """Return the given data in the form of pandas, arrow table or json items""" - if item_format == "json": - return data - # Make dataframe from the data - df = pd.DataFrame(data) - if item_format == "pandas": - return [df] - elif item_format == "arrow": - return [pa.Table.from_pandas(df)] - else: - raise ValueError(f"Unknown item format: {item_format}") - - -def data_item_to_list(from_type: TItemFormat, values: List[TDataItem]): - if from_type == "arrow": +def data_item_to_list(from_type: TDataItemFormat, values: List[TDataItem]): + if from_type in ["arrow", "arrow-batch"]: return values[0].to_pylist() elif from_type == "pandas": return values[0].to_dict("records") diff --git a/tests/libs/__init__.py b/tests/libs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/libs/test_buffered_writer_arrow,py b/tests/libs/test_buffered_writer_arrow,py new file mode 100644 index 0000000000..f0f0968942 --- /dev/null +++ b/tests/libs/test_buffered_writer_arrow,py @@ -0,0 +1,50 @@ +import pytest + +from dlt.common.destination import TLoaderFileFormat +from dlt.common.schema.utils import new_column + +from tests.common.data_writers.utils import get_writer, ALL_WRITERS + + +@pytest.mark.parametrize("writer_format", ALL_WRITERS - {"arrow"}) +def test_writer_items_count(writer_format: TLoaderFileFormat) -> None: + c1 = {"col1": new_column("col1", "bigint")} + with get_writer(_format=writer_format) as writer: + assert writer._buffered_items_count == 0 + # single item + writer.write_data_item({"col1": 1}, columns=c1) + assert writer._buffered_items_count == 1 + # list + writer.write_data_item([{"col1": 1}, {"col1": 2}], columns=c1) + assert writer._buffered_items_count == 3 + writer._flush_items() + assert writer._buffered_items_count == 0 + assert writer._writer.items_count == 3 + + +def test_writer_items_count_arrow() -> None: + import pyarrow as pa + c1 = {"col1": new_column("col1", "bigint")} + with get_writer(_format="arrow") as writer: + assert writer._buffered_items_count == 0 + # single item + 
writer.write_data_item(pa.Table.from_pylist([{"col1": 1}]), columns=c1) + assert writer._buffered_items_count == 1 + # single item with many rows + writer.write_data_item(pa.Table.from_pylist([{"col1": 1}, {"col1": 2}]), columns=c1) + assert writer._buffered_items_count == 3 + # empty list + writer.write_data_item([], columns=c1) + assert writer._buffered_items_count == 3 + # list with one item + writer.write_data_item([pa.Table.from_pylist([{"col1": 1}])], columns=c1) + assert writer._buffered_items_count == 4 + # list with many items + writer.write_data_item( + [pa.Table.from_pylist([{"col1": 1}]), pa.Table.from_pylist([{"col1": 1}, {"col1": 2}])], + columns=c1 + ) + assert writer._buffered_items_count == 7 + writer._flush_items() + assert writer._buffered_items_count == 0 + assert writer._writer.items_count == 7 diff --git a/tests/common/data_writers/test_parquet_writer.py b/tests/libs/test_parquet_writer.py similarity index 100% rename from tests/common/data_writers/test_parquet_writer.py rename to tests/libs/test_parquet_writer.py diff --git a/tests/common/test_pyarrow.py b/tests/libs/test_pyarrow.py similarity index 100% rename from tests/common/test_pyarrow.py rename to tests/libs/test_pyarrow.py diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index bd709e764d..9a72536329 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -138,4 +138,4 @@ def some_data(): result_tbl = pa.parquet.read_table(f) # Parquet schema is written with normalized column names - assert result_tbl.column_names == expected_column_names + assert result_tbl.schema.names == expected_column_names diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index a1ae4fd41a..338423d480 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,15 +1,13 @@ import itertools import logging import os -import random -from typing import Any, Optional, Iterator, Dict, Any, cast +from typing import Any, Any, cast from tenacity import retry_if_exception, Retrying, stop_after_attempt -from pydantic import BaseModel import pytest import dlt -from dlt.common import json, sleep, pendulum +from dlt.common import json, pendulum from dlt.common.configuration.container import Container from dlt.common.configuration.specs.aws_credentials import AwsCredentials from dlt.common.configuration.specs.exceptions import NativeValueError @@ -18,7 +16,7 @@ from dlt.common.destination.capabilities import TLoaderFileFormat from dlt.common.exceptions import DestinationHasFailedJobs, DestinationTerminalException, PipelineStateNotAvailable, UnknownDestinationModule from dlt.common.pipeline import PipelineContext -from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector +from dlt.common.runtime.collector import LogCollector from dlt.common.schema.utils import new_column, new_table from dlt.common.utils import uniq_id @@ -28,14 +26,12 @@ from dlt.load.exceptions import LoadClientJobFailed from dlt.pipeline.exceptions import InvalidPipelineName, PipelineNotActive, PipelineStepFailed from dlt.pipeline.helpers import retry_load -from dlt.pipeline import TCollectorArg from tests.common.utils import TEST_SENTRY_DSN -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -from tests.utils import TEST_STORAGE_ROOT from tests.common.configuration.utils import environment +from tests.utils import TEST_STORAGE_ROOT from 
tests.extract.utils import expect_extracted_file -from tests.pipeline.utils import assert_load_info, airtable_emojis +from tests.pipeline.utils import assert_load_info, airtable_emojis, many_delayed def test_default_pipeline() -> None: @@ -188,22 +184,6 @@ def test_deterministic_salt(environment) -> None: assert p.pipeline_salt != p3.pipeline_salt -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: - # create pipelines, extract and normalize. that should be possible without installing any dependencies - p = dlt.pipeline(pipeline_name=destination_config.destination + "_pipeline", destination=destination_config.destination, staging=destination_config.staging) - # are capabilities injected - caps = p._container[DestinationCapabilitiesContext] - print(caps.naming_convention) - # are right naming conventions created - assert p._default_naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) - p.extract([1, "2", 3], table_name="data") - # is default schema with right naming convention - assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) - p.normalize() - assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) - - def test_destination_explicit_credentials(environment: Any) -> None: # test redshift p = dlt.pipeline(pipeline_name="postgres_pipeline", destination="redshift", credentials="redshift://loader:loader@localhost:5432/dlt_data") @@ -489,7 +469,7 @@ def data_schema_3(): os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p.run([data_schema_1(), data_schema_2()], write_disposition="replace") - assert p.schema_names == p._schema_storage.list_schemas() + assert set(p.schema_names) == set(p._schema_storage.list_schemas()) def test_run_with_table_name_exceeding_path_length() -> None: @@ -782,52 +762,6 @@ def reverse_order(item): assert list(p.default_schema.tables["order_2"]["columns"].keys()) == ["col_3", "col_2", "col_1", '_dlt_load_id', '_dlt_id'] -def run_deferred(iters): - - @dlt.defer - def item(n): - sleep(random.random() / 2) - return n - - for n in range(iters): - yield item(n) - - -@dlt.source -def many_delayed(many, iters): - for n in range(many): - yield dlt.resource(run_deferred(iters), name="resource_" + str(n)) - - -@pytest.mark.parametrize("progress", ["tqdm", "enlighten", "log", "alive_progress"]) -def test_pipeline_progress(progress: TCollectorArg) -> None: - - os.environ["TIMEOUT"] = "3.0" - - p = dlt.pipeline(destination="dummy", progress=progress) - p.extract(many_delayed(5, 10)) - p.normalize() - - collector = p.collector - - # attach pipeline - p = dlt.attach(progress=collector) - p.extract(many_delayed(5, 10)) - p.run(dataset_name="dummy") - - assert collector == p.drop().collector - - # make sure a valid logger was used - if progress == "tqdm": - assert isinstance(collector, TqdmCollector) - if progress == "enlighten": - assert isinstance(collector, EnlightenCollector) - if progress == "alive_progress": - assert isinstance(collector, AliveCollector) - if progress == "log": - assert isinstance(collector, LogCollector) - - def test_pipeline_log_progress() -> None: os.environ["TIMEOUT"] = "3.0" @@ -1059,50 +993,6 @@ def res_return_yield(): assert "dlt.resource" in str(pip_ex.value) -@pytest.mark.parametrize('method', ('extract', 'run')) -def 
test_column_argument_pydantic(method: str) -> None: - """Test columns schema is created from pydantic model""" - p = dlt.pipeline(destination='duckdb') - - @dlt.resource - def some_data() -> Iterator[Dict[str, Any]]: - yield {} - - class Columns(BaseModel): - a: Optional[int] - b: Optional[str] - - if method == 'run': - p.run(some_data(), columns=Columns) - else: - p.extract(some_data(), columns=Columns) - - assert p.default_schema.tables['some_data']['columns']['a']['data_type'] == 'bigint' - assert p.default_schema.tables['some_data']['columns']['a']['nullable'] is True - assert p.default_schema.tables['some_data']['columns']['b']['data_type'] == 'text' - assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True - - -def test_extract_pydantic_models() -> None: - pipeline = dlt.pipeline(destination='duckdb') - - class User(BaseModel): - user_id: int - name: str - - @dlt.resource - def users() -> Iterator[User]: - yield User(user_id=1, name="a") - yield User(user_id=2, name="b") - - pipeline.extract(users()) - - storage = ExtractorStorage(pipeline._normalize_storage_config) - expect_extracted_file( - storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) - ) - - def test_resource_rename_same_table(): @dlt.resource(write_disposition="replace") def generic(start): diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py new file mode 100644 index 0000000000..7be06c364c --- /dev/null +++ b/tests/pipeline/test_pipeline_extra.py @@ -0,0 +1,105 @@ +import os +from typing import Any, Dict, Iterator, Optional +import pytest +from pydantic import BaseModel + +import dlt +from dlt.common import json +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector +from dlt.extract.storage import ExtractorStorage + +from dlt.pipeline import TCollectorArg + +from tests.extract.utils import expect_extracted_file +from tests.load.utils import DestinationTestConfiguration, destinations_configs +from tests.pipeline.utils import many_delayed + + +@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: + # create pipelines, extract and normalize. 
that should be possible without installing any dependencies + p = dlt.pipeline(pipeline_name=destination_config.destination + "_pipeline", destination=destination_config.destination, staging=destination_config.staging) + # are capabilities injected + caps = p._container[DestinationCapabilitiesContext] + print(caps.naming_convention) + # are right naming conventions created + assert p._default_naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + p.extract([1, "2", 3], table_name="data") + # is default schema with right naming convention + assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + p.normalize() + assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + + +@pytest.mark.parametrize("progress", ["tqdm", "enlighten", "log", "alive_progress"]) +def test_pipeline_progress(progress: TCollectorArg) -> None: + + os.environ["TIMEOUT"] = "3.0" + + p = dlt.pipeline(destination="dummy", progress=progress) + p.extract(many_delayed(5, 10)) + p.normalize() + + collector = p.collector + + # attach pipeline + p = dlt.attach(progress=collector) + p.extract(many_delayed(5, 10)) + p.run(dataset_name="dummy") + + assert collector == p.drop().collector + + # make sure a valid logger was used + if progress == "tqdm": + assert isinstance(collector, TqdmCollector) + if progress == "enlighten": + assert isinstance(collector, EnlightenCollector) + if progress == "alive_progress": + assert isinstance(collector, AliveCollector) + if progress == "log": + assert isinstance(collector, LogCollector) + + +@pytest.mark.parametrize('method', ('extract', 'run')) +def test_column_argument_pydantic(method: str) -> None: + """Test columns schema is created from pydantic model""" + p = dlt.pipeline(destination='duckdb') + + @dlt.resource + def some_data() -> Iterator[Dict[str, Any]]: + yield {} + + class Columns(BaseModel): + a: Optional[int] = None + b: Optional[str] = None + + if method == 'run': + p.run(some_data(), columns=Columns) + else: + p.extract(some_data(), columns=Columns) + + assert p.default_schema.tables['some_data']['columns']['a']['data_type'] == 'bigint' + assert p.default_schema.tables['some_data']['columns']['a']['nullable'] is True + assert p.default_schema.tables['some_data']['columns']['b']['data_type'] == 'text' + assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True + + +def test_extract_pydantic_models() -> None: + pipeline = dlt.pipeline(destination='duckdb') + + class User(BaseModel): + user_id: int + name: str + + @dlt.resource + def users() -> Iterator[User]: + yield User(user_id=1, name="a") + yield User(user_id=2, name="b") + + pipeline.extract(users()) + + storage = ExtractorStorage(pipeline._normalize_storage_config) + expect_extracted_file( + storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) + ) diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py index 55d77ee050..21b2adc699 100644 --- a/tests/pipeline/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -1,6 +1,6 @@ import dlt, os, pytest import contextlib -from typing import Any, Union, Optional +from typing import Any, Callable, Iterator, Union, Optional from dlt.common.schema.typing import TSchemaContract from dlt.common.utils import uniq_id @@ -11,7 +11,7 @@ from dlt.common.schema import utils from tests.load.pipeline.utils 
import load_table_counts -from tests.utils import skip_if_not_active +from tests.utils import TDataItemFormat, skip_if_not_active, data_to_item_format, ALL_DATA_ITEM_FORMATS skip_if_not_active("duckdb") @@ -32,12 +32,13 @@ def raises_frozen_exception(check_raise: bool = True) -> Any: def items(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract=settings) + # NOTE: names must be normalized + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { "id": index, - "some_int": 1, + "SomeInt": 1, "name": f"item {index}" } @@ -46,13 +47,13 @@ def load_items(): def items_with_variant(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract=settings) + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { "id": index, "name": f"item {index}", - "some_int": "hello" + "SomeInt": "hello" } return load_items @@ -60,13 +61,13 @@ def load_items(): def items_with_new_column(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract=settings) + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { "id": index, "name": f"item {index}", - "new_col": "hello" + "New^Col": "hello" } return load_items @@ -74,7 +75,7 @@ def load_items(): def items_with_subtable(settings: TSchemaContract) -> Any: - @dlt.resource(name="items", write_disposition="append", schema_contract=settings) + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { @@ -108,7 +109,7 @@ def load_items(): NEW_ITEMS_TABLE = "new_items" -def run_resource(pipeline: Pipeline, resource_fun, settings) -> None: +def run_resource(pipeline: Pipeline, resource_fun: Callable[..., DltResource], settings: Any, item_format: TDataItemFormat = "json", duplicates: int = 1) -> None: for item in settings.keys(): assert item in LOCATIONS @@ -120,8 +121,13 @@ def run_resource(pipeline: Pipeline, resource_fun, settings) -> None: assert key in SCHEMA_ELEMENTS @dlt.source(name="freeze_tests", schema_contract=settings.get("source")) - def source() -> DltResource: - return resource_fun(settings.get("resource")) + def source() -> Iterator[DltResource]: + for idx in range(duplicates): + resource: DltResource = resource_fun(settings.get("resource")) + if item_format != "json": + resource._pipe.replace_gen(data_to_item_format(item_format, resource._pipe.gen())) # type: ignore + resource.table_name = resource.name + yield resource.with_name(resource.name + str(idx)) # run pipeline pipeline.run(source(), schema_contract=settings.get("override")) @@ -142,7 +148,8 @@ def get_pipeline(): @pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) -def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None: +@pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS) +def test_new_tables(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None: pipeline = get_pipeline() @@ -150,41 +157,45 @@ def test_freeze_new_tables(contract_setting: str, setting_location: str) -> None setting_location: { "tables": contract_setting }} - 
run_resource(pipeline, items, {}) + run_resource(pipeline, items, {}, item_format) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - run_resource(pipeline, items_with_new_column, full_settings) + run_resource(pipeline, items_with_new_column, full_settings, item_format) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 20 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - run_resource(pipeline, items_with_variant, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 30 - assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - - # test adding new subtable + # test adding new table with raises_frozen_exception(contract_setting == "freeze"): - run_resource(pipeline, items_with_subtable, full_settings) + run_resource(pipeline, new_items, full_settings, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) # delete extracted files if left after exception pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources()) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 30 if contract_setting in ["freeze"] else 40 - assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if contract_setting in ["evolve"] else 0) + # NOTE: arrow / pandas do not support variants and subtables so we must skip + if item_format == "json": + # run add variant column + run_resource(pipeline, items_with_variant, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # test adding new table - with raises_frozen_exception(contract_setting == "freeze"): - run_resource(pipeline, new_items, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) + # test adding new subtable + with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, items_with_subtable, full_settings) + + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 if contract_setting in ["freeze"] else 40 + assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if contract_setting in ["evolve"] else 0) @pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) -def test_freeze_new_columns(contract_setting: str, setting_location: str) -> None: +@pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS) +def test_new_columns(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None: full_settings = { setting_location: { @@ -192,26 +203,21 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non }} pipeline = get_pipeline() - 
run_resource(pipeline, items, {}) + run_resource(pipeline, items, {}, item_format) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # subtable should work - run_resource(pipeline, items_with_subtable, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 20 - assert table_counts[SUBITEMS_TABLE] == 10 - # new should work - run_resource(pipeline, new_items, full_settings) + run_resource(pipeline, new_items, full_settings, item_format) table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == 20 + expected_items_count = 10 + assert table_counts["items"] == expected_items_count assert table_counts[NEW_ITEMS_TABLE] == 10 - # test adding new column + # test adding new column twice: filter will try to catch it before it is added for the second time with raises_frozen_exception(contract_setting == "freeze"): - run_resource(pipeline, items_with_new_column, full_settings) + run_resource(pipeline, items_with_new_column, full_settings, item_format, duplicates=2) # delete extracted files if left after exception pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources()) @@ -220,19 +226,25 @@ def test_freeze_new_columns(contract_setting: str, setting_location: str) -> Non else: assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (30 if contract_setting in ["evolve", "discard_value"] else 20) + expected_items_count += (20 if contract_setting in ["evolve", "discard_value"] else 0) + assert table_counts["items"] == expected_items_count - # test adding variant column - # with raises_frozen_exception(contract_setting == "freeze"): - run_resource(pipeline, items_with_variant, full_settings) - # variants are not new columns and should be able to always evolve - assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # if contract_setting == "evolve": - # assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - # else: - # assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 30) + # NOTE: arrow / pandas do not support variants and subtables so we must skip + if item_format == "json": + # subtable should work + run_resource(pipeline, items_with_subtable, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + expected_items_count += 10 + assert table_counts["items"] == expected_items_count + assert table_counts[SUBITEMS_TABLE] == 10 + + # test adding variant column + run_resource(pipeline, items_with_variant, full_settings) + # variants are not new columns and should be able to always evolve + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + expected_items_count += 10 + assert 
table_counts["items"] == expected_items_count @pytest.mark.parametrize("contract_setting", schema_contract) diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 3e61c9510c..53513103a7 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -1,9 +1,10 @@ import pytest +import random from os import environ import dlt -from dlt.common import json -from dlt.common.pipeline import LoadInfo, PipelineContext +from dlt.common import json, sleep +from dlt.common.pipeline import LoadInfo from dlt.common.typing import DictStrAny from tests.utils import TEST_STORAGE_ROOT @@ -59,3 +60,20 @@ def wide_peacock(): return budget, schedule, peacock, wide_peacock + + +def run_deferred(iters): + + @dlt.defer + def item(n): + sleep(random.random() / 2) + return n + + for n in range(iters): + yield item(n) + + +@dlt.source +def many_delayed(many, iters): + for n in range(many): + yield dlt.resource(run_deferred(iters), name="resource_" + str(n)) diff --git a/tests/utils.py b/tests/utils.py index 823b1cca83..8ec15a20ad 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,7 +5,7 @@ import requests import pytest from os import environ -from typing import Iterator, List +from typing import Any, Iterable, Iterator, List, Literal, Union, get_args from unittest.mock import patch from requests import Response @@ -21,7 +21,7 @@ from dlt.common.storages import FileStorage from dlt.common.schema import Schema from dlt.common.storages.versioned_storage import VersionedStorage -from dlt.common.typing import StrAny +from dlt.common.typing import StrAny, TDataItem from dlt.common.utils import custom_environ, uniq_id from dlt.common.pipeline import PipelineContext @@ -55,6 +55,13 @@ for destination in ACTIVE_DESTINATIONS: assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown active destination {destination}" + +# possible TDataItem types +TDataItemFormat = Literal["json", "pandas", "arrow", "arrow-batch"] +ALL_DATA_ITEM_FORMATS = get_args(TDataItemFormat) +"""List with TDataItem formats: json, arrow table/batch / pandas""" + + def TEST_DICT_CONFIG_PROVIDER(): # add test dictionary provider providers_context = Container()[ConfigProvidersContext] @@ -136,6 +143,7 @@ def unload_modules() -> Iterator[None]: @pytest.fixture(autouse=True) def wipe_pipeline() -> Iterator[None]: + """Wipes pipeline local state and deactivates it""" container = Container() if container[PipelineContext].is_active(): container[PipelineContext].deactivate() @@ -148,6 +156,26 @@ def wipe_pipeline() -> Iterator[None]: container[PipelineContext].deactivate() +def data_to_item_format(item_format: TDataItemFormat, data: Union[Iterator[TDataItem], Iterable[TDataItem]]) -> Any: + """Return the given data in the form of pandas, arrow table/batch or json items""" + if item_format == "json": + return data + + import pandas as pd + from dlt.common.libs.pyarrow import pyarrow as pa + + # Make dataframe from the data + df = pd.DataFrame(list(data)) + if item_format == "pandas": + return [df] + elif item_format == "arrow": + return [pa.Table.from_pandas(df)] + elif item_format == "arrow-batch": + return [pa.RecordBatch.from_pandas(df)] + else: + raise ValueError(f"Unknown item format: {item_format}") + + def init_test_logging(c: RunConfiguration = None) -> None: if not c: c = resolve_configuration(RunConfiguration()) @@ -182,6 +210,7 @@ def create_schema_with_name(schema_name) -> Schema: def assert_no_dict_key_starts_with(d: StrAny, key_prefix: str) -> None: assert all(not key.startswith(key_prefix) for key in d.keys()) + def 
skip_if_not_active(destination: str) -> None: assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown skipped destination {destination}" if destination not in ACTIVE_DESTINATIONS: From c35ec2d0cbcd7055a509cd465ed734adf99377ed Mon Sep 17 00:00:00 2001 From: Marcin Rudolf  Date: Fri, 17 Nov 2023 00:10:04 +0100 Subject: [PATCH 67/73] fixes deps in ci workflows --- .github/workflows/lint.yml | 2 +- .github/workflows/test_common.yml | 13 ------------ .github/workflows/test_destination_athena.yml | 2 +- .../test_destination_athena_iceberg.yml | 2 +- .../workflows/test_destination_bigquery.yml | 2 +- .github/workflows/test_destination_mssql.yml | 2 +- .github/workflows/test_destination_qdrant.yml | 2 +- .../workflows/test_destination_snowflake.yml | 2 +- .../workflows/test_destination_synapse.yml | 8 ++++---- .../workflows/test_destination_weaviate.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_doc_snippets.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- tests/pipeline/test_pipeline.py | 18 +++++++++++++++++ tests/pipeline/test_pipeline_state.py | 20 +------------------ 15 files changed, 34 insertions(+), 47 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c3e546ecbd..155b429b92 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -45,7 +45,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --all-extras --with airflow --with pipeline --with docs --with providers + run: poetry install --no-interaction --all-extras --with airflow --with docs --with providers --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 417a184ae7..ec97aac304 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -116,19 +116,6 @@ jobs: name: Run extract tests Windows shell: cmd - # - name: Install pipeline dependencies - # run: poetry install --no-interaction -E duckdb -E parquet --with pipeline - - # - run: | - # poetry run pytest tests/pipeline tests/libs - # if: runner.os != 'Windows' - # name: Run extra tests Linux/MAC - # - run: | - # poetry run pytest tests/pipeline tests/libs - # if: runner.os == 'Windows' - # name: Run extra tests Windows - # shell: cmd - matrix_job_required_check: name: Common tests needs: run_common diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 704e66522b..b849188ddd 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -70,7 +70,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 6892a96bf1..97544f24d1 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -70,7 +70,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena + run: poetry install --no-interaction -E athena --with sentry-sdk 
--with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index dcc7e7ba9b..e12d7bd0f0 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -79,7 +79,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index bba44e750d..6eb4427bbf 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py diff --git a/.github/workflows/test_destination_qdrant.yml b/.github/workflows/test_destination_qdrant.yml index 09ded40f59..0ce3e3a3f9 100644 --- a/.github/workflows/test_destination_qdrant.yml +++ b/.github/workflows/test_destination_qdrant.yml @@ -59,7 +59,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E qdrant -E parquet + run: poetry install --no-interaction -E qdrant -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load/ if: runner.os != 'Windows' diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index 4aae3ec62e..fe81c6121f 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -71,7 +71,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index e86e29ebf6..d0f364c382 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -5,9 +5,9 @@ on: branches: - master - devel - + workflow_dispatch: - + env: DESTINATION__SYNAPSE__CREDENTIALS: ${{ secrets.SYNAPSE_CREDENTIALS }} DESTINATION__SYNAPSE__CREDENTIALS__PASSWORD: ${{ secrets.SYNAPSE_PASSWORD }} @@ -42,7 +42,7 @@ jobs: runs-on: ${{ matrix.os }} steps: - + - name: Check out uses: actions/checkout@master @@ -70,7 +70,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E s3 -E gs -E az + run: poetry install --no-interaction -E synapse -E s3 -E gs -E az --with sentry-sdk 
--with pipeline - run: | poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py diff --git a/.github/workflows/test_destination_weaviate.yml b/.github/workflows/test_destination_weaviate.yml index 6a7a2e95cd..c771a28204 100644 --- a/.github/workflows/test_destination_weaviate.yml +++ b/.github/workflows/test_destination_weaviate.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E weaviate -E parquet + run: poetry install --no-interaction -E weaviate -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load/ if: runner.os != 'Windows' diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 64b8031e5e..f37feb872f 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -87,7 +87,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with pipeline + run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index ad7d544219..004bafba05 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -59,7 +59,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet --with docs --without airflow + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet --with docs --without airflow --with sentry-sdk --with pipeline - name: Run linter and tests run: make test-and-lint-snippets diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 65d0af1139..42c3c2d13a 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -84,7 +84,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with pipeline + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline - run: poetry run pytest tests/load && poetry run pytest tests/cli name: Run tests Linux diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 338423d480..c9e6c8b423 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -20,6 +20,7 @@ from dlt.common.schema.utils import new_column, new_table from dlt.common.utils import uniq_id +from dlt.destinations.sql_client import SqlClientBase from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractorStorage from dlt.extract.source import DltResource, DltSource @@ -1117,3 +1118,20 @@ def test_empty_rows_are_included() -> None: values = [r[0] for r in rows] assert values == [1, None, None, None, None, None, None, None] + + +def 
test_resource_state_name_not_normalized() -> None: + pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb") + peacock_s = airtable_emojis().with_resources("🦚Peacock") + pipeline.extract(peacock_s) + assert peacock_s.resources["🦚Peacock"].state == {"🦚🦚🦚": "🦚"} + pipeline.normalize() + pipeline.load() + + # get state from destination + from dlt.pipeline.state_sync import load_state_from_destination + client: SqlClientBase + with pipeline.destination_client() as client: # type: ignore[assignment] + state = load_state_from_destination(pipeline.pipeline_name, client) + assert "airtable_emojis" in state["sources"] + assert state["sources"]["airtable_emojis"]["resources"] == {"🦚Peacock": {"🦚🦚🦚": "🦚"}} diff --git a/tests/pipeline/test_pipeline_state.py b/tests/pipeline/test_pipeline_state.py index 14b881eedc..71e8d90406 100644 --- a/tests/pipeline/test_pipeline_state.py +++ b/tests/pipeline/test_pipeline_state.py @@ -10,14 +10,13 @@ from dlt.common.storages import FileStorage from dlt.common import pipeline as state_module from dlt.common.utils import uniq_id -from dlt.destinations.job_client_impl import SqlJobClientBase from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException, PipelineStepFailed from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.state_sync import migrate_state, STATE_ENGINE_VERSION from tests.utils import test_storage -from tests.pipeline.utils import json_case_path, load_json_case, airtable_emojis +from tests.pipeline.utils import json_case_path, load_json_case @dlt.resource() @@ -427,20 +426,3 @@ def test_migrate_state(test_storage: FileStorage) -> None: p = dlt.attach(pipeline_name="debug_pipeline", pipelines_dir=test_storage.storage_path) assert p.dataset_name == "debug_pipeline_data" assert p.default_schema_name == "example_source" - - -def test_resource_state_name_not_normalized() -> None: - pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb") - peacock_s = airtable_emojis().with_resources("🦚Peacock") - pipeline.extract(peacock_s) - assert peacock_s.resources["🦚Peacock"].state == {"🦚🦚🦚": "🦚"} - pipeline.normalize() - pipeline.load() - - # get state from destination - from dlt.pipeline.state_sync import load_state_from_destination - client: SqlJobClientBase - with pipeline.destination_client() as client: # type: ignore[assignment] - state = load_state_from_destination(pipeline.pipeline_name, client) - assert "airtable_emojis" in state["sources"] - assert state["sources"]["airtable_emojis"]["resources"] == {"🦚Peacock": {"🦚🦚🦚": "🦚"}} From 340ed3d334dc805b3396166da8e3bc2e1daa30ee Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Fri, 17 Nov 2023 00:36:58 +0100 Subject: [PATCH 68/73] re-adds snowflake connector --- .github/workflows/test_airflow.yml | 2 +- .github/workflows/test_dbt_runner.yml | 2 +- poetry.lock | 118 +++++++------------------- pyproject.toml | 4 +- tests/pipeline/test_arrow_sources.py | 11 +-- tests/pipeline/test_pipeline.py | 16 +--- tests/pipeline/test_pipeline_extra.py | 12 +++ 7 files changed, 55 insertions(+), 110 deletions(-) diff --git a/.github/workflows/test_airflow.yml b/.github/workflows/test_airflow.yml index f1806321b3..bbed326344 100644 --- a/.github/workflows/test_airflow.yml +++ b/.github/workflows/test_airflow.yml @@ -41,7 +41,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-airflow-runner - name: Install dependencies - run: poetry install --no-interaction --with airflow --with pipeline -E duckdb -E parquet + 
run: poetry install --no-interaction --with airflow --with pipeline -E duckdb -E parquet --with sentry-sdk - run: | poetry run pytest tests/helpers/airflow_tests diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index db3b53e9fa..1803a53fc1 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -68,7 +68,7 @@ jobs: - name: Install dependencies # install dlt with postgres support - run: poetry install --no-interaction -E postgres -E dbt + run: poetry install --no-interaction -E postgres -E dbt --with sentry-sdk - run: | poetry run pytest tests/helpers/dbt_tests -k '(not venv)' diff --git a/poetry.lock b/poetry.lock index fca923cfee..018c1357fe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3426,17 +3426,6 @@ category = "main" optional = false python-versions = ">=3.7" -[[package]] -name = "oscrypto" -version = "1.3.0" -description = "TLS (SSL) sockets, key generation, encryption, decryption, signing, verification and KDFs using the OS crypto libraries. Does not require a compiler, and relies on the OS for patching. Works on Windows, OS X and Linux/BSD." -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -asn1crypto = ">=1.5.1" - [[package]] name = "packaging" version = "23.1" @@ -3449,7 +3438,7 @@ python-versions = ">=3.7" name = "pandas" version = "2.0.3" description = "Powerful data structures for data analysis, time series, and statistics" -category = "dev" +category = "main" optional = false python-versions = ">=3.8" @@ -3754,14 +3743,6 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -[[package]] -name = "pycryptodomex" -version = "3.18.0" -description = "Cryptographic library for Python" -category = "main" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - [[package]] name = "pydantic" version = "2.5.0" @@ -4479,7 +4460,7 @@ python-versions = ">=3.7" [[package]] name = "snowflake-connector-python" -version = "3.1.1" +version = "3.5.0" description = "Snowflake Connector for Python" category = "main" optional = true @@ -4494,10 +4475,10 @@ cryptography = ">=3.1.0,<42.0.0" filelock = ">=3.5,<4" idna = ">=2.5,<4" keyring = {version = "<16.1.0 || >16.1.0,<25.0.0", optional = true, markers = "extra == \"secure-local-storage\""} -oscrypto = "<2.0.0" packaging = "*" -platformdirs = ">=2.6.0,<3.9.0" -pycryptodomex = ">=3.2,<3.5.0 || >3.5.0,<4.0.0" +pandas = {version = ">=1.0.0,<2.1.0", optional = true, markers = "extra == \"pandas\""} +platformdirs = ">=2.6.0,<4.0.0" +pyarrow = {version = "*", optional = true, markers = "extra == \"pandas\""} pyjwt = "<3.0.0" pyOpenSSL = ">=16.2.0,<24.0.0" pytz = "*" @@ -4505,11 +4486,11 @@ requests = "<3.0.0" sortedcontainers = ">=2.4.0" tomlkit = "*" typing-extensions = ">=4.3,<5" -urllib3 = ">=1.21.1,<1.27" +urllib3 = ">=1.21.1,<2.0.0" [package.extras] -development = ["Cython", "coverage", "more-itertools", "numpy (<1.26.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] -pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow (>=10.0.1,<10.1.0)"] +development = ["Cython", "coverage", "more-itertools", "numpy (<1.27.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] +pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow"] secure-local-storage = ["keyring (!=16.1.0,<25.0.0)"] [[package]] @@ 
-5108,12 +5089,13 @@ postgres = ["psycopg2-binary", "psycopg2cffi"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["s3fs", "botocore"] +snowflake = ["snowflake-connector-python"] weaviate = ["weaviate-client"] [metadata] lock-version = "1.1" python-versions = ">=3.8.1,<3.13" -content-hash = "a29e5968dd174fabfd38815e470b4ad8c76a7aacb047644ce7eff073531796f9" +content-hash = "bbfaab078877deaa60ecf6bc95c0374e1967268ca24594a99b792b88c4ef270b" [metadata.files] about-time = [ @@ -7099,10 +7081,6 @@ orjson = [ {file = "orjson-3.9.5-cp39-none-win_amd64.whl", hash = "sha256:91dda66755795ac6100e303e206b636568d42ac83c156547634256a2e68de694"}, {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, ] -oscrypto = [ - {file = "oscrypto-1.3.0-py2.py3-none-any.whl", hash = "sha256:2b2f1d2d42ec152ca90ccb5682f3e051fb55986e1b170ebde472b133713e7085"}, - {file = "oscrypto-1.3.0.tar.gz", hash = "sha256:6f5fef59cb5b3708321db7cca56aed8ad7e662853351e7991fcf60ec606d47a4"}, -] packaging = [ {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, @@ -7367,40 +7345,6 @@ pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] -pycryptodomex = [ - {file = "pycryptodomex-3.18.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:160a39a708c36fa0b168ab79386dede588e62aec06eb505add870739329aecc6"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c2953afebf282a444c51bf4effe751706b4d0d63d7ca2cc51db21f902aa5b84e"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:ba95abd563b0d1b88401658665a260852a8e6c647026ee6a0a65589287681df8"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2014_aarch64.whl", hash = "sha256:192306cf881fe3467dda0e174a4f47bb3a8bb24b90c9cdfbdc248eec5fc0578c"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:f9ab5ef0718f6a8716695dea16d83b671b22c45e9c0c78fd807c32c0192e54b5"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-win32.whl", hash = "sha256:50308fcdbf8345e5ec224a5502b4215178bdb5e95456ead8ab1a69ffd94779cb"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-win_amd64.whl", hash = "sha256:4d9379c684efea80fdab02a3eb0169372bca7db13f9332cb67483b8dc8b67c37"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5594a125dae30d60e94f37797fc67ce3c744522de7992c7c360d02fdb34918f8"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:8ff129a5a0eb5ff16e45ca4fa70a6051da7f3de303c33b259063c19be0c43d35"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2014_aarch64.whl", hash = "sha256:3d9314ac785a5b75d5aaf924c5f21d6ca7e8df442e5cf4f0fefad4f6e284d422"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:f237278836dda412a325e9340ba2e6a84cb0f56b9244781e5b61f10b3905de88"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac614363a86cc53d8ba44b6c469831d1555947e69ab3276ae8d6edc219f570f7"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-macosx_10_9_x86_64.whl", hash = 
"sha256:302a8f37c224e7b5d72017d462a2be058e28f7be627bdd854066e16722d0fc0c"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux2014_aarch64.whl", hash = "sha256:6421d23d6a648e83ba2670a352bcd978542dad86829209f59d17a3f087f4afef"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84e105787f5e5d36ec6a581ff37a1048d12e638688074b2a00bcf402f9aa1c2"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6875eb8666f68ddbd39097867325bd22771f595b4e2b0149739b5623c8bf899b"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:27072a494ce621cc7a9096bbf60ed66826bb94db24b49b7359509e7951033e74"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_i686.whl", hash = "sha256:1949e09ea49b09c36d11a951b16ff2a05a0ffe969dda1846e4686ee342fe8646"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6ed3606832987018615f68e8ed716a7065c09a0fe94afd7c9ca1b6777f0ac6eb"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-win32.whl", hash = "sha256:d56c9ec41258fd3734db9f5e4d2faeabe48644ba9ca23b18e1839b3bdf093222"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-win_amd64.whl", hash = "sha256:e00a4bacb83a2627e8210cb353a2e31f04befc1155db2976e5e239dd66482278"}, - {file = "pycryptodomex-3.18.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:2dc4eab20f4f04a2d00220fdc9258717b82d31913552e766d5f00282c031b70a"}, - {file = "pycryptodomex-3.18.0-pp27-pypy_73-win32.whl", hash = "sha256:75672205148bdea34669173366df005dbd52be05115e919551ee97171083423d"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bec6c80994d4e7a38312072f89458903b65ec99bed2d65aa4de96d997a53ea7a"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d35a8ffdc8b05e4b353ba281217c8437f02c57d7233363824e9d794cf753c419"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76f0a46bee539dae4b3dfe37216f678769349576b0080fdbe431d19a02da42ff"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:71687eed47df7e965f6e0bf3cadef98f368d5221f0fb89d2132effe1a3e6a194"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73d64b32d84cf48d9ec62106aa277dbe99ab5fbfd38c5100bc7bddd3beb569f7"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbdcce0a226d9205560a5936b05208c709b01d493ed8307792075dedfaaffa5f"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58fc0aceb9c961b9897facec9da24c6a94c5db04597ec832060f53d4d6a07196"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:215be2980a6b70704c10796dd7003eb4390e7be138ac6fb8344bf47e71a8d470"}, - {file = "pycryptodomex-3.18.0.tar.gz", hash = "sha256:3e3ecb5fe979e7c1bb0027e518340acf7ee60415d79295e5251d13c68dde576e"}, -] pydantic = [ {file = "pydantic-2.5.0-py3-none-any.whl", hash = "sha256:7ce6e766c456ad026fe5712f7bcf036efc34bd5d107b3e669ef7ea01b3a9050c"}, {file = "pydantic-2.5.0.tar.gz", hash = "sha256:69bd6fb62d2d04b7055f59a396993486a2ee586c43a0b89231ce0000de07627c"}, @@ -8215,27 +8159,27 @@ sniffio = [ {file = "sniffio-1.3.0.tar.gz", hash = 
"sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] snowflake-connector-python = [ - {file = "snowflake-connector-python-3.1.1.tar.gz", hash = "sha256:2700503a5f99d6e22e412d7cf4fd2211296cc0e50b2a38ad9c6f48ddb8beff67"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3aec4ab6f6d66a0dc2b5bbd8fc2c11fd76090c63fdc65577af9d4e28055c51f2"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5d2589f39b1c1c91eda6711181afb7f197f7dd43204f26db48df90849d9f528b"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c540b4fe173cc9a24df285ce49c70fe0dadc6316b8a2160324c549086a71a118"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25007ccf5d9c0b87e29af40470f6f1e76d03621642a7492d62282215b7e9d67d"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:fff3caebd8b60cee09ad55674d12b8940b9d5f57a394c8467637167372710841"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7b7622be7bcad26786bf771341e3b4819df6e4d7858e5dd4c8700423ca7364e"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:260d259a79e6120bf58fcec9a52705fd02a430f296a77a1531720906b7a02f5e"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0163d5036f05a39977c6d7aba5e8bb1632be1117785a72e2602e3a34b89ded1c"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d38546ebcba7bca37a16cfcbbc0f8e7c19946b4e45e0c5dc2a8963f3b739958"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:484044c2d9aacd5c8a0a9d8d8b69b06352e3612f23c5e44d54771a96047d80b1"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e4a4aab55a4a3236625b738fad19524c9cef810fe041d567dc5dc1d9b1f9eb7"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:5d95eeaff7b085b0c8facab40391bede699ffc0865f2cdaa37b19a8429d47943"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a944a1862672552f8c00b98b576a8b16da46f9c5b918ba4b969bd7d1205c32a"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7abb142ba3ee5db6c61be0dc578fa10e59b7c1f33716b0c93ae6706b2a8bbee3"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:bf6ca8f8678dea6cf5275f69dbd9e4ebb18c2211be35379b65175e36e5953b92"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ceb263b95720ab645c2e60e37d436db51321e0192d399631d052387728911689"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:8b7fe82d8d1cdc90caadbcce419d3bcbf1bdeffb9bba974a81a46f389d8ee243"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d46b798507f6c7447e21c76bd71969e22e55fa848196f20de73b3e2b65373b5"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bdcce7069368b7b2ec8a855812c1b0e9e6bdf6b01660225ffff5ba163fa507d"}, - {file = 
"snowflake_connector_python-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:daedeff774cd68df05e68dbfa66e83a877e63a99461b8262eb5c8cd37e309aa7"}, + {file = "snowflake-connector-python-3.5.0.tar.gz", hash = "sha256:654e4a1f68a491544bd8f7c5ab02eb8531df67c5f4309d5253bd204044f8a1b3"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a365fa4f23be27a4a46d04f73a48ccb1ddad5b9558f100ba592a49571c90a33c"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5b648b8f32aa540e9adf14e84ea5d77a6c3c6cbc3cbcf172622a0b8db0e99384"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722dc0100c3247788aeb975a8a5941f2f757e8524d2626cf6fe78df02b6384fb"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7029b8776c5d2153ed2b0254dc23ae1e3bde141b6634fc6c77b919ed29d5bb42"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:3472703fc4f308343d925c41dab976a42e10192fa0b8b9025e80b083ad7dcf1b"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:40f4a376b6da875d70383b60c66ad3723f0bed21d8bdbf7afb39525cb70c70ef"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:8a08d8df6f1b5b5d0bf9145e6339dbeaf294392529629d0bd7e4dd3e49d7892c"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac16a00bb3824069303e119cd049858c2caf92d174f9486ba273d19abf06a18d"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a820148b64436621b5db79c2e7848d5d12ece13b0948281c19dd2f8a50e4dbe"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ffa8f95a767e5077e82cf290a43950f37cfc25e34935f038abc96494a1595a03"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ef70cd89aee56fbbaeb68dc1f7612598b0c8a470d16ddb68ca7657bd70cbf8d7"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:09ff23c1aa4bf9e148e491512a81b097ce0b1c2a870f3d0bb0dc5febf764c45c"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e696f133c57494dce57a68a92d1e2cf20334361400fe3c4c73637627f7d9c0ec"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0136a9fb45013ea3d50045acb3cedb50b2d5d6ac1d0f9adc538e28cf86a1386"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:133e2a8a5e7b59d84e83886bb516d290edbd0b92dd69304f8f7ac613faca2aeb"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c463d11b05b57c40eb83d84044d761535a855e498ffd52456e92eed333e43b17"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:cdd198dbc0aff373bb9e95f315cdc0b922ae61186ba9bd7da4950835827cd7f9"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d8769b95a46040261a46dc58757c59b26e6122466222d8b8e518ea6aa62e83d"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee97a8ac0aaf40a7b7420c8936a66d8d33376cd40498ac3d38efa7bb5712d14a"}, + {file 
= "snowflake_connector_python-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:e8cd747e2719ba44dd2ce0e9b1e6f8b03485b2b335a352f3b45138b56fad5888"}, ] sortedcontainers = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, diff --git a/pyproject.toml b/pyproject.toml index 8fc34df0e4..20c0534e60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ dbt-athena-community = {version = ">=1.2.0", optional = true} s3fs = {version = ">=2022.4.0", optional = true} gcsfs = {version = ">=2022.4.0", optional = true} botocore = {version = ">=1.28", optional = true} -# snowflake-connector-python = {version = ">=3.1.1", optional = true, extras = ["pandas"]} +snowflake-connector-python = {version = ">=3.5.0", optional = true, extras = ["pandas"]} cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} @@ -91,7 +91,7 @@ filesystem = ["s3fs", "botocore"] s3 = ["s3fs", "botocore"] gs = ["gcsfs"] az = ["adlfs"] -# snowflake = ["snowflake-connector-python"] +snowflake = ["snowflake-connector-python"] motherduck = ["duckdb", "pyarrow"] cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index 1b32596786..686ad2ffd3 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -296,9 +296,10 @@ def some_data(): pipeline.run(item, table_name="some_data").raise_on_failed_jobs() # should be able to load arrow with a new column - item, records = arrow_table_all_data_types(item_type, num_rows=200) - item = item.append_column("static_int", [[0] * 200]) - pipeline.run(item, table_name="some_data").raise_on_failed_jobs() + # TODO: uncomment when load_id fixed in normalizer + # item, records = arrow_table_all_data_types(item_type, num_rows=200) + # item = item.append_column("static_int", [[0] * 200]) + # pipeline.run(item, table_name="some_data").raise_on_failed_jobs() - schema = pipeline.default_schema - assert schema.tables['some_data']['columns']['static_int']['data_type'] == 'bigint' + # schema = pipeline.default_schema + # assert schema.tables['some_data']['columns']['static_int']['data_type'] == 'bigint' diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index c9e6c8b423..6064f6e00d 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -13,14 +13,13 @@ from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.destination.reference import WithStateSync from dlt.common.exceptions import DestinationHasFailedJobs, DestinationTerminalException, PipelineStateNotAvailable, UnknownDestinationModule from dlt.common.pipeline import PipelineContext from dlt.common.runtime.collector import LogCollector from dlt.common.schema.utils import new_column, new_table from dlt.common.utils import uniq_id -from dlt.destinations.sql_client import SqlClientBase from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractorStorage from dlt.extract.source import DltResource, DltSource @@ -1032,17 +1031,6 
@@ def generic(start): assert pipeline.default_schema.get_table("single_table")["resource"] == "state1" -@pytest.mark.parametrize("file_format", ("parquet", "insert_values", "jsonl")) -def test_columns_hint_with_file_formats(file_format: TLoaderFileFormat) -> None: - - @dlt.resource(write_disposition="replace", columns=[{"name": "text", "data_type": "text"}]) - def generic(start=8): - yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] - - pipeline = dlt.pipeline(destination='duckdb') - pipeline.run(generic(), loader_file_format=file_format) - - def test_remove_autodetect() -> None: now = pendulum.now() @@ -1130,7 +1118,7 @@ def test_resource_state_name_not_normalized() -> None: # get state from destination from dlt.pipeline.state_sync import load_state_from_destination - client: SqlClientBase + client: WithStateSync with pipeline.destination_client() as client: # type: ignore[assignment] state = load_state_from_destination(pipeline.pipeline_name, client) assert "airtable_emojis" in state["sources"] diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py index 7be06c364c..7870918c78 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -6,6 +6,7 @@ import dlt from dlt.common import json from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.capabilities import TLoaderFileFormat from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector from dlt.extract.storage import ExtractorStorage @@ -103,3 +104,14 @@ def users() -> Iterator[User]: expect_extracted_file( storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) ) + + +@pytest.mark.parametrize("file_format", ("parquet", "insert_values", "jsonl")) +def test_columns_hint_with_file_formats(file_format: TLoaderFileFormat) -> None: + + @dlt.resource(write_disposition="replace", columns=[{"name": "text", "data_type": "text"}]) + def generic(start=8): + yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] + + pipeline = dlt.pipeline(destination='duckdb') + pipeline.run(generic(), loader_file_format=file_format) From 2d260e8a62747bba0f41246a42e7ceca6e3c08d9 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 20 Nov 2023 00:25:39 +0100 Subject: [PATCH 69/73] updates pydantic helper --- dlt/common/libs/pydantic.py | 137 ++++++++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 39 deletions(-) diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index 18fafa0138..9af7104710 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -1,9 +1,12 @@ -from typing import Generic, Sequence, TypedDict, List, Type, Union, TypeVar, get_type_hints, get_args, Any +import inspect +from copy import copy +from typing import Generic, Sequence, Set, TypedDict, List, Type, Union, TypeVar, get_origin, get_type_hints, get_args, Any -from dlt.common.exceptions import MissingDependencyException, DltException +from dlt.common.exceptions import MissingDependencyException +from dlt.common.schema import DataValidationError from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns from dlt.common.data_types import py_type_to_sc_type -from dlt.common.typing import TDataItem, TDataItems, is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union +from dlt.common.typing import TDataItem, TDataItems, 
extract_union_types, is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union try: from pydantic import BaseModel, ValidationError, Json, create_model @@ -100,6 +103,33 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]]) - return result + +def column_mode_to_extra(column_mode: TSchemaEvolutionMode) -> str: + extra = "forbid" + if column_mode == "evolve": + extra = "allow" + elif column_mode == "discard_value": + extra = "ignore" + return extra + + +def extra_to_column_mode(extra: str) -> TSchemaEvolutionMode: + if extra == "forbid": + return "freeze" + if extra == "allow": + return "evolve" + return "discard_value" + + +def get_extra_from_model(model: Type[BaseModel]) -> str: + default_extra = "ignore" + if _PYDANTIC_2: + default_extra = model.model_config.get("extra", default_extra) + else: + default_extra = model.Config.extra or default_extra # type: ignore[attr-defined] + return default_extra + + def apply_schema_contract_to_model( model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMode, @@ -116,26 +146,46 @@ def apply_schema_contract_to_model( # create a lenient model that accepts any data model = create_model(model.__name__ + "Any", **{n:(Any, None) for n in model.__fields__}) # type: ignore[call-overload, attr-defined] elif data_mode == "discard_value": - raise ValueError("data_mode is discard_value. Cannot discard defined fields with validation errors using Pydantic models. Not yet implemented.") + raise NotImplementedError("data_mode is discard_value. Cannot discard defined fields with validation errors using Pydantic models.") - extra = "forbid" - if column_mode == "evolve": - extra = "allow" - elif column_mode == "discard_value": - extra = "ignore" + extra = column_mode_to_extra(column_mode) + + if extra == get_extra_from_model(model): + # no need to change the model + return model if _PYDANTIC_2: - config = model.model_config + config = copy(model.model_config) config["extra"] = extra # type: ignore[typeddict-item] else: - config = model.Config # type: ignore[attr-defined] + config = copy(model.Config) # type: ignore[attr-defined] config.extra = extra # type: ignore[attr-defined] - return create_model( # type: ignore[no-any-return, call-overload] + def _process_annotation(t_: Type[Any]) -> Any: + """Recursively recreates models with applied schema contract """ + if is_list_generic_type(t_): + l_t = get_args(t_)[0] + return get_origin(t_)[_process_annotation(l_t)] + elif is_dict_generic_type(t_): + k_t, v_t = get_args(t_) + return get_origin(t_)[k_t, _process_annotation(v_t)] + elif is_union(t_): + u_t_s = tuple(_process_annotation(u_t) for u_t in extract_union_types(t_)) + return Union[u_t_s] + elif inspect.isclass(t_) and issubclass(t_, BaseModel): + return apply_schema_contract_to_model(t_, column_mode, data_mode) + return t_ + + new_model: Type[_TPydanticModel] = create_model( # type: ignore[call-overload] model.__name__ + "Extra" + extra.title(), __config__ = config, - **{n:(f.annotation, f) for n, f in model.__fields__.items()} # type: ignore[attr-defined] + **{n:(_process_annotation(f.annotation), f) for n, f in model.__fields__.items()} # type: ignore[attr-defined] ) + # pass dlt config along + dlt_config = getattr(model, "dlt_config", None) + if dlt_config: + new_model.dlt_config = dlt_config # type: ignore[attr-defined] + return new_model def create_list_model(model: Type[_TPydanticModel], data_mode: TSchemaEvolutionMode = "freeze") -> Type[ListModel[_TPydanticModel]]: @@ -152,6 +202,7 @@ def 
create_list_model(model: Type[_TPydanticModel], data_mode: TSchemaEvolutionM def validate_items( + table_name: str, list_model: Type[ListModel[_TPydanticModel]], items: List[TDataItem], column_mode: TSchemaEvolutionMode, @@ -164,57 +215,65 @@ def validate_items( try: return list_model(items=items).items except ValidationError as e: - delta_idx = 0 + deleted: Set[int] = set() for err in e.errors(): + # TODO: we can get rid of most of the code if we use LenientList as explained above if len(err["loc"]) >= 2: - err_idx = int(err["loc"][1]) - delta_idx - err_item = items[err_idx] + err_idx = int(err["loc"][1]) + if err_idx in deleted: + # already dropped + continue + err_item = items[err_idx - len(deleted)] else: # top level error which means misalignment of list model and items - raise FullValidationError(list_model, items, e) from e + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, items) from e # raise on freeze if err["type"] == 'extra_forbidden': if column_mode == "freeze": - raise FullValidationError(list_model, err_item, e) from e + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, err_item) from e elif column_mode == "discard_row": - items.pop(err_idx) - delta_idx += 1 - + # pop at the right index + items.pop(err_idx - len(deleted)) + # store original index so we do not pop again + deleted.add(err_idx) + else: + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") else: if data_mode == "freeze": - raise FullValidationError(list_model, err_item, e) from e + raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", list_model, {"data_type": "freeze"}, err_item) from e elif data_mode == "discard_row": - items.pop(err_idx) - delta_idx += 1 + items.pop(err_idx - len(deleted)) + deleted.add(err_idx) + else: + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") # validate again with error items removed - return validate_items(list_model, items, column_mode, data_mode) + return validate_items(table_name, list_model, items, column_mode, data_mode) -def validate_item(model: Type[_TPydanticModel], item: TDataItems, column_mode: TSchemaEvolutionMode, data_mode: TSchemaEvolutionMode) -> _TPydanticModel: +def validate_item( + table_name: str, + model: Type[_TPydanticModel], + item: TDataItems, + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode +) -> _TPydanticModel: """Validates `item` against model `model` and returns an instance of it""" try: return model.parse_obj(item) except ValidationError as e: - for err in e.errors(include_url=False, include_context=False): + for err in e.errors(): # raise on freeze if err["type"] == 'extra_forbidden': if column_mode == "freeze": - raise FullValidationError(model, item, e) from e + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", model, {"columns": "freeze"}, item) from e elif column_mode == "discard_row": return None + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") else: if data_mode == "freeze": - raise FullValidationError(model, item, e) from e + raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", model, {"data_type": "freeze"}, item) from e elif data_mode == "discard_row": return None - # validate again with error items removed - return validate_item(model, item, column_mode, 
data_mode) - - -class FullValidationError(ValueError, DltException): - def __init__(self, validator: Type[BaseModel], data_item: TDataItems, original_exception: Exception) ->None: - self.original_exception = original_exception - self.validator = validator - self.data_item = data_item - super().__init__(f"Extracted data item could not be validated with {validator}. Original message: {original_exception}") + raise NotImplementedError(f"{data_mode} data mode not implemented for Pydantic validation") + raise AssertionError("unreachable") \ No newline at end of file From aa990f5819b82e512c647df92b766495579a760c Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 20 Nov 2023 00:26:02 +0100 Subject: [PATCH 70/73] improves contract violation exception --- dlt/common/schema/__init__.py | 3 ++- dlt/common/schema/exceptions.py | 42 +++++++++++++++++++++++++++------ dlt/common/schema/schema.py | 25 +++++++++++++------- 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/dlt/common/schema/__init__.py b/dlt/common/schema/__init__.py index 426264a1d9..ac320bef0a 100644 --- a/dlt/common/schema/__init__.py +++ b/dlt/common/schema/__init__.py @@ -1,10 +1,11 @@ from dlt.common.schema.typing import TSchemaContractDict, TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase from dlt.common.schema.typing import COLUMN_HINTS from dlt.common.schema.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE +from dlt.common.schema.exceptions import DataValidationError from dlt.common.schema.utils import verify_schema_hash __all__ = [ "TSchemaUpdate", "TSchemaTables", "TTableSchema", "TStoredSchema", "TTableSchemaColumns", "TColumnHint", "TColumnSchema", "TColumnSchemaBase", "COLUMN_HINTS", "Schema", "verify_schema_hash", "TSchemaContractDict", - "DEFAULT_SCHEMA_CONTRACT_MODE" + "DEFAULT_SCHEMA_CONTRACT_MODE", "DataValidationError" ] diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 57c1d5c1df..96df6b7418 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -2,6 +2,7 @@ from dlt.common.exceptions import DltException from dlt.common.data_types import TDataType +from dlt.common.schema.typing import TSchemaContractDict, TSchemaContractEntities, TSchemaEvolutionMode class SchemaException(DltException): @@ -16,11 +17,6 @@ def __init__(self, name: str) -> None: super().__init__(f"{name} is an invalid schema/source name. The source or schema name must be a valid Python identifier ie. a snake case function name and have maximum {self.MAXIMUM_SCHEMA_NAME_LENGTH} characters. Ideally should contain only small letters, numbers and underscores.") -# class InvalidDatasetName(ValueError, SchemaException): -# def __init__(self, name: str, normalized_name: str) -> None: -# self.name = name -# super().__init__(f"{name} is an invalid dataset name. The dataset name must conform to wide range of destinations and ideally should contain only small letters, numbers and underscores. 
Try {normalized_name} instead as suggested by current naming module.") - class InvalidDatasetName(ValueError, SchemaException): def __init__(self, destination_name: str) -> None: self.destination_name = destination_name @@ -71,11 +67,43 @@ def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engi super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}") -class SchemaFrozenException(SchemaException): - def __init__(self, schema_name: str, table_name: str, msg: str) -> None: +class DataValidationError(SchemaException): + def __init__( + self, + schema_name: str, + table_name: str, + column_name: str, + contract_entity: TSchemaContractEntities, + contract_mode: TSchemaEvolutionMode, + table_schema: Any, + schema_contract: TSchemaContractDict, + data_item: Any = None, + extended_info: str = None + ) -> None: + """Raised when `data_item` violates `contract_mode` on a `contract_entity` as defined by `table_schema` + + Schema, table and column names are given as a context and full `schema_contract` and causing `data_item` as an evidence. + """ + msg = "" + if schema_name: + msg = f"Schema: {schema_name} " + msg += f"Table: {table_name} " + if column_name: + msg += f"Column: {column_name}" + msg = "In " + msg + f" . Contract on {contract_entity} with mode {contract_mode} is violated. " + (extended_info or "") super().__init__(msg) self.schema_name = schema_name self.table_name = table_name + self.column_name = column_name + + # violated contract + self.contract_entity = contract_entity + self.contract_mode = contract_mode + + # some evidence + self.table_schema = table_schema + self.schema_contract = schema_contract + self.data_item = data_item class UnknownTableException(SchemaException): diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index df7424e563..67ae345845 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -15,7 +15,7 @@ from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict -from dlt.common.schema.exceptions import SchemaFrozenException +from dlt.common.schema.exceptions import DataValidationError DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractDict = { @@ -202,12 +202,13 @@ def apply_schema_contract( self, schema_contract: TSchemaContractDict, partial_table: TPartialTableSchema, + data_item: TDataItem = None, raise_on_freeze: bool = True ) -> Tuple[TPartialTableSchema, List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]]]: """ Checks if `schema_contract` allows for the `partial_table` to update the schema. It applies the contract dropping the affected columns or the whole `partial_table`. It generates and returns a set of filters that should be applied to incoming data in order to modify it - so it conforms to the contract. + so it conforms to the contract. `data_item` is provided only as evidence in case DataValidationError is raised. Example `schema_contract`: { @@ -225,7 +226,7 @@ def apply_schema_contract( Returns a tuple where a first element is modified partial table and the second is a list of filters. The modified partial may be None in case the whole table is not allowed. Each filter is a tuple of (table|columns, entity name, freeze | discard_row | discard_value). 
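Aside, not part of the patch: a minimal sketch of how the new DataValidationError carries the violation context when a frozen contract rejects a new table. It is written against the code added in this commit; the schema name "demo" and the "events"/"id" table and column are made up for illustration.

    # Illustrative sketch only, assuming the code added in this commit.
    from dlt.common.schema import Schema, DataValidationError
    from dlt.common.schema.utils import new_table, new_column

    schema = Schema("demo")
    # shorthand "freeze" expands to freezing tables, columns and data_type alike
    frozen_contract = Schema.expand_schema_contract_settings("freeze")

    try:
        # "events" is unknown to the fresh schema, so the "tables" entity is violated
        schema.apply_schema_contract(
            frozen_contract, new_table("events", columns=[new_column("id", "bigint")])
        )
    except DataValidationError as ex:
        # the exception now exposes structured context instead of a plain message
        print(ex.schema_name, ex.table_name, ex.contract_entity, ex.contract_mode)
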
- Note: by default `freeze` immediately raises SchemaFrozenException which is convenient in most use cases + Note: by default `freeze` immediately raises DataValidationError which is convenient in most use cases """ # default settings allow all evolutions, skip all else @@ -241,7 +242,9 @@ def apply_schema_contract( # check case where we have a new table if is_new_table and schema_contract["tables"] != "evolve": if raise_on_freeze and schema_contract["tables"] == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to add table {table_name} but new tables are frozen.") + raise DataValidationError( + self.name, table_name, None, "tables", "freeze", None, schema_contract, data_item, f"Trying to add table {table_name} but new tables are frozen." + ) # filter tables with name below return None, [("tables", table_name, schema_contract["tables"])] @@ -260,7 +263,9 @@ def apply_schema_contract( # new column and contract prohibits that if column_mode != "evolve" and not is_variant: if raise_on_freeze and column_mode == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to add column {column_name} to table {table_name} but columns are frozen.") + raise DataValidationError( + self.name, table_name, column_name, "columns", "freeze", existing_table, schema_contract, data_item, f"Trying to add column {column_name} to table {table_name} but columns are frozen." + ) # filter column with name below filters.append(("columns", column_name, column_mode)) # pop the column @@ -269,7 +274,9 @@ def apply_schema_contract( # variant (data type evolution) and contract prohibits that if data_mode != "evolve" and is_variant: if raise_on_freeze and data_mode == "freeze": - raise SchemaFrozenException(self.name, table_name, f"Trying to create new variant column {column_name} to table {table_name} but data_types are frozen.") + raise DataValidationError( + self.name, table_name, column_name, "data_type", "freeze", existing_table, schema_contract, data_item, f"Trying to create new variant column {column_name} to table {table_name} but data_types are frozen." + ) # filter column with name below filters.append(("columns", column_name, data_mode)) # pop the column @@ -278,11 +285,11 @@ def apply_schema_contract( return partial_table, filters @staticmethod - def expand_schema_contract_settings(settings: TSchemaContract) -> TSchemaContractDict: - """Expand partial or shorthand settings into full settings dictionary""" + def expand_schema_contract_settings(settings: TSchemaContract, default: TSchemaContractDict = None) -> TSchemaContractDict: + """Expand partial or shorthand settings into full settings dictionary using `default` for unset entities""" if isinstance(settings, str): settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) - return cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **settings}) + return cast(TSchemaContractDict, {**(default or DEFAULT_SCHEMA_CONTRACT_MODE), **(settings or {})}) def resolve_contract_settings_for_table(self, table_name: str, new_table_schema: TTableSchema = None) -> TSchemaContractDict: """Resolve the exact applicable schema contract settings for the table `table_name`. 
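Another small illustration, not part of the patch: expand_schema_contract_settings now takes an optional default, and shorthand strings expand to all three entities. The expected values in the comments follow from the code above and are stated as assumptions, not captured output.

    from dlt.common.schema import Schema

    # a shorthand string applies the same mode to tables, columns and data_type
    full = Schema.expand_schema_contract_settings("discard_row")
    # full == {"tables": "discard_row", "columns": "discard_row", "data_type": "discard_row"}

    # a partial dict is completed from the default contract (everything "evolve")
    partial = Schema.expand_schema_contract_settings({"columns": "freeze"})
    # partial["tables"] == "evolve", partial["columns"] == "freeze", partial["data_type"] == "evolve"
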
`new_table_schema` is added to the tree during the resolution.""" From a6a782b8df7a87341a67da575a99be72ff963897 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 20 Nov 2023 00:26:45 +0100 Subject: [PATCH 71/73] splits source and resource in extract, adds more tests --- .../impl/qdrant/qdrant_adapter.py | 3 +- .../impl/weaviate/weaviate_adapter.py | 3 +- dlt/extract/__init__.py | 7 + dlt/extract/decorators.py | 3 +- dlt/extract/extractors.py | 4 +- dlt/extract/{schema.py => hints.py} | 41 +- dlt/extract/resource.py | 494 +++++++++++++++++ dlt/extract/source.py | 499 +----------------- dlt/extract/typing.py | 5 + dlt/extract/validation.py | 33 +- dlt/helpers/airflow_helper.py | 2 +- dlt/normalize/items_normalizers.py | 2 +- dlt/pipeline/mark.py | 2 +- dlt/pipeline/pipeline.py | 8 +- dlt/pipeline/state_sync.py | 2 +- dlt/pipeline/trace.py | 2 +- dlt/reflection/script_inspector.py | 3 +- dlt/sources/__init__.py | 3 +- docs/examples/archive/sources/rasa/rasa.py | 2 +- docs/examples/incremental_loading/zendesk.py | 5 +- .../code/zendesk-snippets.py | 5 +- tests/common/schema/test_schema_contract.py | 35 +- tests/extract/test_decorators.py | 11 +- tests/extract/test_extract.py | 5 +- tests/extract/test_incremental.py | 2 +- tests/extract/test_sources.py | 13 +- tests/extract/test_validation.py | 96 +++- tests/libs/test_pydantic.py | 162 +++++- tests/load/pipeline/test_drop.py | 2 +- tests/load/pipeline/test_merge_disposition.py | 2 +- tests/load/pipeline/test_pipelines.py | 2 +- tests/load/pipeline/utils.py | 157 +----- tests/pipeline/test_pipeline.py | 2 +- tests/pipeline/test_pipeline_extra.py | 65 ++- tests/pipeline/test_pipeline_trace.py | 2 +- tests/pipeline/test_schema_contracts.py | 8 +- tests/pipeline/utils.py | 153 ++++++ 37 files changed, 1096 insertions(+), 749 deletions(-) rename dlt/extract/{schema.py => hints.py} (90%) create mode 100644 dlt/extract/resource.py diff --git a/dlt/destinations/impl/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py index ac51bd5f42..f37a1f6cd8 100644 --- a/dlt/destinations/impl/qdrant/qdrant_adapter.py +++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py @@ -1,8 +1,7 @@ from typing import Any from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns -from dlt.extract.decorators import resource as make_resource -from dlt.extract.source import DltResource +from dlt.extract import DltResource, resource as make_resource VECTORIZE_HINT = "x-qdrant-embed" diff --git a/dlt/destinations/impl/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py index 6829197273..bbb3f1c9da 100644 --- a/dlt/destinations/impl/weaviate/weaviate_adapter.py +++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py @@ -1,8 +1,7 @@ from typing import Dict, Any, Literal, Set, get_args from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns -from dlt.extract.decorators import resource as make_resource -from dlt.extract.source import DltResource +from dlt.extract import DltResource, resource as make_resource TTokenizationTMethod = Literal["word", "lowercase", "whitespace", "field"] TOKENIZATION_METHODS: Set[TTokenizationTMethod] = set(get_args(TTokenizationTMethod)) diff --git a/dlt/extract/__init__.py b/dlt/extract/__init__.py index e69de29bb2..cc6ff15759 100644 --- a/dlt/extract/__init__.py +++ b/dlt/extract/__init__.py @@ -0,0 +1,7 @@ +from dlt.extract.resource import DltResource, with_table_name +from dlt.extract.source import DltSource +from dlt.extract.decorators import source, resource, 
transformer, defer +from dlt.extract.incremental import Incremental +from dlt.extract.wrappers import wrap_additional_type + +__all__ = ["DltResource", "DltSource", "with_table_name", "source", "resource", "transformer", "defer", "Incremental", "wrap_additional_type"] diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 1689bfaafd..b8abbc1d57 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -25,7 +25,8 @@ from dlt.extract.incremental import IncrementalResourceWrapper from dlt.extract.typing import TTableHintTemplate -from dlt.extract.source import DltResource, DltSource, TUnboundDltResource +from dlt.extract.source import DltSource +from dlt.extract.resource import DltResource, TUnboundDltResource @configspec diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index d1b636b6d3..0ec8aed968 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -13,7 +13,7 @@ from dlt.common.schema import Schema, utils from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns, TPartialTableSchema -from dlt.extract.source import DltResource +from dlt.extract.resource import DltResource from dlt.extract.typing import TableNameMeta from dlt.extract.storage import ExtractorStorage, ExtractorItemStorage try: @@ -155,7 +155,7 @@ def _compute_and_update_table(self, resource: DltResource, table_name: str, item diff_table = computed_table # apply contracts - diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table) + diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table, data_item=items) # merge with schema table if diff_table: diff --git a/dlt/extract/schema.py b/dlt/extract/hints.py similarity index 90% rename from dlt/extract/schema.py rename to dlt/extract/hints.py index 675c6b0f47..19d503f970 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/hints.py @@ -26,14 +26,15 @@ class TTableSchemaTemplate(TypedDict, total=False): incremental: Incremental[Any] schema_contract: TTableHintTemplate[TSchemaContract] validator: ValidateItem + original_columns: TTableHintTemplate[TAnySchemaColumns] -class DltResourceSchema: + +class DltResourceHints: def __init__(self, table_schema_template: TTableSchemaTemplate = None): self.__qualname__ = self.__name__ = self.name self._table_name_hint_fun: TFunHintTemplate[str] = None self._table_has_other_dynamic_hints: bool = False self._table_schema_template: TTableSchemaTemplate = None - self._original_columns: TTableHintTemplate[TAnySchemaColumns] = None if table_schema_template: self.set_template(table_schema_template) @@ -70,6 +71,10 @@ def columns(self) -> TTableHintTemplate[TTableSchemaColumns]: return None return self._table_schema_template.get("columns") + @property + def schema_contract(self) -> TTableHintTemplate[TSchemaContract]: + return self._table_schema_template.get("schema_contract") + def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: """Computes the table schema based on hints and column definitions passed during resource creation. 
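A short usage sketch, not part of the patch, showing that the renamed DltResourceHints base still drives hint application and table schema computation through the public resource API; the "users" resource and its columns are made up for the example.

    import dlt

    @dlt.resource(name="users", write_disposition="merge", primary_key="id")
    def users():
        yield [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]

    r = users()
    # apply_hints() and compute_table_schema() come from DltResourceHints
    r.apply_hints(columns=[{"name": "name", "data_type": "text"}])
    table = r.compute_table_schema()
    assert table["name"] == "users"
    assert table["columns"]["name"]["data_type"] == "text"
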
`item` parameter is used to resolve table hints based on data""" if not self._table_schema_template: @@ -85,7 +90,7 @@ def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: if self._table_name_hint_fun and item is None: raise DataItemRequiredForDynamicTableHints(self.name) # resolve - resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items() if k not in ["incremental", "validator"]} # type: ignore + resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items() if k not in ["incremental", "validator", "original_columns"]} # type: ignore table_schema = self._merge_keys(resolved_template) table_schema["resource"] = self.name validate_dict_ignoring_xkeys( @@ -118,10 +123,6 @@ def apply_hints( Please note that for efficient incremental loading, the resource must be aware of the Incremental by accepting it as one if its arguments and then using is to skip already loaded data. In non-aware resources, `dlt` will filter out the loaded values, however the resource will yield all the values again. """ - # keep original columns: ie in case it is a Pydantic model - if columns is not None: - self._original_columns = columns - t = None if not self._table_schema_template: # if there's no template yet, create and set new one @@ -142,6 +143,8 @@ def apply_hints( if write_disposition: t["write_disposition"] = write_disposition if columns is not None: + # keep original columns: ie in case it is a Pydantic model + t["original_columns"] = columns # if callable then override existing if callable(columns) or callable(t["columns"]): t["columns"] = ensure_table_schema_columns_hint(columns) @@ -170,7 +173,7 @@ def apply_hints( t.pop("schema_contract", None) # recreate validator if columns definition or contract changed if schema_contract is not None or columns is not None: - t["validator"], schema_contract = create_item_validator(self._original_columns, t.get("schema_contract")) + t["validator"], schema_contract = create_item_validator(t.get("original_columns"), t.get("schema_contract")) if schema_contract is not None: t["schema_contract"] = schema_contract @@ -183,7 +186,7 @@ def apply_hints( self.set_template(t) def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: - DltResourceSchema.validate_dynamic_hints(table_schema_template) + DltResourceHints.validate_dynamic_hints(table_schema_template) # if "name" is callable in the template then the table schema requires actual data item to be inferred name_hint = table_schema_template.get("name") if callable(name_hint): @@ -228,9 +231,9 @@ def _merge_keys(t_: TTableSchemaTemplate) -> TPartialTableSchema: # assert not callable(t_["merge_key"]) # assert not callable(t_["primary_key"]) if "primary_key" in t_: - DltResourceSchema._merge_key("primary_key", t_.pop("primary_key"), partial) # type: ignore + DltResourceHints._merge_key("primary_key", t_.pop("primary_key"), partial) # type: ignore if "merge_key" in t_: - DltResourceSchema._merge_key("merge_key", t_.pop("merge_key"), partial) # type: ignore + DltResourceHints._merge_key("merge_key", t_.pop("merge_key"), partial) # type: ignore return partial @@ -246,21 +249,25 @@ def new_table_template( table_format: TTableHintTemplate[TTableFormat] = None ) -> TTableSchemaTemplate: validator, schema_contract = create_item_validator(columns, schema_contract) + clean_columns = columns if columns is not None: - columns = ensure_table_schema_columns_hint(columns) - if not callable(columns): - 
columns = columns.values() # type: ignore + clean_columns = ensure_table_schema_columns_hint(columns) + if not callable(clean_columns): + clean_columns = clean_columns.values() # type: ignore # create a table schema template where hints can be functions taking TDataItem new_template: TTableSchemaTemplate = new_table( table_name, # type: ignore parent_table_name, # type: ignore write_disposition=write_disposition, # type: ignore - columns=columns, # type: ignore + columns=clean_columns, # type: ignore schema_contract=schema_contract, # type: ignore table_format=table_format # type: ignore ) if not table_name: new_template.pop("name") + # remember original columns + if columns is not None: + new_template["original_columns"] = columns # always remove resource new_template.pop("resource", None) # type: ignore if primary_key: @@ -269,12 +276,12 @@ def new_table_template( new_template["merge_key"] = merge_key if validator: new_template["validator"] = validator - DltResourceSchema.validate_dynamic_hints(new_template) + DltResourceHints.validate_dynamic_hints(new_template) return new_template @staticmethod def validate_dynamic_hints(template: TTableSchemaTemplate) -> None: table_name = template.get("name") # if any of the hints is a function then name must be as well - if any(callable(v) for k, v in template.items() if k not in ["name", "incremental", "validator"]) and not callable(table_name): + if any(callable(v) for k, v in template.items() if k not in ["name", "incremental", "validator", "original_columns"]) and not callable(table_name): raise InconsistentTableTemplate(f"Table name {table_name} must be a function if any other table hint is a function") diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py new file mode 100644 index 0000000000..2c3018e77d --- /dev/null +++ b/dlt/extract/resource.py @@ -0,0 +1,494 @@ +from copy import deepcopy +import inspect +from typing import AsyncIterable, AsyncIterator, ClassVar, Callable, Iterable, Iterator, Union, Any, Optional + +from dlt.common.configuration.resolve import inject_section +from dlt.common.configuration.specs import known_sections +from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.typing import AnyFun, DictStrAny, StrAny, TDataItem, TDataItems, NoneType +from dlt.common.configuration.container import Container +from dlt.common.pipeline import PipelineContext, StateInjectableContext, resource_state, pipeline_state +from dlt.common.utils import flatten_list_or_items, get_callable_name, uniq_id + +from dlt.extract.typing import (DataItemWithMeta, ItemTransformFunc, ItemTransformFunctionWithMeta, TableNameMeta, + FilterItem, MapItem, YieldMapItem, ValidateItem) +from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep +from dlt.extract.hints import DltResourceHints, TTableSchemaTemplate +from dlt.extract.incremental import Incremental, IncrementalResourceWrapper +from dlt.extract.exceptions import ( + InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, + InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, + InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, + InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer +) +from dlt.extract.wrappers import wrap_additional_type + + +def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: + """Marks `item` to be dispatched to table 
`table_name` when yielded from resource function.""" + return DataItemWithMeta(TableNameMeta(table_name), item) + + +class DltResource(Iterable[TDataItem], DltResourceHints): + """Implements dlt resource. Contains a data pipe that wraps a generating item and table schema that can be adjusted""" + Empty: ClassVar["DltResource"] = None + source_name: str + """Name of the source that contains this instance of the source, set when added to DltResourcesDict""" + section: str + """A config section name""" + + def __init__( + self, + pipe: Pipe, + table_schema_template: TTableSchemaTemplate, + selected: bool, + incremental: IncrementalResourceWrapper = None, + section: str = None, + args_bound: bool = False + ) -> None: + self.section = section + self.selected = selected + self._pipe = pipe + self._args_bound = args_bound + self._explicit_args: DictStrAny = None + if incremental and not self.incremental: + self.add_step(incremental) + self.source_name = None + super().__init__(table_schema_template) + + @classmethod + def from_data( + cls, + data: Any, + name: str = None, + section: str = None, + table_schema_template: TTableSchemaTemplate = None, + selected: bool = True, + data_from: Union["DltResource", Pipe] = None, + incremental: IncrementalResourceWrapper = None + ) -> "DltResource": + if data is None: + raise InvalidResourceDataTypeIsNone(name, data, NoneType) # type: ignore + + if isinstance(data, DltResource): + return data + + if isinstance(data, Pipe): + return cls(data, table_schema_template, selected, incremental=incremental, section=section) + + if callable(data): + name = name or get_callable_name(data) + + # if generator, take name from it + if inspect.isgenerator(data): + name = name or get_callable_name(data) # type: ignore + + # name is mandatory + if not name: + raise ResourceNameMissing() + + # wrap additional types + data = wrap_additional_type(data) + + # several iterable types are not allowed and must be excluded right away + if isinstance(data, (AsyncIterator, AsyncIterable)): + raise InvalidResourceDataTypeAsync(name, data, type(data)) + if isinstance(data, (str, dict)): + raise InvalidResourceDataTypeBasic(name, data, type(data)) + + # check if depends_on is a valid resource + parent_pipe: Pipe = None + if data_from is not None: + DltResource._ensure_valid_transformer_resource(name, data) + parent_pipe = DltResource._get_parent_pipe(name, data_from) + + # create resource from iterator, iterable or generator function + if isinstance(data, (Iterable, Iterator)) or callable(data): + pipe = Pipe.from_data(name, data, parent=parent_pipe) + return cls(pipe, table_schema_template, selected, incremental=incremental, section=section, args_bound=not callable(data)) + else: + # some other data type that is not supported + raise InvalidResourceDataType(name, data, type(data), f"The data type of supplied type is {type(data).__name__}") + + @property + def name(self) -> str: + """Resource name inherited from the pipe""" + return self._pipe.name + + def with_name(self, new_name: str) -> "DltResource": + """Clones the resource with a new name. 
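Not part of the patch: a tiny sketch of with_name(), which clones the resource together with its hints so the same generator can be loaded under two table names. The pipeline name is made up and duckdb is used only as a convenient local destination.

    import dlt

    @dlt.resource
    def numbers():
        yield from ({"n": i} for i in range(3))

    pipeline = dlt.pipeline(pipeline_name="clone_demo", destination="duckdb")
    # loads to tables "numbers" and "numbers_copy", each with its own resource state
    pipeline.run([numbers(), numbers().with_name("numbers_copy")])
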
Such resource keeps separate state and loads data to `new_name` table by default.""" + return self._clone(new_name=new_name, with_parent=True) + + @property + def is_transformer(self) -> bool: + """Checks if the resource is a transformer that takes data from another resource""" + return self._pipe.has_parent + + @property + def requires_args(self) -> bool: + """Checks if resource has unbound arguments""" + try: + self._pipe.ensure_gen_bound() + return False + except (TypeError, ParametrizedResourceUnbound): + return True + + @property + def incremental(self) -> IncrementalResourceWrapper: + """Gets incremental transform if it is in the pipe""" + incremental: IncrementalResourceWrapper = None + step_no = self._pipe.find(IncrementalResourceWrapper, Incremental) + if step_no >= 0: + incremental = self._pipe.steps[step_no] # type: ignore + return incremental + + @property + def validator(self) -> Optional[ValidateItem]: + """Gets validator transform if it is in the pipe""" + validator: ValidateItem = None + step_no = self._pipe.find(ValidateItem) + if step_no >= 0: + validator = self._pipe.steps[step_no] # type: ignore[assignment] + return validator + + @validator.setter + def validator(self, validator: Optional[ValidateItem]) -> None: + """Add/remove or replace the validator in pipe""" + step_no = self._pipe.find(ValidateItem) + if step_no >= 0: + self._pipe.remove_step(step_no) + if validator: + self.add_step(validator, insert_at=step_no if step_no >= 0 else None) + + def pipe_data_from(self, data_from: Union["DltResource", Pipe]) -> None: + """Replaces the parent in the transformer resource pipe from which the data is piped.""" + if self.is_transformer: + DltResource._ensure_valid_transformer_resource(self.name, self._pipe.gen) + else: + raise ResourceNotATransformer(self.name, "Cannot pipe data into resource that is not a transformer.") + parent_pipe = self._get_parent_pipe(self.name, data_from) + self._pipe.parent = parent_pipe + + def add_pipe(self, data: Any) -> None: + """Creates additional pipe for the resource from the specified data""" + # TODO: (1) self resource cannot be a transformer (2) if data is resource both self must and it must be selected/unselected + cannot be tranformer + raise InvalidResourceDataTypeMultiplePipes(self.name, data, type(data)) + + def select_tables(self, *table_names: Iterable[str]) -> "DltResource": + """For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. + + Both `with_table_name` marker and data-based (function) table name hints are supported. + """ + def _filter(item: TDataItem, meta: Any = None) -> bool: + is_in_meta = isinstance(meta, TableNameMeta) and meta.table_name in table_names + is_in_dyn = self._table_name_hint_fun and self._table_name_hint_fun(item) in table_names + return is_in_meta or is_in_dyn + + # add filtering function at the end of pipe + self.add_filter(_filter) + return self + + def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` + + `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically + + Args: + item_map (ItemTransformFunc[TDataItem]): A function taking a single data item and optional meta argument. Returns transformed data item. + insert_at (int, optional): At which step in pipe to insert the mapping. 
Defaults to None which inserts after last step + + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(MapItem(item_map)) + else: + self._pipe.insert_step(MapItem(item_map), insert_at) + return self + + def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` + + `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. It may yield 0 or more data items and be used to + ie. pivot an item into sequence of rows. + + Args: + item_map (ItemTransformFunc[Iterator[TDataItem]]): A function taking a single data item and optional meta argument. Yields 0 or more data items. + insert_at (int, optional): At which step in pipe to insert the generator. Defaults to None which inserts after last step + + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(YieldMapItem(item_map)) + else: + self._pipe.insert_step(YieldMapItem(item_map), insert_at) + return self + + def add_filter(self, item_filter: ItemTransformFunc[bool], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` + + `item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically + + Args: + item_filter (ItemTransformFunc[bool]): A function taking a single data item and optional meta argument. Returns bool. If True, item is kept + insert_at (int, optional): At which step in pipe to insert the filter. Defaults to None which inserts after last step + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(FilterItem(item_filter)) + else: + self._pipe.insert_step(FilterItem(item_filter), insert_at) + return self + + def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 + """Adds a limit `max_items` to the resource pipe + + This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. It is + a no-op for transformers. Those should be limited by their input data. 
+ + Args: + max_items (int): The maximum number of items to yield + Returns: + "DltResource": returns self + """ + def _gen_wrap(gen: TPipeStep) -> TPipeStep: + """Wrap a generator to take the first `max_items` records""" + nonlocal max_items + count = 0 + if inspect.isfunction(gen): + gen = gen() + try: + for i in gen: # type: ignore # TODO: help me fix this later + yield i + count += 1 + if count == max_items: + return + finally: + if inspect.isgenerator(gen): + gen.close() + return + # transformers should be limited by their input, so we only limit non-transformers + if not self.is_transformer: + self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) + return self + + def add_step(self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None) -> "DltResource": # noqa: A003 + if insert_at is None: + self._pipe.append_step(item_transform) + else: + self._pipe.insert_step(item_transform, insert_at) + return self + + def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: + super().set_template(table_schema_template) + incremental = self.incremental + # try to late assign incremental + if table_schema_template.get("incremental") is not None: + if incremental: + incremental._incremental = table_schema_template["incremental"] + else: + # if there's no wrapper add incremental as a transform + incremental = table_schema_template["incremental"] # type: ignore + self.add_step(incremental) + + if incremental: + primary_key = table_schema_template.get("primary_key", incremental.primary_key) + if primary_key is not None: + incremental.primary_key = primary_key + + if table_schema_template.get('validator') is not None: + self.validator = table_schema_template['validator'] + + def bind(self, *args: Any, **kwargs: Any) -> "DltResource": + """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" + if self._args_bound: + raise TypeError(f"Parametrized resource {self.name} is not callable") + orig_gen = self._pipe.gen + gen = self._pipe.bind_gen(*args, **kwargs) + if isinstance(gen, DltResource): + # the resource returned resource: update in place + old_pipe = self._pipe + self.__dict__.clear() + self.__dict__.update(gen.__dict__) + # keep old pipe instance + self._pipe = old_pipe + self._pipe.__dict__.clear() + # write props from new pipe instance + self._pipe.__dict__.update(gen._pipe.__dict__) + elif isinstance(gen, Pipe): + # the resource returned pipe: just replace pipe + self._pipe.__dict__.clear() + # write props from new pipe instance + self._pipe.__dict__.update(gen.__dict__) + else: + self._args_bound = True + self._set_explicit_args(orig_gen, None, *args, **kwargs) # type: ignore + return self + + @property + def explicit_args(self) -> StrAny: + """Returns a dictionary of arguments used to parametrize the resource. Does not include defaults and injected args.""" + if not self._args_bound: + raise TypeError(f"Resource {self.name} is not yet parametrized") + return self._explicit_args + + @property + def state(self) -> StrAny: + """Gets resource-scoped state from the active pipeline. PipelineStateNotAvailable is raised if pipeline context is not available""" + with inject_section(self._get_config_section_context()): + return resource_state(self.name) + + def __call__(self, *args: Any, **kwargs: Any) -> "DltResource": + """Binds the parametrized resources to passed arguments. Creates and returns a bound resource. 
Generators and iterators are not evaluated.""" + if self._args_bound: + raise TypeError(f"Parametrized resource {self.name} is not callable") + r = self._clone() + return r.bind(*args, **kwargs) + + def __or__(self, transform: Union["DltResource", AnyFun]) -> "DltResource": + """Allows to pipe data from across resources and transform functions with | operator""" + # print(f"{resource.name} | {self.name} -> {resource.name}[{resource.is_transformer}]") + if isinstance(transform, DltResource): + transform.pipe_data_from(self) + # return transformed resource for chaining + return transform + else: + # map or yield map + if inspect.isgeneratorfunction(inspect.unwrap(transform)): + return self.add_yield_map(transform) + else: + return self.add_map(transform) + + def __iter__(self) -> Iterator[TDataItem]: + """Opens iterator that yields the data items from the resources in the same order as in Pipeline class. + + A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. + """ + # use the same state dict when opening iterator and when iterator is iterated + container = Container() + state, _ = pipeline_state(container, {}) + state_context = StateInjectableContext(state=state) + section_context = self._get_config_section_context() + + # managed pipe iterator will set the context on each call to __next__ + with inject_section(section_context), Container().injectable_context(state_context): + pipe_iterator: ManagedPipeIterator = ManagedPipeIterator.from_pipes([self._pipe]) # type: ignore + + pipe_iterator.set_context([state_context, section_context]) + _iter = map(lambda item: item.item, pipe_iterator) + return flatten_list_or_items(_iter) + + def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any) -> None: + try: + sig = sig or inspect.signature(f) + self._explicit_args = sig.bind_partial(*args, **kwargs).arguments + except Exception: + pass + + def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResource": + """Creates a deep copy of a current resource, optionally renaming the resource. 
The clone will not be part of the source + """ + pipe = self._pipe + if self._pipe and not self._pipe.is_empty: + pipe = pipe._clone(new_name=new_name, with_parent=with_parent) + # incremental and parent are already in the pipe (if any) + return DltResource( + pipe, + deepcopy(self._table_schema_template), + selected=self.selected, + section=self.section + ) + + def _get_config_section_context(self) -> ConfigSectionContext: + container = Container() + proxy = container[PipelineContext] + pipeline = None if not proxy.is_active() else proxy.pipeline() + if pipeline: + pipeline_name = pipeline.pipeline_name + else: + pipeline_name = None + if pipeline: + default_schema_name = pipeline.default_schema_name + else: + default_schema_name = None + if not default_schema_name and pipeline_name: + default_schema_name = pipeline._make_schema_with_default_name().name + return ConfigSectionContext( + pipeline_name=pipeline_name, + # do not emit middle config section to not overwrite the resource section + # only sources emit middle config section + sections=(known_sections.SOURCES, "", self.source_name or default_schema_name or self.name), + source_state_key=self.source_name or default_schema_name or self.section or uniq_id() + ) + + def __str__(self) -> str: + info = f"DltResource [{self.name}]" + if self.section: + info += f" in section [{self.section}]" + if self.source_name: + info += f" added to source [{self.source_name}]:" + else: + info += ":" + + if self.is_transformer: + info += f"\nThis resource is a transformer and takes data items from {self._pipe.parent.name}" + else: + if self._pipe.is_data_bound: + if self.requires_args: + head_sig = inspect.signature(self._pipe.gen) # type: ignore + info += f"\nThis resource is parametrized and takes the following arguments {head_sig}. You must call this resource before loading." + else: + info += "\nIf you want to see the data items in the resource you must iterate it or convert to list ie. list(resource). Note that, like any iterator, you can iterate the resource only once." 
+ else: + info += "\nThis resource is not bound to the data" + info += f"\nInstance: info: (data pipe id:{id(self._pipe)}) at {id(self)}" + return info + + @staticmethod + def _ensure_valid_transformer_resource(name: str, data: Any) -> None: + # resource must be a callable with single argument + if callable(data): + valid_code = DltResource.validate_transformer_generator_function(data) + if valid_code != 0: + raise InvalidTransformerGeneratorFunction(name, get_callable_name(data), inspect.signature(data), valid_code) + else: + raise InvalidTransformerDataTypeGeneratorFunctionRequired(name, data, type(data)) + + @staticmethod + def _get_parent_pipe(name: str, data_from: Union["DltResource", Pipe]) -> Pipe: + # parent resource + if isinstance(data_from, Pipe): + return data_from + elif isinstance(data_from, DltResource): + return data_from._pipe + else: + # if this is generator function provide nicer exception + if callable(data_from): + raise InvalidParentResourceIsAFunction(name, get_callable_name(data_from)) + else: + raise InvalidParentResourceDataType(name, data_from, type(data_from)) + + @staticmethod + def validate_transformer_generator_function(f: AnyFun) -> int: + sig = inspect.signature(f) + if len(sig.parameters) == 0: + return 1 + # transformer may take only one positional only argument + pos_only_len = sum(1 for p in sig.parameters.values() if p.kind == p.POSITIONAL_ONLY) + if pos_only_len > 1: + return 2 + first_ar = next(iter(sig.parameters.values())) + # and pos only must be first + if pos_only_len == 1 and first_ar.kind != first_ar.POSITIONAL_ONLY: + return 2 + # first arg must be positional or kw_pos + if first_ar.kind not in (first_ar.POSITIONAL_ONLY, first_ar.POSITIONAL_OR_KEYWORD): + return 3 + return 0 + + +# produce Empty resource singleton +DltResource.Empty = DltResource(Pipe(None), None, False) +TUnboundDltResource = Callable[..., DltResource] diff --git a/dlt/extract/source.py b/dlt/extract/source.py index a9cd27fa3f..771e8ca0cc 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -1,502 +1,27 @@ import warnings import contextlib -from copy import copy, deepcopy +from copy import copy import makefun import inspect -from typing import AsyncIterable, AsyncIterator, ClassVar, Callable, Dict, Iterable, Iterator, List, Sequence, Tuple, Union, Any, Optional +from typing import Dict, Iterable, Iterator, List, Sequence, Tuple, Any from typing_extensions import Self from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs import known_sections from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer, RelationalNormalizerConfigPropagation +from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.schema import Schema from dlt.common.schema.typing import TColumnName, TSchemaContract -from dlt.common.typing import AnyFun, DictStrAny, StrAny, TDataItem, TDataItems, NoneType +from dlt.common.typing import StrAny, TDataItem from dlt.common.configuration.container import Container -from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, resource_state, source_state, pipeline_state -from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, get_callable_name, graph_edges_to_nodes, multi_context_manager, uniq_id - -from dlt.extract.typing import (DataItemWithMeta, ItemTransformFunc, 
ItemTransformFunctionWithMeta, TDecompositionStrategy, TableNameMeta, - FilterItem, MapItem, YieldMapItem, ValidateItem) -from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep -from dlt.extract.schema import DltResourceSchema, TTableSchemaTemplate -from dlt.extract.incremental import Incremental, IncrementalResourceWrapper -from dlt.extract.exceptions import ( - InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, - DataItemRequiredForDynamicTableHints, InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, - InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer, ResourcesNotFoundError, DeletingResourcesNotSupported) -from dlt.extract.wrappers import wrap_additional_type - - -def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: - """Marks `item` to be dispatched to table `table_name` when yielded from resource function.""" - return DataItemWithMeta(TableNameMeta(table_name), item) - - -class DltResource(Iterable[TDataItem], DltResourceSchema): - """Implements dlt resource. Contains a data pipe that wraps a generating item and table schema that can be adjusted""" - Empty: ClassVar["DltResource"] = None - source_name: str - """Name of the source that contains this instance of the source, set when added to DltResourcesDict""" - section: str - """A config section name""" - - def __init__( - self, - pipe: Pipe, - table_schema_template: TTableSchemaTemplate, - selected: bool, - incremental: IncrementalResourceWrapper = None, - section: str = None, - args_bound: bool = False - ) -> None: - self.section = section - self.selected = selected - self._pipe = pipe - self._args_bound = args_bound - self._explicit_args: DictStrAny = None - if incremental and not self.incremental: - self.add_step(incremental) - self.source_name = None - super().__init__(table_schema_template) - - @classmethod - def from_data( - cls, - data: Any, - name: str = None, - section: str = None, - table_schema_template: TTableSchemaTemplate = None, - selected: bool = True, - data_from: Union["DltResource", Pipe] = None, - incremental: IncrementalResourceWrapper = None - ) -> "DltResource": - if data is None: - raise InvalidResourceDataTypeIsNone(name, data, NoneType) # type: ignore - - if isinstance(data, DltResource): - return data - - if isinstance(data, Pipe): - return cls(data, table_schema_template, selected, incremental=incremental, section=section) - - if callable(data): - name = name or get_callable_name(data) - - # if generator, take name from it - if inspect.isgenerator(data): - name = name or get_callable_name(data) # type: ignore - - # name is mandatory - if not name: - raise ResourceNameMissing() - - # wrap additional types - data = wrap_additional_type(data) - - # several iterable types are not allowed and must be excluded right away - if isinstance(data, (AsyncIterator, AsyncIterable)): - raise InvalidResourceDataTypeAsync(name, data, type(data)) - if isinstance(data, (str, dict)): - raise InvalidResourceDataTypeBasic(name, data, type(data)) - - # check if depends_on is a valid resource - parent_pipe: Pipe = None - if data_from is not None: - DltResource._ensure_valid_transformer_resource(name, data) - parent_pipe = DltResource._get_parent_pipe(name, data_from) - - # create resource from iterator, iterable or generator function - if isinstance(data, (Iterable, 
Iterator)) or callable(data): - pipe = Pipe.from_data(name, data, parent=parent_pipe) - return cls(pipe, table_schema_template, selected, incremental=incremental, section=section, args_bound=not callable(data)) - else: - # some other data type that is not supported - raise InvalidResourceDataType(name, data, type(data), f"The data type of supplied type is {type(data).__name__}") - - @property - def name(self) -> str: - """Resource name inherited from the pipe""" - return self._pipe.name - - def with_name(self, new_name: str) -> "DltResource": - """Clones the resource with a new name. Such resource keeps separate state and loads data to `new_name` table by default.""" - return self._clone(new_name=new_name, with_parent=True) - - @property - def is_transformer(self) -> bool: - """Checks if the resource is a transformer that takes data from another resource""" - return self._pipe.has_parent - - @property - def requires_args(self) -> bool: - """Checks if resource has unbound arguments""" - try: - self._pipe.ensure_gen_bound() - return False - except (TypeError, ParametrizedResourceUnbound): - return True - - @property - def incremental(self) -> IncrementalResourceWrapper: - """Gets incremental transform if it is in the pipe""" - incremental: IncrementalResourceWrapper = None - step_no = self._pipe.find(IncrementalResourceWrapper, Incremental) - if step_no >= 0: - incremental = self._pipe.steps[step_no] # type: ignore - return incremental - - @property - def validator(self) -> Optional[ValidateItem]: - """Gets validator transform if it is in the pipe""" - validator: ValidateItem = None - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - validator = self._pipe.steps[step_no] # type: ignore[assignment] - return validator - - @validator.setter - def validator(self, validator: Optional[ValidateItem]) -> None: - """Add/remove or replace the validator in pipe""" - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - self._pipe.remove_step(step_no) - if validator: - self.add_step(validator, insert_at=step_no if step_no >= 0 else None) - - def pipe_data_from(self, data_from: Union["DltResource", Pipe]) -> None: - """Replaces the parent in the transformer resource pipe from which the data is piped.""" - if self.is_transformer: - DltResource._ensure_valid_transformer_resource(self.name, self._pipe.gen) - else: - raise ResourceNotATransformer(self.name, "Cannot pipe data into resource that is not a transformer.") - parent_pipe = self._get_parent_pipe(self.name, data_from) - self._pipe.parent = parent_pipe - - def add_pipe(self, data: Any) -> None: - """Creates additional pipe for the resource from the specified data""" - # TODO: (1) self resource cannot be a transformer (2) if data is resource both self must and it must be selected/unselected + cannot be tranformer - raise InvalidResourceDataTypeMultiplePipes(self.name, data, type(data)) - - def select_tables(self, *table_names: Iterable[str]) -> "DltResource": - """For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. - - Both `with_table_name` marker and data-based (function) table name hints are supported. 
- """ - def _filter(item: TDataItem, meta: Any = None) -> bool: - is_in_meta = isinstance(meta, TableNameMeta) and meta.table_name in table_names - is_in_dyn = self._table_name_hint_fun and self._table_name_hint_fun(item) in table_names - return is_in_meta or is_in_dyn - - # add filtering function at the end of pipe - self.add_filter(_filter) - return self - - def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` - - `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically - - Args: - item_map (ItemTransformFunc[TDataItem]): A function taking a single data item and optional meta argument. Returns transformed data item. - insert_at (int, optional): At which step in pipe to insert the mapping. Defaults to None which inserts after last step - - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(MapItem(item_map)) - else: - self._pipe.insert_step(MapItem(item_map), insert_at) - return self - - def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` - - `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. It may yield 0 or more data items and be used to - ie. pivot an item into sequence of rows. - - Args: - item_map (ItemTransformFunc[Iterator[TDataItem]]): A function taking a single data item and optional meta argument. Yields 0 or more data items. - insert_at (int, optional): At which step in pipe to insert the generator. Defaults to None which inserts after last step - - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(YieldMapItem(item_map)) - else: - self._pipe.insert_step(YieldMapItem(item_map), insert_at) - return self - - def add_filter(self, item_filter: ItemTransformFunc[bool], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` - - `item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically - - Args: - item_filter (ItemTransformFunc[bool]): A function taking a single data item and optional meta argument. Returns bool. If True, item is kept - insert_at (int, optional): At which step in pipe to insert the filter. Defaults to None which inserts after last step - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(FilterItem(item_filter)) - else: - self._pipe.insert_step(FilterItem(item_filter), insert_at) - return self - - def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 - """Adds a limit `max_items` to the resource pipe - - This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. It is - a no-op for transformers. Those should be limited by their input data. 
- - Args: - max_items (int): The maximum number of items to yield - Returns: - "DltResource": returns self - """ - def _gen_wrap(gen: TPipeStep) -> TPipeStep: - """Wrap a generator to take the first `max_items` records""" - nonlocal max_items - count = 0 - if inspect.isfunction(gen): - gen = gen() - try: - for i in gen: # type: ignore # TODO: help me fix this later - yield i - count += 1 - if count == max_items: - return - finally: - if inspect.isgenerator(gen): - gen.close() - return - # transformers should be limited by their input, so we only limit non-transformers - if not self.is_transformer: - self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) - return self - - def add_step(self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None) -> "DltResource": # noqa: A003 - if insert_at is None: - self._pipe.append_step(item_transform) - else: - self._pipe.insert_step(item_transform, insert_at) - return self - - def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: - super().set_template(table_schema_template) - incremental = self.incremental - # try to late assign incremental - if table_schema_template.get("incremental") is not None: - if incremental: - incremental._incremental = table_schema_template["incremental"] - else: - # if there's no wrapper add incremental as a transform - incremental = table_schema_template["incremental"] # type: ignore - self.add_step(incremental) - - if incremental: - primary_key = table_schema_template.get("primary_key", incremental.primary_key) - if primary_key is not None: - incremental.primary_key = primary_key - - if table_schema_template.get('validator') is not None: - self.validator = table_schema_template['validator'] - - def bind(self, *args: Any, **kwargs: Any) -> "DltResource": - """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" - if self._args_bound: - raise TypeError(f"Parametrized resource {self.name} is not callable") - orig_gen = self._pipe.gen - gen = self._pipe.bind_gen(*args, **kwargs) - if isinstance(gen, DltResource): - # the resource returned resource: update in place - old_pipe = self._pipe - self.__dict__.clear() - self.__dict__.update(gen.__dict__) - # keep old pipe instance - self._pipe = old_pipe - self._pipe.__dict__.clear() - # write props from new pipe instance - self._pipe.__dict__.update(gen._pipe.__dict__) - elif isinstance(gen, Pipe): - # the resource returned pipe: just replace pipe - self._pipe.__dict__.clear() - # write props from new pipe instance - self._pipe.__dict__.update(gen.__dict__) - else: - self._args_bound = True - self._set_explicit_args(orig_gen, None, *args, **kwargs) # type: ignore - return self - - @property - def explicit_args(self) -> StrAny: - """Returns a dictionary of arguments used to parametrize the resource. Does not include defaults and injected args.""" - if not self._args_bound: - raise TypeError(f"Resource {self.name} is not yet parametrized") - return self._explicit_args - - @property - def state(self) -> StrAny: - """Gets resource-scoped state from the active pipeline. PipelineStateNotAvailable is raised if pipeline context is not available""" - with inject_section(self._get_config_section_context()): - return resource_state(self.name) - - def __call__(self, *args: Any, **kwargs: Any) -> "DltResource": - """Binds the parametrized resources to passed arguments. Creates and returns a bound resource. 
Generators and iterators are not evaluated.""" - if self._args_bound: - raise TypeError(f"Parametrized resource {self.name} is not callable") - r = self._clone() - return r.bind(*args, **kwargs) - - def __or__(self, transform: Union["DltResource", AnyFun]) -> "DltResource": - """Allows to pipe data from across resources and transform functions with | operator""" - # print(f"{resource.name} | {self.name} -> {resource.name}[{resource.is_transformer}]") - if isinstance(transform, DltResource): - transform.pipe_data_from(self) - # return transformed resource for chaining - return transform - else: - # map or yield map - if inspect.isgeneratorfunction(inspect.unwrap(transform)): - return self.add_yield_map(transform) - else: - return self.add_map(transform) - - def __iter__(self) -> Iterator[TDataItem]: - """Opens iterator that yields the data items from the resources in the same order as in Pipeline class. - - A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. - """ - # use the same state dict when opening iterator and when iterator is iterated - container = Container() - state, _ = pipeline_state(container, {}) - state_context = StateInjectableContext(state=state) - section_context = self._get_config_section_context() - - # managed pipe iterator will set the context on each call to __next__ - with inject_section(section_context), Container().injectable_context(state_context): - pipe_iterator: ManagedPipeIterator = ManagedPipeIterator.from_pipes([self._pipe]) # type: ignore - - pipe_iterator.set_context([state_context, section_context]) - _iter = map(lambda item: item.item, pipe_iterator) - return flatten_list_or_items(_iter) - - def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any) -> None: - try: - sig = sig or inspect.signature(f) - self._explicit_args = sig.bind_partial(*args, **kwargs).arguments - except Exception: - pass - - def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResource": - """Creates a deep copy of a current resource, optionally renaming the resource. 
The clone will not be part of the source - """ - pipe = self._pipe - if self._pipe and not self._pipe.is_empty: - pipe = pipe._clone(new_name=new_name, with_parent=with_parent) - # incremental and parent are already in the pipe (if any) - return DltResource( - pipe, - deepcopy(self._table_schema_template), - selected=self.selected, - section=self.section - ) +from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, source_state, pipeline_state +from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, graph_edges_to_nodes - def _get_config_section_context(self) -> ConfigSectionContext: - container = Container() - proxy = container[PipelineContext] - pipeline = None if not proxy.is_active() else proxy.pipeline() - if pipeline: - pipeline_name = pipeline.pipeline_name - else: - pipeline_name = None - if pipeline: - default_schema_name = pipeline.default_schema_name - else: - default_schema_name = None - if not default_schema_name and pipeline_name: - default_schema_name = pipeline._make_schema_with_default_name().name - return ConfigSectionContext( - pipeline_name=pipeline_name, - # do not emit middle config section to not overwrite the resource section - # only sources emit middle config section - sections=(known_sections.SOURCES, "", self.source_name or default_schema_name or self.name), - source_state_key=self.source_name or default_schema_name or self.section or uniq_id() - ) - - def __str__(self) -> str: - info = f"DltResource [{self.name}]" - if self.section: - info += f" in section [{self.section}]" - if self.source_name: - info += f" added to source [{self.source_name}]:" - else: - info += ":" - - if self.is_transformer: - info += f"\nThis resource is a transformer and takes data items from {self._pipe.parent.name}" - else: - if self._pipe.is_data_bound: - if self.requires_args: - head_sig = inspect.signature(self._pipe.gen) # type: ignore - info += f"\nThis resource is parametrized and takes the following arguments {head_sig}. You must call this resource before loading." - else: - info += "\nIf you want to see the data items in the resource you must iterate it or convert to list ie. list(resource). Note that, like any iterator, you can iterate the resource only once." 
- else: - info += "\nThis resource is not bound to the data" - info += f"\nInstance: info: (data pipe id:{id(self._pipe)}) at {id(self)}" - return info - - @staticmethod - def _ensure_valid_transformer_resource(name: str, data: Any) -> None: - # resource must be a callable with single argument - if callable(data): - valid_code = DltResource.validate_transformer_generator_function(data) - if valid_code != 0: - raise InvalidTransformerGeneratorFunction(name, get_callable_name(data), inspect.signature(data), valid_code) - else: - raise InvalidTransformerDataTypeGeneratorFunctionRequired(name, data, type(data)) - - @staticmethod - def _get_parent_pipe(name: str, data_from: Union["DltResource", Pipe]) -> Pipe: - # parent resource - if isinstance(data_from, Pipe): - return data_from - elif isinstance(data_from, DltResource): - return data_from._pipe - else: - # if this is generator function provide nicer exception - if callable(data_from): - raise InvalidParentResourceIsAFunction(name, get_callable_name(data_from)) - else: - raise InvalidParentResourceDataType(name, data_from, type(data_from)) - - @staticmethod - def validate_transformer_generator_function(f: AnyFun) -> int: - sig = inspect.signature(f) - if len(sig.parameters) == 0: - return 1 - # transformer may take only one positional only argument - pos_only_len = sum(1 for p in sig.parameters.values() if p.kind == p.POSITIONAL_ONLY) - if pos_only_len > 1: - return 2 - first_ar = next(iter(sig.parameters.values())) - # and pos only must be first - if pos_only_len == 1 and first_ar.kind != first_ar.POSITIONAL_ONLY: - return 2 - # first arg must be positional or kw_pos - if first_ar.kind not in (first_ar.POSITIONAL_ONLY, first_ar.POSITIONAL_OR_KEYWORD): - return 3 - return 0 - - -# produce Empty resource singleton -DltResource.Empty = DltResource(Pipe(None), None, False) -TUnboundDltResource = Callable[..., DltResource] +from dlt.extract.typing import TDecompositionStrategy +from dlt.extract.pipe import Pipe, ManagedPipeIterator +from dlt.extract.hints import DltResourceHints +from dlt.extract.resource import DltResource +from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, ResourcesNotFoundError, DeletingResourcesNotSupported class DltResourceDict(Dict[str, DltResource]): @@ -529,7 +54,7 @@ def extracted(self) -> Dict[str, DltResource]: resource = self[pipe.name] except KeyError: # resource for pipe not found: return mock resource - mock_template = DltResourceSchema.new_table_template( + mock_template = DltResourceHints.new_table_template( pipe.name, write_disposition=resource.write_disposition ) diff --git a/dlt/extract/typing.py b/dlt/extract/typing.py index ad4e23b84f..646267c539 100644 --- a/dlt/extract/typing.py +++ b/dlt/extract/typing.py @@ -138,3 +138,8 @@ class ValidateItem(ItemTransform[TDataItem]): Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`. See `PydanticValidator` for possible implementation. 
""" + table_name: str + + def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]: + self.table_name = pipe.name + return self diff --git a/dlt/extract/validation.py b/dlt/extract/validation.py index e345904337..8bd6c7afb9 100644 --- a/dlt/extract/validation.py +++ b/dlt/extract/validation.py @@ -2,11 +2,10 @@ from dlt.common.schema.schema import Schema try: - from pydantic import BaseModel as PydanticBaseModel, ValidationError as PydanticValidationError, create_model + from pydantic import BaseModel as PydanticBaseModel except ModuleNotFoundError: PydanticBaseModel = Any # type: ignore[misc, assignment] -# from dlt.extract.exceptions import ValidationError from dlt.common.typing import TDataItems from dlt.common.schema.typing import TAnySchemaColumns, TSchemaContract, TSchemaEvolutionMode from dlt.extract.typing import TTableHintTemplate, ValidateItem @@ -23,12 +22,8 @@ def __init__(self, model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMo self.column_mode: TSchemaEvolutionMode = column_mode self.data_mode: TSchemaEvolutionMode = data_mode - self.list_model = apply_schema_contract_to_model( - create_list_model(model, data_mode), - column_mode, - data_mode - ) self.model = apply_schema_contract_to_model(model, column_mode, data_mode) + self.list_model = create_list_model(self.model, data_mode) def __call__(self, item: TDataItems, meta: Any = None) -> Union[_TPydanticModel, List[_TPydanticModel]]: """Validate a data item against the pydantic model""" @@ -38,8 +33,8 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Union[_TPydanticModel, from dlt.common.libs.pydantic import validate_item, validate_items if isinstance(item, list): - return validate_items(self.list_model, item, self.column_mode, self.data_mode) - return validate_item(self.model, item, self.column_mode, self.data_mode) + return validate_items(self.table_name, self.list_model, item, self.column_mode, self.data_mode) + return validate_item(self.table_name, self.model, item, self.column_mode, self.data_mode) def __str__(self, *args: Any, **kwargs: Any) -> str: return f"PydanticValidator(model={self.model.__qualname__})" @@ -56,11 +51,19 @@ def create_item_validator( """ if PydanticBaseModel is not None and isinstance(columns, type) and issubclass(columns, PydanticBaseModel): assert not callable(schema_contract), "schema_contract cannot be dynamic for Pydantic item validator" - if schema_contract is not None: - expanded_schema_contract = Schema.expand_schema_contract_settings(schema_contract) - else: - # freeze the columns if we have a fully defined table and no other explicit contract - expanded_schema_contract = {"tables": "evolve", "columns": "freeze", "data_type": "freeze"} - return PydanticValidator(columns, expanded_schema_contract["columns"], expanded_schema_contract["data_type"]), schema_contract or expanded_schema_contract + from dlt.common.libs.pydantic import extra_to_column_mode, get_extra_from_model + # freeze the columns if we have a fully defined table and no other explicit contract + expanded_schema_contract = Schema.expand_schema_contract_settings( + schema_contract, + # corresponds to default Pydantic behavior + default={"tables": "evolve", "columns": extra_to_column_mode(get_extra_from_model(columns)), "data_type": "freeze"} + ) + return (PydanticValidator( + columns, + expanded_schema_contract["columns"], + expanded_schema_contract["data_type"] + ), + schema_contract or expanded_schema_contract + ) return None, schema_contract diff --git a/dlt/helpers/airflow_helper.py 
b/dlt/helpers/airflow_helper.py index 2a9c76cc76..e0329d583c 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -25,7 +25,7 @@ from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.runtime.collector import NULL_COLLECTOR -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.pipeline.helpers import retry_load from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.progress import log diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index f096afbe71..6146d864b6 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -125,7 +125,7 @@ def _normalize_chunk(self, root_table_name: str, items: List[TDataItem], may_hav table_name, schema.resolve_contract_settings_for_table(parent_table or table_name) # parent_table, if present, exists in the schema ) - partial_table, filters = schema.apply_schema_contract(schema_contract, partial_table) + partial_table, filters = schema.apply_schema_contract(schema_contract, partial_table, data_item=row) if filters: for entity, name, mode in filters: if entity == "tables": diff --git a/dlt/pipeline/mark.py b/dlt/pipeline/mark.py index 5f880d8711..14a7108683 100644 --- a/dlt/pipeline/mark.py +++ b/dlt/pipeline/mark.py @@ -1,2 +1,2 @@ """Module with market functions that make data to be specially processed""" -from dlt.extract.source import with_table_name \ No newline at end of file +from dlt.extract import with_table_name \ No newline at end of file diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 11fbf57e68..b9eb958027 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -5,7 +5,6 @@ from functools import wraps from collections.abc import Sequence as C_Sequence from typing import Any, Callable, ClassVar, List, Iterator, Optional, Sequence, Tuple, cast, get_type_hints, ContextManager -from concurrent.futures import Executor from dlt import version from dlt.common import json, logger, pendulum @@ -20,7 +19,7 @@ from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract -from dlt.common.schema.utils import diff_tables +from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -32,13 +31,12 @@ from dlt.common.destination.capabilities import INTERNAL_LOADER_FILE_FORMATS from dlt.common.pipeline import ExtractInfo, LoadInfo, NormalizeInfo, PipelineContext, SupportsPipeline, TPipelineLocalState, TPipelineState, StateInjectableContext from dlt.common.schema import Schema -from dlt.common.schema.exceptions import SchemaFrozenException from dlt.common.utils import is_interactive from dlt.common.data_writers import TLoaderFileFormat +from dlt.extract import DltResource, DltSource from dlt.extract.exceptions import SourceExhausted from dlt.extract.extract import ExtractorStorage, extract_with_schema -from dlt.extract.source import DltResource, DltSource from dlt.normalize import Normalize from dlt.normalize.configuration import NormalizeConfiguration from dlt.destinations.sql_client import SqlClientBase @@ -52,8 +50,6 @@ from dlt.pipeline.trace 
import PipelineTrace, PipelineStepTrace, load_trace, merge_traces, start_trace, start_trace_step, end_trace_step, end_trace, describe_extract_data from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.state_sync import STATE_ENGINE_VERSION, load_state_from_destination, merge_state_if_changed, migrate_state, state_resource, json_encode_state, json_decode_state - -from dlt.common.schema.utils import normalize_schema_name from dlt.pipeline.deprecations import credentials_argument_deprecated diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index 581ed4c2bd..a9603b8f66 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -12,7 +12,7 @@ from dlt.common.schema.typing import STATE_TABLE_NAME, TTableSchemaColumns from dlt.common.destination.reference import JobClientBase, WithStateSync -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException from dlt.common.utils import compressed_b64decode, compressed_b64encode diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 2ba71396f6..46ab524aa1 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -14,7 +14,7 @@ from dlt.common.typing import DictStrAny, StrAny from dlt.common.utils import uniq_id -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.exceptions import PipelineStepFailed diff --git a/dlt/reflection/script_inspector.py b/dlt/reflection/script_inspector.py index 204135dcd7..9899e2b157 100644 --- a/dlt/reflection/script_inspector.py +++ b/dlt/reflection/script_inspector.py @@ -12,7 +12,8 @@ from dlt.common.typing import DictStrAny from dlt.pipeline import Pipeline -from dlt.extract.source import DltSource, ManagedPipeIterator +from dlt.extract import DltSource +from dlt.extract.pipe import ManagedPipeIterator def patch__init__(self: Any, *args: Any, **kwargs: Any) -> None: diff --git a/dlt/sources/__init__.py b/dlt/sources/__init__.py index 6e418a3cb2..465467db67 100644 --- a/dlt/sources/__init__.py +++ b/dlt/sources/__init__.py @@ -1,7 +1,6 @@ """Module with built in sources and source building blocks""" -from dlt.extract.incremental import Incremental as incremental -from dlt.extract.source import DltSource, DltResource from dlt.common.typing import TDataItem, TDataItems +from dlt.extract import DltSource, DltResource, Incremental as incremental from . import credentials from . import config from . 
import filesystem diff --git a/docs/examples/archive/sources/rasa/rasa.py b/docs/examples/archive/sources/rasa/rasa.py index aa31b3c482..b498f9c3de 100644 --- a/docs/examples/archive/sources/rasa/rasa.py +++ b/docs/examples/archive/sources/rasa/rasa.py @@ -3,7 +3,7 @@ import dlt from dlt.common.typing import StrAny, TDataItem, TDataItems from dlt.common.time import timestamp_within -from dlt.extract.source import DltResource +from dlt.extract.resource import DltResource @dlt.source diff --git a/docs/examples/incremental_loading/zendesk.py b/docs/examples/incremental_loading/zendesk.py index 6370f29811..3f433e3fef 100644 --- a/docs/examples/incremental_loading/zendesk.py +++ b/docs/examples/incremental_loading/zendesk.py @@ -1,10 +1,9 @@ -from typing import Iterator, Optional, Dict, Any, Tuple +from typing import Optional, Dict, Any, Tuple import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime -from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime -from dlt.extract.source import DltResource +from dlt.common.typing import TAnyDateTime from dlt.sources.helpers.requests import client diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index 2d674407bc..4c3d3f0b3a 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -10,13 +10,12 @@ def incremental_snippet() -> None: # @@@DLT_SNIPPET_START example # @@@DLT_SNIPPET_START markdown_source - from typing import Iterator, Optional, Dict, Any, Tuple + from typing import Optional, Dict, Any, Tuple import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime - from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime - from dlt.extract.source import DltResource + from dlt.common.typing import TAnyDateTime from dlt.sources.helpers.requests import client diff --git a/tests/common/schema/test_schema_contract.py b/tests/common/schema/test_schema_contract.py index 160aca9fd9..2f6b4743f3 100644 --- a/tests/common/schema/test_schema_contract.py +++ b/tests/common/schema/test_schema_contract.py @@ -4,7 +4,7 @@ import copy from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE, TSchemaContractDict -from dlt.common.schema.exceptions import SchemaFrozenException +from dlt.common.schema.exceptions import DataValidationError from dlt.common.schema.typing import TTableSchema def get_schema() -> Schema: @@ -191,8 +191,15 @@ def test_check_adding_table(base_settings) -> None: partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, raise_on_freeze=False) assert (partial, filters) == (None, [("tables", "new_table", "freeze")]) - with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table) + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, data_item={"item": 1}) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == "new_table" + assert val_ex.value.column_name is None + assert val_ex.value.contract_entity == "tables" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is None # there's no validating schema on new 
table + assert val_ex.value.data_item == {"item": 1} @pytest.mark.parametrize("base_settings", base_settings) @@ -213,8 +220,15 @@ def assert_new_column(table_update: TTableSchema, column_name: str) -> None: partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) assert (partial, filters) == (popped_table_update, [("columns", column_name, "freeze")]) - with pytest.raises(SchemaFrozenException): - schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update)) + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), {column_name: 1}) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == table_update["name"] + assert val_ex.value.column_name == column_name + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema == schema.get_table(table_update["name"]) + assert val_ex.value.data_item == {column_name: 1} # # check adding new column @@ -290,8 +304,15 @@ def test_check_adding_new_variant() -> None: partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "freeze")]) - with pytest.raises(SchemaFrozenException): + with pytest.raises(DataValidationError) as val_ex: schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == table_update["name"] + assert val_ex.value.column_name == "column_2_variant" + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema == schema.get_table(table_update["name"]) + assert val_ex.value.data_item is None # we do not pass it to apply_schema_contract # variants are not new columns - new data types partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), copy.deepcopy(table_update)) @@ -299,5 +320,5 @@ def test_check_adding_new_variant() -> None: # evolve once does not apply to variant evolution table_update["name"] = "evolve_once_table" - with pytest.raises(SchemaFrozenException): + with pytest.raises(DataValidationError): schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index b8a6b80cfa..28f3d34dcf 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -17,12 +17,15 @@ from dlt.common.schema import Schema from dlt.common.schema.utils import new_table, new_column from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.schema.exceptions import InvalidSchemaName +from dlt.common.typing import TDataItem from dlt.cli.source_detection import detect_source_configs -from dlt.common.typing import TDataItem -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, 
DynamicNameNotStandaloneResource, ExplicitSourceNameInvalid, InconsistentTableTemplate, InvalidResourceDataTypeFunctionNotAGenerator, InvalidResourceDataTypeIsNone, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, PipeGenInvalid, PipeNotBoundToData, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable -from dlt.extract.source import DltResource, DltSource -from dlt.common.schema.exceptions import InvalidSchemaName +from dlt.extract import DltResource, DltSource +from dlt.extract.exceptions import (DynamicNameNotStandaloneResource, InvalidResourceDataTypeFunctionNotAGenerator, + InvalidResourceDataTypeIsNone, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, + PipeGenInvalid, PipeNotBoundToData, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, + SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable) from dlt.extract.typing import TableNameMeta from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7 diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 864424dad9..7ed74b41f2 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -1,9 +1,10 @@ import dlt from dlt.common import json from dlt.common.storages import NormalizeStorageConfiguration + +from dlt.extract import DltResource, DltSource from dlt.extract.extract import ExtractorStorage, extract -from dlt.extract.source import DltResource, DltSource -from dlt.common.schema import Schema + from tests.utils import clean_test_storage from tests.extract.utils import expect_extracted_file diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index af1d0a7107..d03b125777 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -17,7 +17,7 @@ from dlt.common.utils import uniq_id, digest128, chunks from dlt.common.json import json -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.sources.helpers.transform import take_first from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing from dlt.pipeline.exceptions import PipelineStepFailed diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index ca95bded15..130e0a8d93 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -9,11 +9,14 @@ from dlt.common.pipeline import StateInjectableContext, source_state from dlt.common.schema import Schema from dlt.common.typing import TDataItems -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, ResourcesNotFoundError -from dlt.extract.incremental import Incremental + +from dlt.extract import DltResource, DltSource, Incremental +from dlt.extract.source import DltResourceDict +from dlt.extract.exceptions import (DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, + InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, + InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, + ParametrizedResourceUnbound, ResourcesNotFoundError) from dlt.extract.pipe import Pipe -from 
dlt.extract.typing import FilterItem, MapItem -from dlt.extract.source import DltResource, DltResourceDict, DltSource def test_call_data_resource() -> None: @@ -1173,7 +1176,7 @@ def empty_gen(): # reset empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}, incremental=Incremental.EMPTY, schema_contract={}) - assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append'} + assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append', 'original_columns': {}} table = empty_r.compute_table_schema() assert table["name"] == "empty_gen" assert "parent" not in table diff --git a/tests/extract/test_validation.py b/tests/extract/test_validation.py index 7826eb84ef..db39530567 100644 --- a/tests/extract/test_validation.py +++ b/tests/extract/test_validation.py @@ -5,12 +5,15 @@ import dlt from dlt.common import json +from dlt.common.schema.exceptions import DataValidationError from dlt.common.typing import TDataItems -from dlt.common.libs.pydantic import BaseModel, FullValidationError, ValidationError +from dlt.common.libs.pydantic import BaseModel +from dlt.extract import DltResource from dlt.extract.typing import ValidateItem from dlt.extract.validation import PydanticValidator from dlt.extract.exceptions import ResourceExtractionError +from dlt.pipeline.exceptions import PipelineStepFailed class SimpleModel(BaseModel): @@ -137,7 +140,7 @@ class AnotherModel(BaseModel): @pytest.mark.parametrize("yield_list", [True, False]) -def test_failed_validation(yield_list: bool) -> None: +def test_default_validation(yield_list: bool) -> None: @dlt.resource(columns=SimpleModel) def some_data() -> t.Iterator[TDataItems]: # yield item that fails schema validation @@ -147,9 +150,94 @@ def some_data() -> t.Iterator[TDataItems]: else: yield from items + # some_data must have default Pydantic schema contract + assert some_data().schema_contract == {"tables": "evolve", "columns": "discard_value", "data_type": "freeze"} + # extraction fails with ValidationError with pytest.raises(ResourceExtractionError) as exinfo: list(some_data()) - assert isinstance(exinfo.value.__cause__, FullValidationError) - # assert str(PydanticValidator(SimpleModel)) in str(exinfo.value) + val_ex = exinfo.value.__cause__ + assert isinstance(val_ex, DataValidationError) + assert val_ex.schema_name is None + assert val_ex.table_name == "some_data" + assert val_ex.column_name == "('items', 1, 'a')" if yield_list else "('a',)" + assert val_ex.data_item == {"a": "not_int", "b": "x"} + assert val_ex.contract_entity == "data_type" + + # fail in pipeline + @dlt.resource(columns=SimpleModel) + def some_data_extra() -> t.Iterator[TDataItems]: + # yield item that fails schema validation + items = [{"a": 1, "b": "z", "c": 1.3}, {"a": "not_int", "b": "x"}] + if yield_list: + yield items + else: + yield from items + + pipeline = dlt.pipeline() + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.extract(some_data_extra()) + assert isinstance(py_ex.value.__cause__, ResourceExtractionError) + assert isinstance(py_ex.value.__cause__.__cause__, DataValidationError) + val_ex = py_ex.value.__cause__.__cause__ + assert val_ex.table_name == "some_data_extra" + assert val_ex.contract_entity == "data_type" # extra field is the cause + assert val_ex.data_item == {"a": "not_int", "b": "x"} + + +@pytest.mark.parametrize("yield_list", [True, False]) +def 
test_validation_with_contracts(yield_list: bool) -> None: + + def some_data() -> t.Iterator[TDataItems]: + # yield item that fails schema validation + items = [{"a": 1, "b": "z"}, {"a": "not_int", "b": "x"}, {"c": "not_int"}] + if yield_list: + yield items + else: + yield from items + + # let it evolve + r: DltResource = dlt.resource(some_data(), schema_contract="evolve", columns=SimpleModel) + validator: PydanticValidator[SimpleModel] = r.validator # type: ignore[assignment] + assert validator.column_mode == "evolve" + assert validator.data_mode == "evolve" + assert validator.model.__name__.endswith("AnyExtraAllow") + items = list(r) + assert len(items) == 3 + # fully valid + assert items[0].a == 1 + assert items[0].b == "z" + # data type not valid + assert items[1].a == "not_int" + assert items[1].b == "x" + # extra attr and data invalid + assert items[2].a is None + assert items[2].b is None + assert items[2].c == "not_int" + + # let it drop + r = dlt.resource(some_data(), schema_contract="discard_row", columns=SimpleModel) + validator = r.validator # type: ignore[assignment] + assert validator.column_mode == "discard_row" + assert validator.data_mode == "discard_row" + assert validator.model.__name__.endswith("ExtraForbid") + items = list(r) + assert len(items) == 1 + assert items[0].a == 1 + assert items[0].b == "z" + + # filter just offending values + with pytest.raises(NotImplementedError): + # pydantic data_type cannot be discard_value + dlt.resource(some_data(), schema_contract="discard_value", columns=SimpleModel) + r = dlt.resource(some_data(), schema_contract={"columns": "discard_value", "data_type": "evolve"}, columns=SimpleModel) + validator = r.validator # type: ignore[assignment] + assert validator.column_mode == "discard_value" + assert validator.data_mode == "evolve" + # ignore is the default so no Extra in name + assert validator.model.__name__.endswith("Any") + items = list(r) + assert len(items) == 3 + # c is gone from the last model + assert not hasattr(items[2], "c") diff --git a/tests/libs/test_pydantic.py b/tests/libs/test_pydantic.py index 274ce20fcd..48227577c4 100644 --- a/tests/libs/test_pydantic.py +++ b/tests/libs/test_pydantic.py @@ -10,6 +10,8 @@ from dlt.common.libs.pydantic import DltConfig, pydantic_to_table_schema_columns, apply_schema_contract_to_model, validate_item, validate_items, create_list_model from pydantic import BaseModel, Json, AnyHttpUrl, ConfigDict, ValidationError +from dlt.common.schema.exceptions import DataValidationError + class StrEnum(str, Enum): a = "a_value" @@ -199,27 +201,159 @@ def test_model_for_column_mode() -> None: model_freeze = apply_schema_contract_to_model(ModelWithConfig, "evolve", "evolve") print(model_freeze.parse_obj(instance_extra_2).dict()) - with pytest.raises(ValueError): + with pytest.raises(NotImplementedError): apply_schema_contract_to_model(ModelWithConfig, "evolve", "discard_value") -def test_items_validation() -> None: +def test_item_list_validation() -> None: class ItemModel(BaseModel): b: bool - dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} - + opt: Optional[int] = None + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": False} + + # non validating items removed from the list (both extra and declared) + discard_model = apply_schema_contract_to_model(ItemModel, "discard_row", "discard_row") + discard_list_model = create_list_model(discard_model) + # violate data type + items = validate_items( + "items", + discard_list_model, + [{"b": True}, {"b": 2, "opt": "not int", "extra": 1.2}, 
{"b": 3}, {"b": False}], + "discard_row", "discard_row" + ) + # {"b": 2, "opt": "not int", "extra": 1.2} - note that this will generate 3 errors for the same item + # and is crucial in our tests when discarding rows + assert len(items) == 2 + assert items[0].b is True + assert items[1].b is False + # violate extra field + items = validate_items("items", discard_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "discard_row") + assert len(items) == 1 + assert items[0].b is True - item = ItemModel(b=True) - print(ItemModel.dlt_config) - print(item.dlt_config) + # freeze on non validating items (both extra and declared) + freeze_model = apply_schema_contract_to_model(ItemModel, "freeze", "freeze") + freeze_list_model = create_list_model(freeze_model) + # violate data type + with pytest.raises(DataValidationError) as val_ex: + validate_items("items", freeze_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(("items", 1 , 'b')) # pydantic location + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_list_model + assert val_ex.value.data_item == {"b": 2} + # extra type + with pytest.raises(DataValidationError) as val_ex: + validate_items("items", freeze_list_model, [{"b": True}, {"a": 2, "b": False}, {"b": 3}, {"b": False}], "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(("items", 1 , 'a')) # pydantic location + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_list_model + assert val_ex.value.data_item == {"a": 2, "b": False} + + # discard values + discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze") + discard_list_model = create_list_model(discard_value_model) + # violate extra field + items = validate_items("items", discard_list_model, [{"b": True}, {"b": False, "a": False}], "discard_value", "freeze") + assert len(items) == 2 + # "a" extra got remove + assert items[1].dict() == {"b": False, "opt": None} + # violate data type + with pytest.raises(NotImplementedError): + apply_schema_contract_to_model(ItemModel, "discard_value", "discard_value") + + # evolve data types and extras + evolve_model = apply_schema_contract_to_model(ItemModel, "evolve", "evolve") + evolve_list_model = create_list_model(evolve_model) + # for data types a lenient model will be created that accepts any type + items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "evolve", "evolve") + assert len(items) == 4 + assert items[0].b is True + assert items[1].b == 2 + # extra fields allowed + items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "evolve", "evolve") + assert len(items) == 4 + assert items[3].b is False + assert items[3].a is False # type: ignore[attr-defined] + + # accept new types but discard new columns + mixed_model = apply_schema_contract_to_model(ItemModel, "discard_row", "evolve") + mixed_list_model = create_list_model(mixed_model) + # for data types a lenient model will be created that accepts any type + items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": 
False}], "discard_row", "evolve") + assert len(items) == 4 + assert items[0].b is True + assert items[1].b == 2 + # extra fields forbidden - full rows discarded + items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "evolve") + assert len(items) == 3 - #ItemRootModel = RootModel(bool) - list_model = create_list_model(ItemModel) - list_model = apply_schema_contract_to_model(list_model, "freeze", "discard_row") +def test_item_validation() -> None: - items = validate_items(list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "freeze", "discard_row") - assert len(items) == 2 - assert items[0].b is True - assert items[1].b is False \ No newline at end of file + class ItemModel(BaseModel): + b: bool + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": False} + + + + # non validating items removed from the list (both extra and declared) + discard_model = apply_schema_contract_to_model(ItemModel, "discard_row", "discard_row") + # violate data type + assert validate_item("items", discard_model, {"b": 2}, "discard_row", "discard_row") is None + # violate extra field + assert validate_item("items", discard_model, {"b": False, "a": False}, "discard_row", "discard_row") is None + + # freeze on non validating items (both extra and declared) + freeze_model = apply_schema_contract_to_model(ItemModel, "freeze", "freeze") + # violate data type + with pytest.raises(DataValidationError) as val_ex: + validate_item("items", freeze_model, {"b": 2}, "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(('b',)) # pydantic location + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_model + assert val_ex.value.data_item == {"b": 2} + # extra type + with pytest.raises(DataValidationError) as val_ex: + validate_item("items", freeze_model, {"a": 2, "b": False}, "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(('a',)) # pydantic location + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_model + assert val_ex.value.data_item == {"a": 2, "b": False} + + # discard values + discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze") + # violate extra field + item = validate_item("items", discard_value_model, {"b": False, "a": False}, "discard_value", "freeze") + # "a" extra got removed + assert item.dict() == {"b": False} + + # evolve data types and extras + evolve_model = apply_schema_contract_to_model(ItemModel, "evolve", "evolve") + # for data types a lenient model will be created that accepts any type + item = validate_item("items", evolve_model, {"b": 2}, "evolve", "evolve") + assert item.b == 2 + # extra fields allowed + item = validate_item("items", evolve_model, {"b": False, "a": False}, "evolve", "evolve") + assert item.b is False + assert item.a is False # type: ignore[attr-defined] + + # accept new types but discard new columns + mixed_model = apply_schema_contract_to_model(ItemModel, "discard_row", "evolve") + # for data types a lenient model will be created that accepts any type + item = validate_item("items", mixed_model, {"b": 3}, "discard_row", "evolve") + assert item.b == 3 + # extra fields forbidden - full rows 
discarded + assert validate_item("items", mixed_model, {"b": False, "a": False}, "discard_row", "evolve") is None diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 2a20db62b4..4354460374 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -6,7 +6,7 @@ import pytest import dlt -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.common.utils import uniq_id from dlt.pipeline import helpers, state_sync, Pipeline from dlt.load import Load diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index fbc5088ab2..4e8d1f9049 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -13,7 +13,7 @@ from dlt.common.pipeline import StateInjectableContext from dlt.common.typing import AnyFun, StrAny from dlt.common.utils import digest128 -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.sources.helpers.transform import skip_first, take_first from tests.pipeline.utils import assert_load_info diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 2fc4aad1a8..004aac0285 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -14,7 +14,7 @@ from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceNameMissing -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.pipeline.exceptions import CannotRestorePipelineException, PipelineConfigMissing, PipelineStepFailed from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.exceptions import DestinationHasFailedJobs diff --git a/tests/load/pipeline/utils.py b/tests/load/pipeline/utils.py index 113585f669..94fbc80cf8 100644 --- a/tests/load/pipeline/utils.py +++ b/tests/load/pipeline/utils.py @@ -1,23 +1,20 @@ -import posixpath, os -from typing import Any, Iterator, List, Sequence, TYPE_CHECKING, Optional, Tuple, Dict, Callable +from typing import Any, Iterator, List, Sequence, TYPE_CHECKING, Callable import pytest import dlt from dlt.common.destination.reference import WithStagingDataset -from dlt.pipeline.pipeline import Pipeline -from dlt.common import json from dlt.common.configuration.container import Container from dlt.common.pipeline import LoadInfo, PipelineContext -from dlt.common.typing import DictStrAny -from dlt.pipeline.exceptions import SqlClientNotAvailable -from dlt.common.schema.typing import LOADS_TABLE_NAME +from tests.pipeline.utils import (load_table_counts, load_data_table_counts, assert_data_table_counts, load_file, + load_files, load_tables_to_dicts, load_table_distinct_counts) from tests.load.utils import DestinationTestConfiguration, destinations_configs if TYPE_CHECKING: from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + @pytest.fixture(autouse=True) def drop_pipeline(request) -> Iterator[None]: yield @@ -120,149 +117,3 @@ def assert_query_data(p: dlt.Pipeline, sql: str, table_data: List[Any], schema_n # the second is load id if info: assert row[1] in info.loads_ids - - -def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]: - """ - util function to load a filesystem destination file and return parsed content - values may not be cast to the right type, especially for insert_values, please - make sure to do conversions and casting if needed 
in your tests - """ - result: List[Dict[str, Any]] = [] - - # check if this is a file we want to read - file_name_items = file.split(".") - ext = file_name_items[-1] - if ext not in ["jsonl", "insert_values", "parquet"]: - return "skip", [] - - # table name will be last element of path - table_name = path.split("/")[-1] - - # skip loads table - if table_name == "_dlt_loads": - return table_name, [] - - full_path = posixpath.join(path, file) - - # load jsonl - if ext == "jsonl": - with open(full_path, "rU", encoding="utf-8") as f: - for line in f: - result.append(json.loads(line)) - - # load insert_values (this is a bit volatile if the exact format of the source file changes) - elif ext == "insert_values": - with open(full_path, "rU", encoding="utf-8") as f: - lines = f.readlines() - # extract col names - cols = lines[0][15:-2].split(",") - for line in lines[2:]: - values = line[1:-3].split(",") - result.append(dict(zip(cols, values))) - - # load parquet - elif ext == "parquet": - import pyarrow.parquet as pq - with open(full_path, "rb") as f: - table = pq.read_table(f) - cols = table.column_names - count = 0 - for column in table: - column_name = cols[count] - item_count = 0 - for item in column.to_pylist(): - if len(result) <= item_count: - result.append({column_name: item}) - else: - result[item_count][column_name] = item - item_count += 1 - count += 1 - - return table_name, result - - -def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" - client: FilesystemClient = p.destination_client() # type: ignore[assignment] - result: Dict[str, Any] = {} - for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): - for file in files: - table_name, items = load_file(basedir, file) - if table_name not in table_names: - continue - if table_name in result: - result[table_name] = result[table_name] + items - else: - result[table_name] = items - - # loads file is special case - if LOADS_TABLE_NAME in table_names and file.find(".{LOADS_TABLE_NAME}."): - result[LOADS_TABLE_NAME] = [] - - return result - - -def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: - """Returns row counts for `table_names` as dict""" - - # try sql, could be other destination though - try: - with p.sql_client() as c: - qualified_names = [c.make_qualified_table_name(name) for name in table_names] - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" for name, q_name in zip(table_names, qualified_names)]) - with c.execute_query(query) as cur: - rows = list(cur.fetchall()) - return {r[0]: r[1] for r in rows} - except SqlClientNotAvailable: - pass - - # try filesystem - file_tables = load_files(p, *table_names) - result = {} - for table_name, items in file_tables.items(): - result[table_name] = len(items) - return result - -def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: - tables = [table["name"] for table in p.default_schema.data_tables()] - return load_table_counts(p, *tables) - - -def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> None: - table_counts = load_data_table_counts(p) - assert table_counts == expected_counts, f"Table counts do not match, expected {expected_counts}, got {table_counts}" - - -def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - - # try sql, could be other destination 
though - try: - result = {} - for table_name in table_names: - table_rows = [] - columns = p.default_schema.get_table_columns(table_name).keys() - query_columns = ",".join(columns) - - with p.sql_client() as c: - f_q_table_name = c.make_qualified_table_name(table_name) - query = f"SELECT {query_columns} FROM {f_q_table_name}" - with c.execute_query(query) as cur: - for row in list(cur.fetchall()): - table_rows.append(dict(zip(columns, row))) - result[table_name] = table_rows - return result - - except SqlClientNotAvailable: - pass - - # try files - return load_files(p, *table_names) - -def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_names: str) -> DictStrAny: - """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" for name in table_names]) - with p.sql_client() as c: - with c.execute_query(query) as cur: - rows = list(cur.fetchall()) - return {r[0]: r[1] for r in rows} diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 5aab70a11c..309511b95f 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -24,7 +24,7 @@ from dlt.destinations import filesystem, redshift, dummy from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractorStorage -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.load.exceptions import LoadClientJobFailed from dlt.pipeline.exceptions import InvalidPipelineName, PipelineNotActive, PipelineStepFailed from dlt.pipeline.helpers import retry_load diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py index 7870918c78..d29bac13f2 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -1,20 +1,22 @@ import os -from typing import Any, Dict, Iterator, Optional +from typing import Any, ClassVar, Dict, Iterator, List, Optional import pytest from pydantic import BaseModel import dlt -from dlt.common import json +from dlt.common import json, pendulum from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.libs.pydantic import DltConfig from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector from dlt.extract.storage import ExtractorStorage +from dlt.extract.validation import PydanticValidator from dlt.pipeline import TCollectorArg from tests.extract.utils import expect_extracted_file from tests.load.utils import DestinationTestConfiguration, destinations_configs -from tests.pipeline.utils import many_delayed +from tests.pipeline.utils import assert_load_info, load_data_table_counts, many_delayed @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) @@ -86,6 +88,63 @@ class Columns(BaseModel): assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True +@pytest.mark.parametrize("yield_list", [True, False]) +def test_pydantic_columns_with_contracts(yield_list: bool) -> None: + from datetime import datetime # noqa + + class UserLabel(BaseModel): + label: str + + class User(BaseModel): + user_id: int + name: str + created_at: datetime + labels: List[str] + user_label: UserLabel + user_labels: List[UserLabel] + + 
dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + user = User( + user_id=1, + name="u1", + created_at=pendulum.now(), + labels=["l1", "l2"], + user_label=UserLabel(label="in_l1"), + user_labels=[UserLabel(label="l_l1"), UserLabel(label="l_l1")] + ) + + @dlt.resource(columns=User) + def users(users_list: List[Any]) -> Iterator[Any]: + if yield_list: + yield users_list + else: + yield from users_list + + pipeline = dlt.pipeline(destination='duckdb') + info = pipeline.run(users([user.dict(), user.dict()])) + assert_load_info(info) + print(pipeline.last_trace.last_normalize_info) + # data is passing validation, all filled in + assert load_data_table_counts(pipeline) == {"users": 2, "users__labels": 4, "users__user_labels": 4} + + # produce two users with extra attrs in the child model but set the rows to discard so nothing is loaded + u1 = user.dict() + u1["user_labels"][0]["extra_1"] = "extra" + u1["user_labels"][1]["extra_1"] = "extra" + u2 = user.dict() + u2["user_labels"][0]["is_extra"] = True + + r = users([u1, u2]) + r.apply_hints(schema_contract="discard_row") + validator: PydanticValidator[User] = r.validator # type: ignore[assignment] + assert validator.data_mode == "discard_row" + assert validator.column_mode == "discard_row" + pipeline.run(r) + assert load_data_table_counts(pipeline) == {"users": 2, "users__labels": 4, "users__user_labels": 4} + print(pipeline.last_trace.last_normalize_info) + + def test_extract_pydantic_models() -> None: pipeline = dlt.pipeline(destination='duckdb') diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 706644b60e..cd3e2444c8 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -22,7 +22,7 @@ from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.trace import PipelineTrace, SerializableResolvedValueTrace, describe_extract_data, load_trace from dlt.pipeline.track import slack_notify_load_success -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.extract.pipe import Pipe from tests.utils import start_test_telemetry diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py index 21b2adc699..93a5abf44c 100644 --- a/tests/pipeline/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -4,11 +4,11 @@ from dlt.common.schema.typing import TSchemaContract from dlt.common.utils import uniq_id -from dlt.extract.source import DltSource, DltResource +from dlt.common.schema.exceptions import DataValidationError + +from dlt.extract import DltResource from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.exceptions import PipelineStepFailed -from dlt.common.schema.exceptions import SchemaFrozenException -from dlt.common.schema import utils from tests.load.pipeline.utils import load_table_counts from tests.utils import TDataItemFormat, skip_if_not_active, data_to_item_format, ALL_DATA_ITEM_FORMATS @@ -27,7 +27,7 @@ def raises_frozen_exception(check_raise: bool = True) -> Any: return with pytest.raises(PipelineStepFailed) as py_exc: yield - assert isinstance(py_exc.value.__context__, SchemaFrozenException) + assert isinstance(py_exc.value.__context__, DataValidationError) def items(settings: TSchemaContract) -> Any: diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 53513103a7..0d36ff3021 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -1,3 +1,5 @@ +import posixpath +from typing import Any, Dict, 
List, Tuple import pytest import random from os import environ @@ -5,7 +7,10 @@ import dlt from dlt.common import json, sleep from dlt.common.pipeline import LoadInfo +from dlt.common.schema.typing import LOADS_TABLE_NAME from dlt.common.typing import DictStrAny +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient +from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import TEST_STORAGE_ROOT @@ -36,6 +41,154 @@ def load_json_case(name: str) -> DictStrAny: return json.load(f) +def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: + """Returns row counts for `table_names` as dict""" + + # try sql, could be other destination though + try: + with p.sql_client() as c: + qualified_names = [c.make_qualified_table_name(name) for name in table_names] + query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" for name, q_name in zip(table_names, qualified_names)]) + with c.execute_query(query) as cur: + rows = list(cur.fetchall()) + return {r[0]: r[1] for r in rows} + except SqlClientNotAvailable: + pass + + # try filesystem + file_tables = load_files(p, *table_names) + result = {} + for table_name, items in file_tables.items(): + result[table_name] = len(items) + return result + +def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: + tables = [table["name"] for table in p.default_schema.data_tables()] + return load_table_counts(p, *tables) + + +def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> None: + table_counts = load_data_table_counts(p) + assert table_counts == expected_counts, f"Table counts do not match, expected {expected_counts}, got {table_counts}" + + +def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]: + """ + util function to load a filesystem destination file and return parsed content + values may not be cast to the right type, especially for insert_values, please + make sure to do conversions and casting if needed in your tests + """ + result: List[Dict[str, Any]] = [] + + # check if this is a file we want to read + file_name_items = file.split(".") + ext = file_name_items[-1] + if ext not in ["jsonl", "insert_values", "parquet"]: + return "skip", [] + + # table name will be last element of path + table_name = path.split("/")[-1] + + # skip loads table + if table_name == "_dlt_loads": + return table_name, [] + + full_path = posixpath.join(path, file) + + # load jsonl + if ext == "jsonl": + with open(full_path, "rU", encoding="utf-8") as f: + for line in f: + result.append(json.loads(line)) + + # load insert_values (this is a bit volatile if the exact format of the source file changes) + elif ext == "insert_values": + with open(full_path, "rU", encoding="utf-8") as f: + lines = f.readlines() + # extract col names + cols = lines[0][15:-2].split(",") + for line in lines[2:]: + values = line[1:-3].split(",") + result.append(dict(zip(cols, values))) + + # load parquet + elif ext == "parquet": + import pyarrow.parquet as pq + with open(full_path, "rb") as f: + table = pq.read_table(f) + cols = table.column_names + count = 0 + for column in table: + column_name = cols[count] + item_count = 0 + for item in column.to_pylist(): + if len(result) <= item_count: + result.append({column_name: item}) + else: + result[item_count][column_name] = item + item_count += 1 + count += 1 + + return table_name, result + + +def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: + """For now this will expect the standard layout in the 
filesystem destination, if changed the results will not be correct""" + client: FilesystemClient = p.destination_client() # type: ignore[assignment] + result: Dict[str, Any] = {} + for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): + for file in files: + table_name, items = load_file(basedir, file) + if table_name not in table_names: + continue + if table_name in result: + result[table_name] = result[table_name] + items + else: + result[table_name] = items + + # loads file is special case + if LOADS_TABLE_NAME in table_names and file.find(".{LOADS_TABLE_NAME}."): + result[LOADS_TABLE_NAME] = [] + + return result + + + +def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: + + # try sql, could be other destination though + try: + result = {} + for table_name in table_names: + table_rows = [] + columns = p.default_schema.get_table_columns(table_name).keys() + query_columns = ",".join(columns) + + with p.sql_client() as c: + f_q_table_name = c.make_qualified_table_name(table_name) + query = f"SELECT {query_columns} FROM {f_q_table_name}" + with c.execute_query(query) as cur: + for row in list(cur.fetchall()): + table_rows.append(dict(zip(columns, row))) + result[table_name] = table_rows + return result + + except SqlClientNotAvailable: + pass + + # try files + return load_files(p, *table_names) + + +def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_names: str) -> DictStrAny: + """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" + query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" for name in table_names]) + with p.sql_client() as c: + with c.execute_query(query) as cur: + rows = list(cur.fetchall()) + return {r[0]: r[1] for r in rows} + + @dlt.source def airtable_emojis(): From f4d2cac1f14e18f26db7df38b6b9e3c1896def89 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 20 Nov 2023 00:27:34 +0100 Subject: [PATCH 72/73] temp disable pydantic 1 tests --- .github/workflows/test_common.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index ec97aac304..24c8215c2b 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -103,18 +103,18 @@ jobs: name: Run extract tests Windows shell: cmd - - name: Install Pydantic 1.0 - run: pip install "pydantic<2" - - - run: | - poetry run pytest tests/libs - if: runner.os != 'Windows' - name: Run extract and pipeline tests Linux/MAC - - run: | - poetry run pytest tests/libs - if: runner.os == 'Windows' - name: Run extract tests Windows - shell: cmd + # - name: Install Pydantic 1.0 + # run: pip install "pydantic<2" + + # - run: | + # poetry run pytest tests/libs + # if: runner.os != 'Windows' + # name: Run extract and pipeline tests Linux/MAC + # - run: | + # poetry run pytest tests/libs + # if: runner.os == 'Windows' + # name: Run extract tests Windows + # shell: cmd matrix_job_required_check: name: Common tests From f556584240337179adec95ff5486cb9c819def25 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 20 Nov 2023 18:21:50 +0100 Subject: [PATCH 73/73] fixes generic type parametrization on 3.8 --- dlt/common/libs/pydantic.py | 32 +++++++++++++++++++++++++------- tests/libs/test_pydantic.py | 34 +++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 8 deletions(-) diff 
--git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index 9af7104710..1b65fa3a7e 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -1,6 +1,7 @@ +from __future__ import annotations import inspect from copy import copy -from typing import Generic, Sequence, Set, TypedDict, List, Type, Union, TypeVar, get_origin, get_type_hints, get_args, Any +from typing import Dict, Generic, Set, TypedDict, List, Type, Union, TypeVar, get_origin, get_args, Any from dlt.common.exceptions import MissingDependencyException from dlt.common.schema import DataValidationError @@ -161,19 +162,36 @@ def apply_schema_contract_to_model( config = copy(model.Config) # type: ignore[attr-defined] config.extra = extra # type: ignore[attr-defined] - def _process_annotation(t_: Type[Any]) -> Any: + _child_models: Dict[int, Type[BaseModel]] = {} + + def _process_annotation(t_: Type[Any]) -> Type[Any]: """Recursively recreates models with applied schema contract """ if is_list_generic_type(t_): - l_t = get_args(t_)[0] - return get_origin(t_)[_process_annotation(l_t)] + l_t: Type[Any] = get_args(t_)[0] + try: + return get_origin(t_)[_process_annotation(l_t)] # type: ignore[no-any-return] + except TypeError: + # this is Python3.8 fallback. it does not support indexers on types + return List[_process_annotation(l_t)] # type: ignore elif is_dict_generic_type(t_): + k_t: Type[Any] + v_t: Type[Any] k_t, v_t = get_args(t_) - return get_origin(t_)[k_t, _process_annotation(v_t)] + try: + return get_origin(t_)[k_t, _process_annotation(v_t)] # type: ignore[no-any-return] + except TypeError: + # this is Python3.8 fallback. it does not support indexers on types + return Dict[k_t, _process_annotation(v_t)] # type: ignore elif is_union(t_): u_t_s = tuple(_process_annotation(u_t) for u_t in extract_union_types(t_)) - return Union[u_t_s] + return Union[u_t_s] # type: ignore[return-value] elif inspect.isclass(t_) and issubclass(t_, BaseModel): - return apply_schema_contract_to_model(t_, column_mode, data_mode) + # types must be same before and after processing + if id(t_) in _child_models: + return _child_models[id(t_)] + else: + _child_models[id(t_)] = child_model = apply_schema_contract_to_model(t_, column_mode, data_mode) + return child_model return t_ new_model: Type[_TPydanticModel] = create_model( # type: ignore[call-overload] diff --git a/tests/libs/test_pydantic.py b/tests/libs/test_pydantic.py index 48227577c4..5606bd25b2 100644 --- a/tests/libs/test_pydantic.py +++ b/tests/libs/test_pydantic.py @@ -1,6 +1,6 @@ from copy import copy import pytest -from typing import ClassVar, Union, Optional, List, Dict, Any +from typing import ClassVar, Sequence, Mapping, Dict, MutableMapping, MutableSequence, Union, Optional, List, Dict, Any from enum import Enum from datetime import datetime, date, time # noqa: I251 @@ -205,6 +205,38 @@ def test_model_for_column_mode() -> None: apply_schema_contract_to_model(ModelWithConfig, "evolve", "discard_value") +def test_nested_model_config_propagation() -> None: + class UserLabel(BaseModel): + label: str + + class UserAddress(BaseModel): + street: str + zip_code: Sequence[int] + label: Optional[UserLabel] + ro_labels: Mapping[str, UserLabel] + wr_labels: MutableMapping[str, List[UserLabel]] + ro_list: Sequence[UserLabel] + wr_list: MutableSequence[Dict[str, UserLabel]] + + class User(BaseModel): + user_id: int + name: str + created_at: datetime + labels: List[str] + user_label: UserLabel + user_labels: List[UserLabel] + address: UserAddress + unity: 
Union[UserAddress, UserLabel, Dict[str, UserAddress]] + + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + model_freeze = apply_schema_contract_to_model(User, "evolve", "freeze") + from typing import get_type_hints + print(get_type_hints(model_freeze)) + print(get_type_hints(model_freeze.model_fields["address"].annotation)) + + + def test_item_list_validation() -> None: class ItemModel(BaseModel):