From f4faa836df37cf810b2eb5b8ba754aa80f946719 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Mon, 2 Dec 2024 16:24:57 +0100 Subject: [PATCH 1/2] #2087 allows double underscores in identifiers (#2098) * removes astunparse and aiohttp * allows for built-in ast unparse if present * uses break path for normalization to allow names containing path separators, migrates old schema to enable compat mode with old behavior * adds removeprefix util * updates docs * bumps dlt to version 1.4.1 * linter fixes * fixes tests * fixes and tests saving pandas indexes * fixes sqlite read interface tests * updates docs --- dlt/cli/deploy_command_helpers.py | 13 +- dlt/cli/source_detection.py | 5 +- dlt/common/destination/reference.py | 1 - dlt/common/libs/pandas.py | 5 +- dlt/common/normalizers/json/helpers.py | 141 + dlt/common/normalizers/json/relational.py | 172 +- dlt/common/normalizers/naming/naming.py | 2 + dlt/common/normalizers/typing.py | 2 + dlt/common/reflection/utils.py | 14 +- dlt/common/schema/configuration.py | 2 + dlt/common/schema/migrations.py | 7 +- dlt/common/schema/normalizers.py | 7 +- dlt/common/schema/schema.py | 26 +- dlt/common/schema/typing.py | 2 +- dlt/common/utils.py | 5 + dlt/destinations/dataset.py | 6 +- .../impl/clickhouse/sql_client.py | 6 +- .../impl/filesystem/filesystem.py | 3 +- dlt/extract/extractors.py | 14 +- dlt/normalize/worker.py | 5 +- dlt/reflection/script_visitor.py | 9 +- dlt/sources/sql_database/arrow_helpers.py | 5 +- .../dlt-ecosystem/destinations/filesystem.md | 2 +- .../verified-sources/arrow-pandas.md | 2 + .../docs/general-usage/naming-convention.md | 39 + mypy.ini | 2 +- poetry.lock | 110 +- pyproject.toml | 5 +- .../cases/schemas/eth/ethereum_schema_v11.yml | 394 +++ .../cases/schemas/github/issues.schema.json | 2404 ++++++++--------- .../normalizers/test_json_relational.py | 10 +- .../normalizers/test_naming_snake_case.py | 8 + .../common/schema/test_import_normalizers.py | 36 +- .../schema/test_normalize_identifiers.py | 62 +- tests/common/schema/test_schema.py | 20 +- tests/common/schema/test_versioning.py | 12 +- tests/common/storages/test_schema_storage.py | 12 +- tests/common/storages/utils.py | 4 +- tests/common/test_utils.py | 9 + tests/common/test_validation.py | 2 +- tests/common/utils.py | 6 +- .../cases/eth_source/ethereum.schema.yaml | 4 +- tests/extract/test_decorators.py | 4 +- tests/extract/test_incremental.py | 76 +- tests/libs/pyarrow/test_pyarrow_normalizer.py | 4 +- .../test_clickhouse_configuration.py | 26 +- tests/load/conftest.py | 2 +- tests/load/duckdb/test_duckdb_client.py | 2 +- tests/load/filesystem/test_aws_credentials.py | 1 - .../load/filesystem/test_filesystem_common.py | 1 - tests/load/pipeline/conftest.py | 2 +- tests/load/pipeline/test_merge_disposition.py | 2 +- tests/load/pipeline/test_scd2.py | 3 +- tests/load/qdrant/utils.py | 1 - tests/load/redshift/test_redshift_client.py | 2 +- tests/load/test_job_client.py | 2 +- tests/load/test_read_interfaces.py | 11 +- tests/load/test_sql_client.py | 2 +- tests/load/weaviate/utils.py | 1 - .../cases/github_pipeline/github_rev.py | 26 + tests/pipeline/test_dlt_versions.py | 56 + .../test_max_nesting.py | 0 tests/pipeline/test_pipeline.py | 105 + 63 files changed, 2203 insertions(+), 1721 deletions(-) create mode 100644 dlt/common/normalizers/json/helpers.py create mode 100644 tests/common/cases/schemas/eth/ethereum_schema_v11.yml create mode 100644 tests/pipeline/cases/github_pipeline/github_rev.py rename tests/{normalize => pipeline}/test_max_nesting.py (100%) diff --git 
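Before the file-by-file diff, a minimal sketch (not part of the patch) of the behavior this PR enables. It assumes the snake case `NamingConvention` from `dlt.common.normalizers.naming.snake_case` and the `make_path`/`normalize_path` methods that appear in the diffs below; treat exact import paths and constructor defaults as assumptions:

```python
# Illustrative sketch: compound identifiers survive normalization after this PR.
from dlt.common.normalizers.naming.snake_case import NamingConvention

naming = NamingConvention()

# "__" is the path separator used for nested tables and flattened columns
assert naming.make_path("column", "value") == "column__value"

# With break-path normalization (the new default) the path is split into
# fragments, each fragment is normalized separately and the path is re-joined,
# so an already normalized identifier round-trips unchanged instead of having
# its double underscore contracted to a single one.
assert naming.normalize_path("column__value") == "column__value"
```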
a/dlt/cli/deploy_command_helpers.py b/dlt/cli/deploy_command_helpers.py index b508b32226..e3719fbe38 100644 --- a/dlt/cli/deploy_command_helpers.py +++ b/dlt/cli/deploy_command_helpers.py @@ -5,7 +5,6 @@ from yaml import Dumper from itertools import chain from typing import List, Optional, Sequence, Tuple, Any, Dict -from astunparse import unparse # optional dependencies import pipdeptree @@ -23,7 +22,7 @@ from dlt.common.git import get_origin, get_repo, Repo from dlt.common.configuration.specs.runtime_configuration import get_default_pipeline_name from dlt.common.typing import StrAny -from dlt.common.reflection.utils import evaluate_node_literal +from dlt.common.reflection.utils import evaluate_node_literal, ast_unparse from dlt.common.pipeline import LoadInfo, TPipelineState, get_dlt_repos_dir from dlt.common.storages import FileStorage from dlt.common.utils import set_working_dir @@ -313,7 +312,7 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio if f_r_value is None: fmt.warning( "The value of `dev_mode` in call to `dlt.pipeline` cannot be" - f" determined from {unparse(f_r_node).strip()}. We assume that you know" + f" determined from {ast_unparse(f_r_node).strip()}. We assume that you know" " what you are doing :)" ) if f_r_value is True: @@ -331,8 +330,8 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio raise CliCommandInnerException( "deploy", "The value of 'pipelines_dir' argument in call to `dlt_pipeline` cannot be" - f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" - " be found. Pass it directly with --pipelines-dir option.", + f" determined from {ast_unparse(p_d_node).strip()}. Pipeline working dir" + " will be found. Pass it directly with --pipelines-dir option.", ) p_n_node = call_args.arguments.get("pipeline_name") @@ -342,8 +341,8 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio raise CliCommandInnerException( "deploy", "The value of 'pipeline_name' argument in call to `dlt_pipeline` cannot be" - f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" - " be found. Pass it directly with --pipeline-name option.", + f" determined from {ast_unparse(p_d_node).strip()}. Pipeline working dir" + " will be found. 
Pass it directly with --pipeline-name option.", ) pipelines.append((pipeline_name, pipelines_dir)) diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index f4e9b3e050..7067f8b896 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -1,11 +1,10 @@ import ast import inspect -from astunparse import unparse from typing import Dict, Tuple, Set, List from dlt.common.configuration import is_secret_hint from dlt.common.configuration.specs import BaseConfiguration -from dlt.common.reflection.utils import creates_func_def_name_node +from dlt.common.reflection.utils import creates_func_def_name_node, ast_unparse from dlt.common.typing import is_optional_type from dlt.sources import SourceReference @@ -65,7 +64,7 @@ def find_source_calls_to_replace( for calls in visitor.known_sources_resources_calls.values(): for call in calls: transformed_nodes.append( - (call.func, ast.Name(id=pipeline_name + "_" + unparse(call.func))) + (call.func, ast.Name(id=pipeline_name + "_" + ast_unparse(call.func))) ) return transformed_nodes diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index d1024eb28c..e27f99cde7 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -81,7 +81,6 @@ DataFrame = Any ArrowTable = Any IbisBackend = Any - else: DataFrame = Any ArrowTable = Any diff --git a/dlt/common/libs/pandas.py b/dlt/common/libs/pandas.py index a165ea8747..35cfe623bb 100644 --- a/dlt/common/libs/pandas.py +++ b/dlt/common/libs/pandas.py @@ -8,8 +8,9 @@ raise MissingDependencyException("dlt Pandas Helpers", ["pandas"]) -def pandas_to_arrow(df: pandas.DataFrame) -> Any: +def pandas_to_arrow(df: pandas.DataFrame, preserve_index: bool = False) -> Any: """Converts pandas to arrow or raises an exception if pyarrow is not installed""" from dlt.common.libs.pyarrow import pyarrow as pa - return pa.Table.from_pandas(df) + # NOTE: None preserves named indexes but ignores unnamed + return pa.Table.from_pandas(df, preserve_index=preserve_index) diff --git a/dlt/common/normalizers/json/helpers.py b/dlt/common/normalizers/json/helpers.py new file mode 100644 index 0000000000..96c9ab4954 --- /dev/null +++ b/dlt/common/normalizers/json/helpers.py @@ -0,0 +1,141 @@ +""" +Cached helper methods for all operations that are called often +""" +from functools import lru_cache +from typing import Any, Dict, List, Optional, Tuple, cast + +from dlt.common.json import json +from dlt.common.destination.utils import resolve_merge_strategy +from dlt.common.normalizers.naming import NamingConvention +from dlt.common.normalizers.typing import TRowIdType +from dlt.common.normalizers.utils import DLT_ID_LENGTH_BYTES +from dlt.common.schema import Schema +from dlt.common.schema.typing import TColumnSchema, C_DLT_ID, DLT_NAME_PREFIX +from dlt.common.schema.utils import ( + get_columns_names_with_prop, + get_first_column_name_with_prop, + is_nested_table, +) +from dlt.common.utils import digest128 + + +@lru_cache(maxsize=None) +def shorten_fragments(naming: NamingConvention, *idents: str) -> str: + return naming.shorten_fragments(*idents) + + +@lru_cache(maxsize=None) +def normalize_table_identifier(schema: Schema, naming: NamingConvention, table_name: str) -> str: + if schema._normalizers_config.get("use_break_path_on_normalize", True): + return naming.normalize_tables_path(table_name) + else: + return naming.normalize_table_identifier(table_name) + + +@lru_cache(maxsize=None) +def normalize_identifier(schema: Schema, naming: 
NamingConvention, identifier: str) -> str: + if schema._normalizers_config.get("use_break_path_on_normalize", True): + return naming.normalize_path(identifier) + else: + return naming.normalize_identifier(identifier) + + +@lru_cache(maxsize=None) +def get_table_nesting_level( + schema: Schema, table_name: str, default_nesting: int = 1000 +) -> Optional[int]: + """gets table nesting level, will inherit from parent if not set""" + + table = schema.tables.get(table_name) + if ( + table + and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))) is not None + ): + return max_nesting + return default_nesting + + +@lru_cache(maxsize=None) +def get_primary_key(schema: Schema, table_name: str) -> List[str]: + if table_name not in schema.tables: + return [] + table = schema.get_table(table_name) + return get_columns_names_with_prop(table, "primary_key", include_incomplete=True) + + +@lru_cache(maxsize=None) +def is_nested_type( + schema: Schema, + table_name: str, + field_name: str, + _r_lvl: int, +) -> bool: + """For those paths the nested objects should be left in place. + Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster + """ + + # nesting level is counted backwards + # if we have traversed to or beyond the calculated nesting level, we detect a nested type + if _r_lvl <= 0: + return True + + column: TColumnSchema = None + table = schema.tables.get(table_name) + if table: + column = table["columns"].get(field_name) + if column is None or "data_type" not in column: + data_type = schema.get_preferred_type(field_name) + else: + data_type = column["data_type"] + + return data_type == "json" + + +@lru_cache(maxsize=None) +def get_nested_row_id_type(schema: Schema, table_name: str) -> Tuple[TRowIdType, bool]: + """Gets type of row id to be added to nested table and if linking information should be added""" + if table := schema.tables.get(table_name): + merge_strategy = resolve_merge_strategy(schema.tables, table) + if merge_strategy not in ("upsert", "scd2") and not is_nested_table(table): + return "random", False + else: + # table will be created, use standard linking + pass + return "row_hash", True + + +@lru_cache(maxsize=None) +def get_root_row_id_type(schema: Schema, table_name: str) -> TRowIdType: + if table := schema.tables.get(table_name): + merge_strategy = resolve_merge_strategy(schema.tables, table) + if merge_strategy == "upsert": + return "key_hash" + elif merge_strategy == "scd2": + x_row_version_col = get_first_column_name_with_prop( + schema.get_table(table_name), + "x-row-version", + include_incomplete=True, + ) + if x_row_version_col == schema.naming.normalize_identifier(C_DLT_ID): + return "row_hash" + return "random" + + +def get_row_hash(row: Dict[str, Any], subset: Optional[List[str]] = None) -> str: + """Returns hash of row. + + Hash includes column names and values and is ordered by column name. + Excludes dlt system columns. + Can be used as deterministic row identifier. 
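+    When no subset is given, keys are sorted and dlt system columns are excluded before hashing, so rows that differ only in key order or in system columns hash identically.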
+ """ + row_filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)} + if subset is not None: + row_filtered = {k: v for k, v in row.items() if k in subset} + row_str = json.dumps(row_filtered, sort_keys=True) + return digest128(row_str, DLT_ID_LENGTH_BYTES) + + +def get_nested_row_hash(parent_row_id: str, nested_table: str, list_idx: int) -> str: + # create deterministic unique id of the nested row taking into account that all lists are ordered + # and all nested tables must be lists + return digest128(f"{parent_row_id}_{nested_table}_{list_idx}", DLT_ID_LENGTH_BYTES) diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index c5338192a0..e365017125 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -1,34 +1,27 @@ -from functools import lru_cache from typing import Dict, List, Mapping, Optional, Sequence, Tuple, cast, TypedDict, Any -from dlt.common.destination.utils import resolve_merge_strategy -from dlt.common.json import json -from dlt.common.normalizers.exceptions import InvalidJsonNormalizer -from dlt.common.normalizers.typing import TJSONNormalizer, TRowIdType -from dlt.common.normalizers.utils import generate_dlt_id, DLT_ID_LENGTH_BYTES +from dlt.common.normalizers.exceptions import InvalidJsonNormalizer +from dlt.common.normalizers.typing import TJSONNormalizer +from dlt.common.normalizers.utils import generate_dlt_id from dlt.common.typing import DictStrAny, TDataItem, StrAny from dlt.common.schema import Schema from dlt.common.schema.typing import ( C_DLT_ID, C_DLT_LOAD_ID, - TColumnSchema, TColumnName, TSimpleRegex, - DLT_NAME_PREFIX, ) from dlt.common.schema.utils import ( column_name_validator, - get_columns_names_with_prop, - get_first_column_name_with_prop, - has_column_with_prop, is_nested_table, ) -from dlt.common.utils import digest128, update_dict_nested +from dlt.common.utils import update_dict_nested from dlt.common.normalizers.json import ( TNormalizedRowIterator, wrap_in_dict, DataItemNormalizer as DataItemNormalizerBase, ) +from dlt.common.normalizers.json import helpers from dlt.common.validation import validate_dict @@ -103,18 +96,18 @@ def _flatten( def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) -> None: for k, v in dict_row.items(): if k.strip(): - norm_k = self._normalize_identifier(self.schema, k) + norm_k = helpers.normalize_identifier(self.schema, self.naming, k) else: # for empty keys in the data use _ norm_k = self.EMPTY_KEY_IDENTIFIER # if norm_k != k: # print(f"{k} -> {norm_k}") nested_name = ( - norm_k if path == () else self._shorten_fragments(self.schema, *path, norm_k) + norm_k if path == () else helpers.shorten_fragments(self.naming, *path, norm_k) ) # for lists and dicts we must check if type is possibly nested if isinstance(v, (dict, list)): - if not self._is_nested_type(self.schema, table, nested_name, __r_lvl): + if not helpers.is_nested_type(self.schema, table, nested_name, __r_lvl): # TODO: if schema contains table {table}__{nested_name} then convert v into single element list if isinstance(v, dict): # flatten the dict more @@ -122,7 +115,8 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] 
= ()) - else: # pass the list to out_rec_list out_rec_list[ - path + (self._normalize_table_identifier(self.schema, k),) + path + + (helpers.normalize_table_identifier(self.schema, self.naming, k),) ] = v continue else: @@ -134,26 +128,6 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) - norm_row_dicts(dict_row, _r_lvl) return out_rec_row, out_rec_list - @staticmethod - def get_row_hash(row: Dict[str, Any], subset: Optional[List[str]] = None) -> str: - """Returns hash of row. - - Hash includes column names and values and is ordered by column name. - Excludes dlt system columns. - Can be used as deterministic row identifier. - """ - row_filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)} - if subset is not None: - row_filtered = {k: v for k, v in row.items() if k in subset} - row_str = json.dumps(row_filtered, sort_keys=True) - return digest128(row_str, DLT_ID_LENGTH_BYTES) - - @staticmethod - def _get_nested_row_hash(parent_row_id: str, nested_table: str, list_idx: int) -> str: - # create deterministic unique id of the nested row taking into account that all lists are ordered - # and all nested tables must be lists - return digest128(f"{parent_row_id}_{nested_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - def _link_row(self, row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id row[self.c_dlt_parent_id] = parent_row_id @@ -175,20 +149,20 @@ def _add_row_id( is_root: bool = False, ) -> str: if is_root: # root table - row_id_type = self._get_root_row_id_type(self.schema, table) + row_id_type = helpers.get_root_row_id_type(self.schema, table) if row_id_type in ("key_hash", "row_hash"): subset = None if row_id_type == "key_hash": - subset = self._get_primary_key(self.schema, table) + subset = helpers.get_primary_key(self.schema, table) # base hash on `dict_row` instead of `flattened_row` # so changes in nested tables lead to new row id - row_id = self.get_row_hash(dict_row, subset=subset) + row_id = helpers.get_row_hash(dict_row, subset=subset) else: row_id = generate_dlt_id() else: # nested table - row_id_type, is_nested = self._get_nested_row_id_type(self.schema, table) + row_id_type, is_nested = helpers.get_nested_row_id_type(self.schema, table) if row_id_type == "row_hash": - row_id = DataItemNormalizer._get_nested_row_hash(parent_row_id, table, pos) + row_id = helpers.get_nested_row_hash(parent_row_id, table, pos) # link to parent table if is_nested: self._link_row(flattened_row, parent_row_id, pos) @@ -227,7 +201,7 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - table = self._shorten_fragments(self.schema, *parent_path, *ident_path) + table = helpers.shorten_fragments(self.naming, *parent_path, *ident_path) for idx, v in enumerate(seq): if isinstance(v, dict): @@ -251,7 +225,7 @@ def _normalize_list( wrap_v = wrap_in_dict(self.c_value, v) DataItemNormalizer._extend_row(extend, wrap_v) self._add_row_id(table, wrap_v, wrap_v, parent_row_id, idx) - yield (table, self._shorten_fragments(self.schema, *parent_path)), wrap_v + yield (table, helpers.shorten_fragments(self.naming, *parent_path)), wrap_v def _normalize_row( self, @@ -264,8 +238,8 @@ def _normalize_row( _r_lvl: int = 0, is_root: bool = False, ) -> TNormalizedRowIterator: - schema = self.schema - table = self._shorten_fragments(schema, *parent_path, *ident_path) + naming = self.naming + table = helpers.shorten_fragments(naming, *parent_path, *ident_path) # flatten current row and 
extract all lists to recur into flattened_row, lists = self._flatten(table, dict_row, _r_lvl) # always extend row @@ -280,7 +254,7 @@ def _normalize_row( # yield parent table first should_descend = yield ( - (table, self._shorten_fragments(schema, *parent_path)), + (table, helpers.shorten_fragments(naming, *parent_path)), flattened_row, ) if should_descend is False: @@ -361,8 +335,10 @@ def normalize_data_item( # identify load id if loaded data must be processed after loading incrementally row[self.c_dlt_load_id] = load_id # get table name and nesting level - root_table_name = self._normalize_table_identifier(self.schema, table_name) - max_nesting = self._get_table_nesting_level(self.schema, root_table_name, self.max_nesting) + root_table_name = helpers.normalize_table_identifier(self.schema, self.naming, table_name) + max_nesting = helpers.get_table_nesting_level( + self.schema, root_table_name, self.max_nesting + ) yield from self._normalize_row( row, @@ -426,103 +402,3 @@ def _normalize_prop( "./normalizers/json/config", validator_f=column_name_validator(schema.naming), ) - - # - # Cached helper methods for all operations that are called often - # - @staticmethod - @lru_cache(maxsize=None) - def _shorten_fragments(schema: Schema, *idents: str) -> str: - return schema.naming.shorten_fragments(*idents) - - @staticmethod - @lru_cache(maxsize=None) - def _normalize_table_identifier(schema: Schema, table_name: str) -> str: - return schema.naming.normalize_table_identifier(table_name) - - @staticmethod - @lru_cache(maxsize=None) - def _normalize_identifier(schema: Schema, identifier: str) -> str: - return schema.naming.normalize_path(identifier) - - @staticmethod - @lru_cache(maxsize=None) - def _get_table_nesting_level( - schema: Schema, table_name: str, default_nesting: int = 1000 - ) -> Optional[int]: - """gets table nesting level, will inherit from parent if not set""" - - table = schema.tables.get(table_name) - if ( - table - and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))) - is not None - ): - return max_nesting - return default_nesting - - @staticmethod - @lru_cache(maxsize=None) - def _get_primary_key(schema: Schema, table_name: str) -> List[str]: - if table_name not in schema.tables: - return [] - table = schema.get_table(table_name) - return get_columns_names_with_prop(table, "primary_key", include_incomplete=True) - - @staticmethod - @lru_cache(maxsize=None) - def _is_nested_type( - schema: Schema, - table_name: str, - field_name: str, - _r_lvl: int, - ) -> bool: - """For those paths the nested objects should be left in place. 
- Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster - """ - - # nesting level is counted backwards - # is we have traversed to or beyond the calculated nesting level, we detect a nested type - if _r_lvl <= 0: - return True - - column: TColumnSchema = None - table = schema.tables.get(table_name) - if table: - column = table["columns"].get(field_name) - if column is None or "data_type" not in column: - data_type = schema.get_preferred_type(field_name) - else: - data_type = column["data_type"] - - return data_type == "json" - - @staticmethod - @lru_cache(maxsize=None) - def _get_nested_row_id_type(schema: Schema, table_name: str) -> Tuple[TRowIdType, bool]: - """Gets type of row id to be added to nested table and if linking information should be added""" - if table := schema.tables.get(table_name): - merge_strategy = resolve_merge_strategy(schema.tables, table) - if merge_strategy not in ("upsert", "scd2") and not is_nested_table(table): - return "random", False - else: - # table will be created, use standard linking - pass - return "row_hash", True - - @staticmethod - @lru_cache(maxsize=None) - def _get_root_row_id_type(schema: Schema, table_name: str) -> TRowIdType: - if table := schema.tables.get(table_name): - merge_strategy = resolve_merge_strategy(schema.tables, table) - if merge_strategy == "upsert": - return "key_hash" - elif merge_strategy == "scd2": - x_row_version_col = get_first_column_name_with_prop( - schema.get_table(table_name), - "x-row-version", - include_incomplete=True, - ) - if x_row_version_col == schema.naming.normalize_identifier(C_DLT_ID): - return "row_hash" - return "random" diff --git a/dlt/common/normalizers/naming/naming.py b/dlt/common/normalizers/naming/naming.py index 5ae5847963..9953d25913 100644 --- a/dlt/common/normalizers/naming/naming.py +++ b/dlt/common/normalizers/naming/naming.py @@ -45,6 +45,8 @@ def make_path(self, *identifiers: str) -> str: def break_path(self, path: str) -> Sequence[str]: """Breaks path into sequence of identifiers""" + # TODO: this is no longer needed if we modify all naming convention to do not contract + # underscores then also normalize_path will not be needed return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] def normalize_path(self, path: str) -> str: diff --git a/dlt/common/normalizers/typing.py b/dlt/common/normalizers/typing.py index 9840f3a4d2..16ad097fde 100644 --- a/dlt/common/normalizers/typing.py +++ b/dlt/common/normalizers/typing.py @@ -18,5 +18,7 @@ class TJSONNormalizer(TypedDict, total=False): class TNormalizersConfig(TypedDict, total=False): names: str allow_identifier_change_on_table_with_data: Optional[bool] + use_break_path_on_normalize: Optional[bool] + """Post 1.4.0 to allow table and column names that contain table separators""" detections: Optional[List[str]] json: TJSONNormalizer diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index cbf38a7327..c612c5a4f1 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -1,7 +1,13 @@ import ast import inspect -import astunparse -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, Callable + +try: + import astunparse + + ast_unparse: Callable[[ast.AST], str] = astunparse.unparse +except ImportError: + ast_unparse = ast.unparse # type: ignore[attr-defined, unused-ignore] from dlt.common.typing import AnyFun @@ -25,7 +31,7 @@ def get_literal_defaults(node: Union[ast.FunctionDef, 
ast.AsyncFunctionDef]) -> literal_defaults: Dict[str, str] = {} for arg, default in zip(reversed(args), reversed(defaults)): if default: - literal_defaults[str(arg.arg)] = astunparse.unparse(default).strip() + literal_defaults[str(arg.arg)] = ast_unparse(default).strip() return literal_defaults @@ -99,7 +105,7 @@ def rewrite_python_script( script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # replace node value - script_lines.append(astunparse.unparse(t_value).strip()) + script_lines.append(ast_unparse(t_value).strip()) last_line = node.end_lineno - 1 last_offset = node.end_col_offset diff --git a/dlt/common/schema/configuration.py b/dlt/common/schema/configuration.py index e64dd57494..72f79026da 100644 --- a/dlt/common/schema/configuration.py +++ b/dlt/common/schema/configuration.py @@ -14,3 +14,5 @@ class SchemaConfiguration(BaseConfiguration): naming: Optional[TNamingConventionReferenceArg] = None # Union[str, NamingConvention] json_normalizer: Optional[DictStrAny] = None allow_identifier_change_on_table_with_data: Optional[bool] = None + use_break_path_on_normalize: Optional[bool] = None + """Post 1.4.0 to allow table and column names that contain table separators""" diff --git a/dlt/common/schema/migrations.py b/dlt/common/schema/migrations.py index d9e758f204..06eb35c0f6 100644 --- a/dlt/common/schema/migrations.py +++ b/dlt/common/schema/migrations.py @@ -29,13 +29,13 @@ def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> schema_dict["excludes"] = [] from_engine = 2 if from_engine == 2 and to_engine > 2: - from dlt.common.schema.normalizers import import_normalizers, explicit_normalizers + from dlt.common.schema.normalizers import import_normalizers, configured_normalizers # current version of the schema current = cast(TStoredSchema, schema_dict) # add default normalizers and root hash propagation # use explicit None to get default settings. ignore any naming conventions - normalizers = explicit_normalizers(naming=None, json_normalizer=None) + normalizers = configured_normalizers(naming=None, json_normalizer=None) current["normalizers"], _, _ = import_normalizers(normalizers, normalizers) current["normalizers"]["json"]["config"] = { "propagation": {"root": {"_dlt_id": "_dlt_root_id"}} } @@ -169,6 +169,9 @@ def migrate_filters(group: str, filters: List[str]) -> None: json_config.pop("generate_dlt_id", None) from_engine = 10 + if from_engine == 10 and to_engine > 10: + schema_dict["normalizers"]["use_break_path_on_normalize"] = False + from_engine = 11 schema_dict["engine_version"] = from_engine if from_engine != to_engine: diff --git a/dlt/common/schema/normalizers.py b/dlt/common/schema/normalizers.py index 9b2a37e708..8f42e90596 100644 --- a/dlt/common/schema/normalizers.py +++ b/dlt/common/schema/normalizers.py @@ -40,13 +40,14 @@ def _section_for_schema(kwargs: Dict[str, Any]) -> Tuple[str, ...]: @with_config(spec=SchemaConfiguration, sections=_section_for_schema) # type: ignore[call-overload] -def explicit_normalizers( +def configured_normalizers( naming: TNamingConventionReferenceArg = dlt.config.value, json_normalizer: TJSONNormalizer = dlt.config.value, allow_identifier_change_on_table_with_data: bool = None, + use_break_path_on_normalize: Optional[bool] = None, schema_name: Optional[str] = None, ) -> TNormalizersConfig: - """Gets explicitly configured normalizers without any defaults or capabilities injection. If `naming` + """Gets explicitly configured normalizers without any defaults or capabilities injection. 
If `naming` is a module or a type it will get converted into string form via import. If `schema_name` is present, a section ("sources", schema_name, "schema") is used to inject the config @@ -57,6 +58,8 @@ def explicit_normalizers( norm_conf["allow_identifier_change_on_table_with_data"] = ( allow_identifier_change_on_table_with_data ) + if use_break_path_on_normalize is not None: + norm_conf["use_break_path_on_normalize"] = use_break_path_on_normalize return norm_conf diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 0dbeda93cf..d6031a08fa 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -57,7 +57,7 @@ SchemaCorruptedException, TableIdentifiersFrozen, ) -from dlt.common.schema.normalizers import import_normalizers, explicit_normalizers +from dlt.common.schema.normalizers import import_normalizers, configured_normalizers from dlt.common.schema.exceptions import DataValidationError from dlt.common.validation import validate_dict @@ -439,7 +439,8 @@ def update_schema(self, schema: "Schema") -> None: """Updates this schema from an incoming schema. Normalizes identifiers after updating normalizers.""" # pass normalizer config self._settings = deepcopy(schema.settings) - self._configure_normalizers(schema._normalizers_config) + # make shallow copy of normalizer settings + self._configure_normalizers(copy(schema._normalizers_config)) self._compile_settings() # update all tables for table in schema.tables.values(): @@ -753,7 +754,7 @@ def update_normalizers(self) -> None: Default hints, preferred data types and normalize configs (ie. column propagation) are normalized as well. Regexes are included as long as textual parts can be extracted from an expression. """ - self._configure_normalizers(explicit_normalizers(schema_name=self._schema_name)) + self._configure_normalizers(configured_normalizers(schema_name=self._schema_name)) self._compile_settings() def will_update_normalizers(self) -> bool: @@ -761,7 +762,7 @@ def will_update_normalizers(self) -> bool: # import desired modules _, to_naming, _ = import_normalizers( - explicit_normalizers(schema_name=self._schema_name), self._normalizers_config + configured_normalizers(schema_name=self._schema_name), self._normalizers_config ) return type(to_naming) is not type(self.naming) # noqa @@ -1106,13 +1107,13 @@ def _verify_identifiers(table: TTableSchema, norm_table: TTableSchema) -> None: else: return self._schema_tables - def _renormalize_schema_identifiers( + def _replace_and_apply_naming( self, normalizers_config: TNormalizersConfig, to_naming: NamingConvention, from_naming: NamingConvention, ) -> None: - """Normalizes all identifiers in the schema in place""" + """Normalizes all identifiers in the schema in place according to `to_naming`""" self._schema_tables = self._verify_update_normalizers( normalizers_config, to_naming, from_naming ) @@ -1140,10 +1141,19 @@ def _renormalize_schema_identifiers( def _configure_normalizers(self, explicit_normalizers: TNormalizersConfig) -> None: """Gets naming and item normalizer from schema yaml, config providers and destination capabilities and applies them to schema.""" + # preserve current schema settings if not explicitly set in `explicit_normalizers` + if explicit_normalizers and self._normalizers_config: + for prop_ in [ + "use_break_path_on_normalize", + "allow_identifier_change_on_table_with_data", + ]: + if prop_ in self._normalizers_config and prop_ not in explicit_normalizers: + explicit_normalizers[prop_] = self._normalizers_config[prop_] # 
type: ignore[literal-required] + normalizers_config, to_naming, item_normalizer_class = import_normalizers( explicit_normalizers, self._normalizers_config ) - self._renormalize_schema_identifiers(normalizers_config, to_naming, self.naming) + self._replace_and_apply_naming(normalizers_config, to_naming, self.naming) # data item normalization function self.data_item_normalizer = item_normalizer_class(self) self.data_item_normalizer.extend_schema() @@ -1174,7 +1184,7 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No self._add_standard_hints() # configure normalizers, including custom config if present if not normalizers: - normalizers = explicit_normalizers(schema_name=self._schema_name) + normalizers = configured_normalizers(schema_name=self._schema_name) self._configure_normalizers(normalizers) # add version tables self._add_standard_tables() diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index c8f5de03ed..6f5d6213c9 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -28,7 +28,7 @@ # current version of schema engine -SCHEMA_ENGINE_VERSION = 10 +SCHEMA_ENGINE_VERSION = 11 # dlt tables VERSION_TABLE_NAME = "_dlt_version" diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 3ff23c9bae..58e1dbd824 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -647,3 +647,8 @@ def is_typeerror_due_to_wrong_call(exc: Exception, func: AnyFun) -> bool: func_name = func.__name__ message = str(exc) return message.__contains__(f"{func_name}()") + + +removeprefix = getattr( + str, "removeprefix", lambda s_, p_: s_[len(p_) :] if s_.startswith(p_) else s_ +) diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py index 411c876c19..27a7f5a7af 100644 --- a/dlt/destinations/dataset.py +++ b/dlt/destinations/dataset.py @@ -3,12 +3,8 @@ from contextlib import contextmanager from dlt import version - from dlt.common.json import json - -from dlt.common.normalizers.naming.naming import NamingConvention from dlt.common.exceptions import MissingDependencyException - from dlt.common.destination import AnyDestination from dlt.common.destination.reference import ( SupportsReadableRelation, @@ -109,7 +105,7 @@ def query(self) -> Any: return self._provided_query table_name = self.sql_client.make_qualified_table_name( - self.schema.naming.normalize_path(self._table_name) + self.schema.naming.normalize_tables_path(self._table_name) ) maybe_limit_clause_1 = "" diff --git a/dlt/destinations/impl/clickhouse/sql_client.py b/dlt/destinations/impl/clickhouse/sql_client.py index 00f35da082..a6c4ee0458 100644 --- a/dlt/destinations/impl/clickhouse/sql_client.py +++ b/dlt/destinations/impl/clickhouse/sql_client.py @@ -28,6 +28,7 @@ from dlt.common import logger from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.typing import DictStrAny +from dlt.common.utils import removeprefix from dlt.destinations.exceptions import ( DatabaseUndefinedRelation, @@ -88,9 +89,8 @@ def has_dataset(self) -> bool: sentinel_table = self.config.dataset_sentinel_table_name all_ds_tables = self._list_tables() if self.dataset_name: - return sentinel_table in [ - t.split(self.config.dataset_table_separator)[1] for t in all_ds_tables - ] + prefix = self.dataset_name + self.config.dataset_table_separator + return sentinel_table in [removeprefix(t, prefix) for t in all_ds_tables] else: # if no dataset specified we look for sentinel table return sentinel_table in all_ds_tables diff --git 
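A quick usage sketch (illustrative, not part of the patch) of the `removeprefix` shim added to `dlt/common/utils.py` above: on Python 3.9+ it resolves to the built-in `str.removeprefix`, on 3.8 to the slicing fallback. The dataset and table names below are made up:

```python
from dlt.common.utils import removeprefix

# strips the prefix only when it is actually present
assert removeprefix("my_dataset___sentinel", "my_dataset___") == "sentinel"
# strings without the prefix are returned unchanged
assert removeprefix("sentinel", "my_dataset___") == "sentinel"
```

This is what the ClickHouse `has_dataset` fix above relies on: stripping the whole `dataset_name + separator` prefix instead of splitting on the separator, which misbehaved for table names that contain the separator themselves.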
a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 0cf63b3ac9..1739c87fb3 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -37,7 +37,7 @@ TPipelineStateDoc, load_package as current_load_package, ) -from dlt.destinations.sql_client import DBApiCursor, WithSqlClient, SqlClientBase +from dlt.destinations.sql_client import WithSqlClient, SqlClientBase from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( FollowupJobRequest, @@ -63,7 +63,6 @@ from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations import path_utils from dlt.destinations.fs_client import FSClientBase -from dlt.destinations.dataset import ReadableDBAPIDataset from dlt.destinations.utils import verify_schema_merge_disposition INIT_FILE_NAME = "init" diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 41d3035a9f..03f8a31462 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -18,6 +18,8 @@ TTableSchemaColumns, TPartialTableSchema, ) +from dlt.common.normalizers.json import helpers as normalize_helpers + from dlt.extract.hints import HintsMeta, TResourceHints from dlt.extract.resource import DltResource from dlt.extract.items import DataItemWithMeta, TableNameMeta @@ -141,7 +143,9 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No self._write_to_dynamic_table(resource, items, meta) def write_empty_items_file(self, table_name: str) -> None: - table_name = self.naming.normalize_table_identifier(table_name) + table_name = normalize_helpers.normalize_table_identifier( + self.schema, self.naming, table_name + ) self.item_storage.write_empty_items_file(self.load_id, self.schema.name, table_name, None) def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[str]: @@ -151,10 +155,12 @@ def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[s table_name = meta.table_name else: table_name = resource.table_name # type: ignore[assignment] - return self.naming.normalize_table_identifier(table_name) + return normalize_helpers.normalize_table_identifier(self.schema, self.naming, table_name) def _get_dynamic_table_name(self, resource: DltResource, item: TDataItem) -> str: - return self.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) + return normalize_helpers.normalize_table_identifier( + self.schema, self.naming, resource._table_name_hint_fun(item) + ) def _write_item( self, @@ -322,7 +328,7 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No ) for tbl in ( ( - # 1. Convert pandas frame(s) to arrow Table + # 1. 
Convert pandas frame(s) to arrow Table, remove indexes because we do not store them by default pandas_to_arrow(item) if (pandas and isinstance(item, pandas.DataFrame)) else item diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index 53a856f7d0..5eccdf5433 100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -20,6 +20,7 @@ ParsedLoadJobFileName, ) from dlt.common.schema import TSchemaUpdate, Schema +from dlt.common.normalizers.json import helpers as normalize_helpers from dlt.normalize.configuration import NormalizeConfiguration from dlt.normalize.exceptions import NormalizeJobFailed @@ -218,8 +219,8 @@ def _gather_metrics_and_close( parsed_file_name = ParsedLoadJobFileName.parse(extracted_items_file) # normalize table name in case the normalization changed # NOTE: this is the best we can do, until a full lineage information is in the schema - root_table_name = schema.naming.normalize_table_identifier( - parsed_file_name.table_name + root_table_name = normalize_helpers.normalize_table_identifier( + schema, schema.naming, parsed_file_name.table_name ) root_tables.add(root_table_name) root_table = stored_schema["tables"].get(root_table_name, {"name": root_table_name}) diff --git a/dlt/reflection/script_visitor.py b/dlt/reflection/script_visitor.py index f4a5569ed0..c49fed20ab 100644 --- a/dlt/reflection/script_visitor.py +++ b/dlt/reflection/script_visitor.py @@ -1,10 +1,9 @@ import inspect import ast -import astunparse from ast import NodeVisitor from typing import Any, Dict, List -from dlt.common.reflection.utils import find_outer_func_def +from dlt.common.reflection.utils import find_outer_func_def, ast_unparse import dlt.reflection.names as n @@ -68,9 +67,9 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: for deco in node.decorator_list: # decorators can be function calls, attributes or names if isinstance(deco, (ast.Name, ast.Attribute)): - alias_name = astunparse.unparse(deco).strip() + alias_name = ast_unparse(deco).strip() elif isinstance(deco, ast.Call): - alias_name = astunparse.unparse(deco.func).strip() + alias_name = ast_unparse(deco.func).strip() else: raise ValueError( self.source_segment(deco), type(deco), "Unknown decorator form" ) @@ -87,7 +86,7 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: def visit_Call(self, node: ast.Call) -> Any: if self._curr_pass == 2: # check if this is a call to any of known functions - alias_name = astunparse.unparse(node.func).strip() + alias_name = ast_unparse(node.func).strip() fn = self.func_aliases.get(alias_name) if not fn: # try a fallback to "run" function that may be called on pipeline or source diff --git a/dlt/sources/sql_database/arrow_helpers.py b/dlt/sources/sql_database/arrow_helpers.py index 1f72205a2a..1de9dffc87 100644 --- a/dlt/sources/sql_database/arrow_helpers.py +++ b/dlt/sources/sql_database/arrow_helpers.py @@ -4,9 +4,6 @@ from dlt.common.configuration import with_config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.libs.pyarrow import ( - row_tuples_to_arrow as _row_tuples_to_arrow, -) @with_config @@ -20,6 +17,8 @@ def row_tuples_to_arrow( is always the case if run within the pipeline. This will generate arrow schema compatible with the destination. 
Otherwise generic capabilities are used """ + from dlt.common.libs.pyarrow import row_tuples_to_arrow as _row_tuples_to_arrow + return _row_tuples_to_arrow( rows, caps or DestinationCapabilitiesContext.generic_capabilities(), columns, tz ) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index aa0a5fe68a..9b243b9429 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -181,7 +181,7 @@ bucket_url = "abfss://@.dfs.core.windows.n You can use `az`, `abfss`, `azure` and `abfs` url schemes. -If you need to use a custom host to account your storage account you can set it up like below: +If you need to use a custom host for your storage account, you can set it up like below: ```toml [destination.filesystem.credentials] # The storage account name is always required diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index 11d4382a22..fa5cf7b128 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -39,6 +39,8 @@ pipeline = dlt.pipeline("orders_pipeline", destination="snowflake") pipeline.run(df, table_name="orders") ``` +Note that Pandas indexes are not saved by default (as of `dlt` version 1.4.1). If for some reason you need the index values in the destination, +use `Table.from_pandas` with `preserve_index` set to `True` to explicitly convert the dataframe into an arrow table. A `pyarrow` table can be loaded in the same way: diff --git a/docs/website/docs/general-usage/naming-convention.md b/docs/website/docs/general-usage/naming-convention.md index f1766d1797..c10ac3e3d0 100644 --- a/docs/website/docs/general-usage/naming-convention.md +++ b/docs/website/docs/general-usage/naming-convention.md @@ -69,6 +69,45 @@ Note that many destinations are exclusively case-insensitive, of which some pres ### Identifier shortening Identifier shortening happens during normalization. `dlt` takes the maximum length of the identifier from the destination capabilities and will trim the identifiers that are too long. The default shortening behavior generates short deterministic hashes of the source identifiers and places them in the middle of the destination identifier. This (with a high probability) avoids shortened identifier collisions. +### Compound (flattened) identifiers +`dlt` combines several identifiers in order to name nested tables and flattened columns. For example: +```json +{ + "column": + { + "value": 1 + } +} +``` +generates the flattened column name `column__value`, where `__` is the path separator (in **snake case**). Each component in the combined identifier is normalized +separately, and the identifier is shortened as a whole. + +:::note +A combined identifier is also a valid single identifier. In +`dlt` versions above 1.4.0, normalization is fully idempotent, and a normalized +`column__value` will still be `column__value`. +::: + +:::caution +Previously, double underscores were contracted into a single underscore. That +prevented using data loaded by `dlt` as a data source without identifier modifications. 
`dlt` maintains backward compatibility for versions >1.4.0 as follows: + +* All schemas stored locally or at the destination will be migrated to the backward compatible mode by setting the flag `use_break_path_on_normalize`, i.e.: +```yaml +normalizers: + names: dlt.common.normalizers.names.snake_case + use_break_path_on_normalize: false + json: + module: dlt.common.normalizers.json.relational +``` +* Backward compatible behavior may be explicitly enabled by setting +`SCHEMA__USE_BREAK_PATH_ON_NORMALIZE` to `FALSE` or via `config.toml`: +```toml +[schema] +use_break_path_on_normalize=false +``` +::: + ### 🚧 [WIP] Name convention changes are lossy `dlt` does not store the source identifiers in the schema so when the naming convention changes (or we increase the maximum identifier length), it is not able to generate a fully correct set of new identifiers. Instead, it will re-normalize already normalized identifiers. We are currently working to store the full identifier lineage - source identifiers will be stored and mapped to the destination in the schema. diff --git a/mypy.ini b/mypy.ini index eee4db6126..769e84b13a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -134,4 +134,4 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-time_machine.*] -ignore_missing_imports = True \ No newline at end of file +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index 9ae26bd04c..732ba0e219 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" version = "4.2.1" description = "Easily measure timing and throughput of code blocks, with beautiful human friendly representations." optional = false python-versions = ">=3.7" files = [ {file = "about_time-4.2.1-py3-none-any.whl", hash = "sha256:8bbf4c75fe13cbd3d72f49a03b02c5c7dca32169b6d49117c257e7eb3eaee341"}, {file = "about-time-4.2.1.tar.gz", hash = "sha256:6a538862d33ce67d997429d14998310e1dbfda6cb7d9bbfbf799c4709847fece"}, ] [[package]] name = "adlfs" -version = "2024.4.1" +version = "2024.7.0" description = "Access Azure Datalake Gen1 with fsspec and dask" optional = true python-versions = ">=3.8" files = [ - {file = "adlfs-2024.4.1-py3-none-any.whl", hash = "sha256:acea94612ddacaa34ea8c6babcc95b8da6982f930cdade7a86fbd17382403e16"}, - {file = "adlfs-2024.4.1.tar.gz", hash = "sha256:75530a45447f358ae53c5c39c298b8d966dae684be84db899f63b94cd96fc000"}, + {file = "adlfs-2024.7.0-py3-none-any.whl", hash = "sha256:2005c8e124fda3948f2a6abb2dbebb2c936d2d821acaca6afd61932edfa9bc07"}, + {file = "adlfs-2024.7.0.tar.gz", hash = "sha256:106995b91f0eb5e775bcd5957d180d9a14faef3271a063b1f65c66fd5ab05ddf"}, ] [package.dependencies] @@ -3900,106 +3900,6 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = 
"sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, - {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, - {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, - {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, - {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, - {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, - {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, - {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, - {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, - {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, - {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, - {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, - {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, - {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, - {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, - {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, - {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, - {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, - {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, - {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, - {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = 
"sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, - {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, - {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, - {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, - {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, - {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, - {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, - {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, - {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, - {file = 
"google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, - {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, - {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, - {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, - {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, - {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, - {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, - {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, - {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, - {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, - {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, - {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = 
"sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, - {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -10618,4 +10518,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "24e262ce6bb496fad6e587c76bb9ad60a2cc45a00f52e368b59978093e57b77c" +content-hash = "c0607d05ab37a1a6addf3ae7264bf5972cb6ce6e46df1dcdc2da3cff72e5008e" diff --git a/pyproject.toml b/pyproject.toml index 638653ffcf..8afb332422 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "1.4.1a0" +version = "1.4.1a1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] @@ -40,7 +40,7 @@ click = ">=7.1" requirements-parser = ">=0.5.0" setuptools = ">=65.6.0" humanize = ">=4.4.0" -astunparse = ">=1.6.3" +astunparse = { "version" = ">=1.6.3", "python" = "<3.9"} gitpython = ">=3.1.29" pytz = ">=2022.6" giturlparse = ">=0.10.0" @@ -89,7 +89,6 @@ alembic = {version = ">1.10.0", optional = true} paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } -aiohttp = { version = ">=3.9", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v11.yml b/tests/common/cases/schemas/eth/ethereum_schema_v11.yml new file mode 100644 index 0000000000..fd6717c614 --- /dev/null +++ b/tests/common/cases/schemas/eth/ethereum_schema_v11.yml @@ -0,0 +1,394 @@ +version: 18 +version_hash: XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI= +engine_version: 11 +name: ethereum +tables: + _dlt_loads: + columns: + load_id: + nullable: false + data_type: text + schema_name: + nullable: true + data_type: text + status: + nullable: false + data_type: bigint + inserted_at: + nullable: false + data_type: timestamp + schema_version_hash: + nullable: true + data_type: text + write_disposition: skip + description: Created by DLT. Tracks completed loads + schema_contract: {} + resource: _dlt_loads + _dlt_version: + columns: + version: + nullable: false + data_type: bigint + engine_version: + nullable: false + data_type: bigint + inserted_at: + nullable: false + data_type: timestamp + schema_name: + nullable: false + data_type: text + version_hash: + nullable: false + data_type: text + schema: + nullable: false + data_type: text + write_disposition: skip + description: Created by DLT. 
Tracks schema updates + schema_contract: {} + resource: _dlt_version + blocks: + description: Ethereum blocks + x-annotation: this will be preserved on save + write_disposition: append + filters: + includes: [] + excludes: [] + columns: + _dlt_load_id: + nullable: false + description: load id coming from the extractor + data_type: text + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + number: + nullable: false + primary_key: true + data_type: bigint + parent_hash: + nullable: true + data_type: text + hash: + nullable: false + cluster: true + unique: true + data_type: text + base_fee_per_gas: + nullable: false + data_type: wei + difficulty: + nullable: false + data_type: wei + extra_data: + nullable: true + data_type: text + gas_limit: + nullable: false + data_type: bigint + gas_used: + nullable: false + data_type: bigint + logs_bloom: + nullable: true + data_type: binary + miner: + nullable: true + data_type: text + mix_hash: + nullable: true + data_type: text + nonce: + nullable: true + data_type: text + receipts_root: + nullable: true + data_type: text + sha3_uncles: + nullable: true + data_type: text + size: + nullable: true + data_type: bigint + state_root: + nullable: false + data_type: text + timestamp: + nullable: false + unique: true + sort: true + data_type: timestamp + total_difficulty: + nullable: true + data_type: wei + transactions_root: + nullable: false + data_type: text + schema_contract: {} + resource: blocks + x-normalizer: + seen-data: true + blocks__transactions: + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + block_number: + nullable: false + primary_key: true + data_type: bigint + merge_key: true + transaction_index: + nullable: false + primary_key: true + data_type: bigint + hash: + nullable: false + unique: true + data_type: text + block_hash: + nullable: false + cluster: true + data_type: text + block_timestamp: + nullable: false + sort: true + data_type: timestamp + chain_id: + nullable: true + data_type: text + from: + nullable: true + data_type: text + gas: + nullable: true + data_type: bigint + gas_price: + nullable: true + data_type: bigint + input: + nullable: true + data_type: text + max_fee_per_gas: + nullable: true + data_type: wei + max_priority_fee_per_gas: + nullable: true + data_type: wei + nonce: + nullable: true + data_type: bigint + r: + nullable: true + data_type: text + s: + nullable: true + data_type: text + status: + nullable: true + data_type: bigint + to: + nullable: true + data_type: text + type: + nullable: true + data_type: text + v: + nullable: true + data_type: bigint + value: + nullable: false + data_type: wei + eth_value: + nullable: true + data_type: decimal + x-normalizer: + seen-data: true + write_disposition: append + resource: blocks__transactions + blocks__transactions__logs: + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + address: + nullable: false + data_type: text + block_timestamp: + nullable: false + sort: true + data_type: timestamp + block_hash: + nullable: false + cluster: true + data_type: text + block_number: + nullable: false + primary_key: true + merge_key: true + data_type: bigint + transaction_index: + nullable: false + primary_key: true + merge_key: true + data_type: bigint + log_index: + nullable: false + primary_key: true + data_type: bigint + data: + nullable: true + data_type: text + removed: + nullable: true + data_type: bool + transaction_hash: + nullable: false + data_type: text + x-normalizer: 
+ seen-data: true + write_disposition: append + resource: blocks__transactions__logs + blocks__transactions__logs__topics: + parent: blocks__transactions__logs + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__transactions__access_list: + parent: blocks__transactions + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + address: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__transactions__access_list__storage_keys: + parent: blocks__transactions__access_list + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__uncles: + parent: blocks + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true +settings: + default_hints: + not_null: + - re:^_dlt_id$ + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + unique: + - _dlt_id + cluster: + - block_hash + partition: + - block_timestamp + root_key: + - _dlt_root_id + row_key: + - _dlt_id + parent_key: + - _dlt_parent_id + preferred_types: + timestamp: timestamp + block_timestamp: timestamp + schema_contract: {} +normalizers: + names: dlt.common.normalizers.names.snake_case + json: + module: dlt.common.normalizers.json.relational + config: + propagation: + root: + _dlt_id: _dlt_root_id + tables: + blocks: + timestamp: block_timestamp + hash: block_hash +previous_hashes: +- oHfYGTI2GHOxuzwVz6+yvMilXUvHYhxrxkanC2T6MAI= +- C5An8WClbavalXDdNSqXbdI7Swqh/mTWMcwWKCF//EE= +- yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE= + diff --git a/tests/common/cases/schemas/github/issues.schema.json b/tests/common/cases/schemas/github/issues.schema.json index 4c4f5425ae..5a1b0c6f84 100644 --- a/tests/common/cases/schemas/github/issues.schema.json +++ b/tests/common/cases/schemas/github/issues.schema.json @@ -1,1322 +1,1100 @@ { - "version": 2, - "version_hash": "IeCTkq8epwbjSy1O3jdkPPUkTPCt4hLj6RYo8uZ02JI=", - "engine_version": 5, - "name": "event", - "tables": { - "_dlt_version": { - "name": "_dlt_version", - "columns": { - "version": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "version", - "data_type": "bigint", - "nullable": false - }, - "engine_version": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "engine_version", - "data_type": "bigint", - "nullable": false - }, - "inserted_at": { - 
"partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "inserted_at", - "data_type": "timestamp", - "nullable": false - }, - "schema_name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema_name", - "data_type": "text", - "nullable": false - }, - "version_hash": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "version_hash", - "data_type": "text", - "nullable": false - }, - "schema": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema", - "data_type": "text", - "nullable": false - } + "version": 3, + "version_hash": "o6olKmaCAQVWDWR4eT4aZ1V/RiH+003516xq7Zrva+Q=", + "engine_version": 11, + "name": "event", + "tables": { + "_dlt_version": { + "columns": { + "version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "engine_version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": false + }, + "schema_name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "version_hash": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "schema": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + } + }, + "write_disposition": "skip", + "description": "Created by DLT. Tracks schema updates", + "schema_contract": {}, + "resource": "_dlt_version" }, - "write_disposition": "skip", - "description": "Created by DLT. 
Tracks schema updates" - }, - "_dlt_loads": { - "name": "_dlt_loads", - "columns": { - "load_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "load_id", - "data_type": "text", - "nullable": false - }, - "schema_name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema_name", - "data_type": "text", - "nullable": true - }, - "status": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "status", - "data_type": "bigint", - "nullable": false - }, - "inserted_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "inserted_at", - "data_type": "timestamp", - "nullable": false - } + "_dlt_loads": { + "columns": { + "load_id": { + "data_type": "text", + "nullable": false + }, + "schema_name": { + "data_type": "text", + "nullable": true + }, + "status": { + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "data_type": "timestamp", + "nullable": false + }, + "schema_version_hash": { + "data_type": "text", + "nullable": true + } + }, + "write_disposition": "skip", + "resource": "_dlt_loads", + "description": "Created by DLT. Tracks completed loads", + "schema_contract": {} }, - "write_disposition": "skip", - "description": "Created by DLT. Tracks completed loads" - }, - "issues": { - "name": "issues", - "columns": { - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "repository_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "repository_url", - "data_type": "text", - "nullable": true - }, - "labels_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "labels_url", - "data_type": "text", - "nullable": true - }, - "comments_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "comments_url", - "data_type": "text", - "nullable": true - }, - "events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "events_url", - "data_type": "text", - "nullable": true - }, - "html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "html_url", - "data_type": "text", - "nullable": true - }, - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "node_id", - "data_type": "text", - "nullable": true - }, - "number": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "number", - "data_type": "bigint", - "nullable": true - }, - "title": { - "partition": false, - 
"cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "title", - "data_type": "text", - "nullable": true - }, - "user__login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__login", - "data_type": "text", - "nullable": true - }, - "user__id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__id", - "data_type": "bigint", - "nullable": true - }, - "user__node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__node_id", - "data_type": "text", - "nullable": true - }, - "user__avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__avatar_url", - "data_type": "text", - "nullable": true - }, - "user__gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__gravatar_id", - "data_type": "text", - "nullable": true - }, - "user__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__url", - "data_type": "text", - "nullable": true - }, - "user__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__html_url", - "data_type": "text", - "nullable": true - }, - "user__followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__followers_url", - "data_type": "text", - "nullable": true - }, - "user__following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__following_url", - "data_type": "text", - "nullable": true - }, - "user__gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__gists_url", - "data_type": "text", - "nullable": true - }, - "user__starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__starred_url", - "data_type": "text", - "nullable": true - }, - "user__subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__subscriptions_url", - "data_type": "text", - "nullable": true - }, - "user__organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__organizations_url", - "data_type": "text", - "nullable": true - }, - "user__repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__repos_url", - "data_type": "text", - "nullable": true - }, - "user__events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__events_url", - "data_type": "text", - "nullable": 
true - }, - "user__received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__received_events_url", - "data_type": "text", - "nullable": true - }, - "user__type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__type", - "data_type": "text", - "nullable": true - }, - "user__site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__site_admin", - "data_type": "bool", - "nullable": true - }, - "state": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "state", - "data_type": "text", - "nullable": true - }, - "locked": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "locked", - "data_type": "bool", - "nullable": true - }, - "assignee__login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__login", - "data_type": "text", - "nullable": true - }, - "assignee__id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__id", - "data_type": "bigint", - "nullable": true - }, - "assignee__node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__node_id", - "data_type": "text", - "nullable": true - }, - "assignee__avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__avatar_url", - "data_type": "text", - "nullable": true - }, - "assignee__gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__gravatar_id", - "data_type": "text", - "nullable": true - }, - "assignee__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__url", - "data_type": "text", - "nullable": true - }, - "assignee__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__html_url", - "data_type": "text", - "nullable": true - }, - "assignee__followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__followers_url", - "data_type": "text", - "nullable": true - }, - "assignee__following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__following_url", - "data_type": "text", - "nullable": true - }, - "assignee__gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__gists_url", - "data_type": "text", - "nullable": true - }, - "assignee__starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, 
- "foreign_key": false, - "name": "assignee__starred_url", - "data_type": "text", - "nullable": true - }, - "assignee__subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__subscriptions_url", - "data_type": "text", - "nullable": true - }, - "assignee__organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__organizations_url", - "data_type": "text", - "nullable": true - }, - "assignee__repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__repos_url", - "data_type": "text", - "nullable": true - }, - "assignee__events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__events_url", - "data_type": "text", - "nullable": true - }, - "assignee__received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__received_events_url", - "data_type": "text", - "nullable": true - }, - "assignee__type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__type", - "data_type": "text", - "nullable": true - }, - "assignee__site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__site_admin", - "data_type": "bool", - "nullable": true - }, - "comments": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "comments", - "data_type": "bigint", - "nullable": true - }, - "created_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "created_at", - "data_type": "timestamp", - "nullable": true - }, - "updated_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "updated_at", - "data_type": "timestamp", - "nullable": true - }, - "closed_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "closed_at", - "data_type": "timestamp", - "nullable": true - }, - "author_association": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "author_association", - "data_type": "text", - "nullable": true - }, - "body": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "body", - "data_type": "text", - "nullable": true - }, - "reactions__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__url", - "data_type": "text", - "nullable": true - }, - "reactions__total_count": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__total_count", - "data_type": "bigint", - "nullable": true - }, - 
"reactions___1": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions___1", - "data_type": "bigint", - "nullable": true - }, - "reactions__laugh": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__laugh", - "data_type": "bigint", - "nullable": true - }, - "reactions__hooray": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__hooray", - "data_type": "bigint", - "nullable": true - }, - "reactions__confused": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__confused", - "data_type": "bigint", - "nullable": true - }, - "reactions__heart": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__heart", - "data_type": "bigint", - "nullable": true - }, - "reactions__rocket": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__rocket", - "data_type": "bigint", - "nullable": true - }, - "reactions__eyes": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__eyes", - "data_type": "bigint", - "nullable": true - }, - "timeline_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "timeline_url", - "data_type": "text", - "nullable": true - }, - "state_reason": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "state_reason", - "data_type": "text", - "nullable": true - }, - "_dlt_load_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_load_id", - "data_type": "text", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false - }, - "draft": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "draft", - "data_type": "bool", - "nullable": true - }, - "pull_request__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__url", - "data_type": "text", - "nullable": true - }, - "pull_request__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__html_url", - "data_type": "text", - "nullable": true - }, - "pull_request__diff_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__diff_url", - "data_type": "text", - "nullable": true - }, - "pull_request__patch_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": 
"pull_request__patch_url", - "data_type": "text", - "nullable": true - }, - "pull_request__merged_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__merged_at", - "data_type": "timestamp", - "nullable": true - } + "issues": { + "columns": { + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "repository_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "labels_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "comments_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "number": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "title": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "user__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + 
"user__gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "state": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "locked": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "assignee__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "assignee__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__gists_url": { + "partition": false, + "cluster": false, + "unique": 
false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "comments": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "created_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "updated_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "closed_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "author_association": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "body": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "reactions__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "reactions__total_count": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions___1": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__laugh": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__hooray": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__confused": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + 
"data_type": "bigint", + "nullable": true + }, + "reactions__heart": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__rocket": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__eyes": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "timeline_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "state_reason": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "_dlt_load_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + }, + "draft": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "pull_request__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__diff_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__patch_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__merged_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + } + }, + "write_disposition": "append", + "schema_contract": {}, + "x-normalizer": { + "seen-data": true + }, + "resource": "issues" }, - "write_disposition": "append" - }, - "issues__labels": { - "name": "issues__labels", - "columns": { - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "node_id", - "data_type": "text", - "nullable": true - }, - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "name", - "data_type": "text", - "nullable": true - }, - "color": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "color", - "data_type": "text", - 
"nullable": true - }, - "default": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "default", - "data_type": "bool", - "nullable": true - }, - "description": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "description", - "data_type": "text", - "nullable": true - }, - "_dlt_parent_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": true, - "name": "_dlt_parent_id", - "data_type": "text", - "nullable": false - }, - "_dlt_list_idx": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_list_idx", - "data_type": "bigint", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false + "issues__labels": { + "columns": { + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "color": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "default": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "description": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "parent_key": true + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + } + }, + "parent": "issues", + "x-normalizer": { + "seen-data": true } }, - "parent": "issues" - }, - "issues__assignees": { - "name": "issues__assignees", - "columns": { - "login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "login", - "data_type": "text", - "nullable": true - }, - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": 
"node_id", - "data_type": "text", - "nullable": true - }, - "avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "avatar_url", - "data_type": "text", - "nullable": true - }, - "gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "gravatar_id", - "data_type": "text", - "nullable": true - }, - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "html_url", - "data_type": "text", - "nullable": true - }, - "followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "followers_url", - "data_type": "text", - "nullable": true - }, - "following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "following_url", - "data_type": "text", - "nullable": true - }, - "gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "gists_url", - "data_type": "text", - "nullable": true - }, - "starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "starred_url", - "data_type": "text", - "nullable": true - }, - "subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "subscriptions_url", - "data_type": "text", - "nullable": true - }, - "organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "organizations_url", - "data_type": "text", - "nullable": true - }, - "repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "repos_url", - "data_type": "text", - "nullable": true - }, - "events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "events_url", - "data_type": "text", - "nullable": true - }, - "received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "received_events_url", - "data_type": "text", - "nullable": true - }, - "type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "type", - "data_type": "text", - "nullable": true - }, - "site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "site_admin", - "data_type": "bool", - "nullable": true - }, - "_dlt_parent_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": true, - "name": "_dlt_parent_id", - "data_type": "text", - "nullable": false - }, - "_dlt_list_idx": { 
- "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_list_idx", - "data_type": "bigint", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false + "issues__assignees": { + "columns": { + "login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + 
"primary_key": false, + "data_type": "text", + "nullable": false, + "parent_key": true + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + } + }, + "parent": "issues", + "x-normalizer": { + "seen-data": true } - }, - "parent": "issues" - } - }, - "settings": { - "detections": [ - "timestamp", - "iso_timestamp", - "iso_date" - ], - "default_hints": { - "not_null": [ - "_dlt_id", - "_dlt_root_id", - "_dlt_parent_id", - "_dlt_list_idx", - "_dlt_load_id" - ], - "foreign_key": [ - "_dlt_parent_id" + } + }, + "settings": { + "detections": [ + "timestamp", + "iso_timestamp", + "iso_date" ], - "unique": [ - "_dlt_id" - ] - } - }, - "normalizers": { - "names": "dlt.common.normalizers.names.snake_case", - "json": { - "module": "dlt.common.normalizers.json.relational" - } + "default_hints": { + "not_null": [ + "_dlt_id", + "_dlt_root_id", + "_dlt_parent_id", + "_dlt_list_idx", + "_dlt_load_id" + ], + "unique": [ + "_dlt_id" + ], + "row_key": [ + "_dlt_id" + ], + "parent_key": [ + "_dlt_parent_id" + ] + }, + "schema_contract": {} + }, + "normalizers": { + "names": "dlt.common.normalizers.names.snake_case", + "json": { + "module": "dlt.common.normalizers.json.relational" + } + }, + "previous_hashes": [ + "IeCTkq8epwbjSy1O3jdkPPUkTPCt4hLj6RYo8uZ02JI=" + ] } -} diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 1553cea04f..35bc80add2 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -6,14 +6,12 @@ from dlt.common.utils import digest128, uniq_id from dlt.common.schema import Schema from dlt.common.schema.utils import new_table - +from dlt.common.normalizers.utils import DLT_ID_LENGTH_BYTES from dlt.common.normalizers.json.relational import ( RelationalNormalizerConfigPropagation, DataItemNormalizer as RelationalNormalizer, - DLT_ID_LENGTH_BYTES, ) - -# _flatten, _get_child_row_hash, _normalize_row, normalize_data_item, +from dlt.common.normalizers.json import helpers as normalize_helpers from tests.utils import create_schema_with_name @@ -420,7 +418,7 @@ def test_list_in_list() -> None: schema.update_table(path_table) assert "zen__webpath" in schema.tables # clear cache with json paths - schema.data_item_normalizer._is_nested_type.cache_clear() # type: ignore[attr-defined] + normalize_helpers.is_nested_type.cache_clear() rows = list(schema.normalize_data_item(chats, "1762162.1212", "zen")) # both lists are json types now @@ -890,7 +888,7 @@ def test_caching_perf(norm: RelationalNormalizer) -> None: table["x-normalizer"] = {} start = time() for _ in range(100000): - norm._is_nested_type(norm.schema, "test", "field", 0) + normalize_helpers.is_nested_type(norm.schema, "test", "field", 0) # norm._get_table_nesting_level(norm.schema, "test") print(f"{time() - start}") diff --git a/tests/common/normalizers/test_naming_snake_case.py b/tests/common/normalizers/test_naming_snake_case.py index ee4f43e7f0..e03de65696 100644 --- a/tests/common/normalizers/test_naming_snake_case.py +++ b/tests/common/normalizers/test_naming_snake_case.py @@ -50,6 +50,14 @@ def test_normalize_path(naming_unlimited: NamingConvention) -> None: assert naming_unlimited.normalize_path("Small Love Potion") 
== "small_love_potion" assert naming_unlimited.normalize_path("Small Love Potion") == "small_love_potion" + # paths with non normalized underscores + # NOTE: empty idents created during break path are removed so underscores are contracted + assert ( + naming_unlimited.normalize_path("Small___Love____Potion_____x") + == "small___love__potion___x" + ) + assert naming_unlimited.normalize_path("small___love__potion___x") == "small___love__potion___x" + def test_normalize_non_alpha_single_underscore() -> None: assert SnakeCaseNamingConvention.RE_NON_ALPHANUMERIC.sub("_", "-=!*") == "_" diff --git a/tests/common/schema/test_import_normalizers.py b/tests/common/schema/test_import_normalizers.py index a1e3d775f0..d444259946 100644 --- a/tests/common/schema/test_import_normalizers.py +++ b/tests/common/schema/test_import_normalizers.py @@ -16,7 +16,7 @@ ) from dlt.common.schema.normalizers import ( DEFAULT_NAMING_NAMESPACE, - explicit_normalizers, + configured_normalizers, import_normalizers, naming_from_reference, serialize_reference, @@ -26,25 +26,25 @@ def test_explicit_normalizers() -> None: - config = explicit_normalizers() + config = configured_normalizers() assert config["names"] is None assert config["json"] is None # pass explicit - config = explicit_normalizers("direct", {"module": "custom"}) + config = configured_normalizers("direct", {"module": "custom"}) assert config["names"] == "direct" assert config["json"] == {"module": "custom"} # pass modules and types, make sure normalizer config is serialized - config = explicit_normalizers(direct) + config = configured_normalizers(direct) assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention" - config = explicit_normalizers(direct.NamingConvention) + config = configured_normalizers(direct.NamingConvention) assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention" # use environ os.environ["SCHEMA__NAMING"] = "direct" os.environ["SCHEMA__JSON_NORMALIZER"] = '{"module": "custom"}' - config = explicit_normalizers() + config = configured_normalizers() assert config["names"] == "direct" assert config["json"] == {"module": "custom"} @@ -54,7 +54,7 @@ def test_explicit_normalizers_caps_ignored() -> None: destination_caps = DestinationCapabilitiesContext.generic_capabilities() destination_caps.naming_convention = "direct" with Container().injectable_context(destination_caps): - config = explicit_normalizers() + config = configured_normalizers() assert config["names"] is None @@ -121,7 +121,7 @@ def test_naming_from_reference() -> None: def test_import_normalizers() -> None: - config, naming, json_normalizer = import_normalizers(explicit_normalizers()) + config, naming, json_normalizer = import_normalizers(configured_normalizers()) assert isinstance(naming, snake_case.NamingConvention) # no maximum length: we do not know the destination capabilities assert naming.max_length is None @@ -133,7 +133,7 @@ def test_import_normalizers() -> None: os.environ["SCHEMA__JSON_NORMALIZER"] = ( '{"module": "tests.common.normalizers.custom_normalizers"}' ) - config, naming, json_normalizer = import_normalizers(explicit_normalizers()) + config, naming, json_normalizer = import_normalizers(configured_normalizers()) assert config["names"] == "direct" assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"} assert isinstance(naming, direct.NamingConvention) @@ -142,7 +142,7 @@ def test_import_normalizers() -> None: def test_import_normalizers_with_defaults() -> None: - explicit = 
explicit_normalizers() + explicit = configured_normalizers() default_: TNormalizersConfig = { "names": "dlt.destinations.impl.weaviate.naming", "json": {"module": "tests.common.normalizers.custom_normalizers"}, @@ -170,7 +170,7 @@ def test_config_sections(sections: str) -> None: os.environ[f"{sections}SCHEMA__JSON_NORMALIZER"] = ( '{"module": "tests.common.normalizers.custom_normalizers"}' ) - config, _, _ = import_normalizers(explicit_normalizers(schema_name="test_schema")) + config, _, _ = import_normalizers(configured_normalizers(schema_name="test_schema")) assert config["names"] == "direct" assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"} @@ -181,11 +181,11 @@ def test_import_normalizers_with_caps() -> None: destination_caps.naming_convention = "direct" destination_caps.max_identifier_length = 127 with Container().injectable_context(destination_caps): - _, naming, _ = import_normalizers(explicit_normalizers()) + _, naming, _ = import_normalizers(configured_normalizers()) assert isinstance(naming, direct.NamingConvention) assert naming.max_length == 127 - _, naming, _ = import_normalizers(explicit_normalizers(snake_case)) + _, naming, _ = import_normalizers(configured_normalizers(snake_case)) assert isinstance(naming, snake_case.NamingConvention) assert naming.max_length == 127 @@ -196,23 +196,23 @@ def test_import_normalizers_with_caps() -> None: } destination_caps.max_table_nesting = 0 with Container().injectable_context(destination_caps): - config, _, relational = import_normalizers(explicit_normalizers()) + config, _, relational = import_normalizers(configured_normalizers()) assert config["json"]["config"]["max_nesting"] == 0 assert relational is RelationalNormalizer # wrong normalizer - config, _, relational = import_normalizers(explicit_normalizers(), default_) + config, _, relational = import_normalizers(configured_normalizers(), default_) assert "config" not in config["json"] def test_import_invalid_naming_module() -> None: with pytest.raises(UnknownNamingModule) as py_ex: - import_normalizers(explicit_normalizers("unknown")) + import_normalizers(configured_normalizers("unknown")) assert py_ex.value.naming_module == "unknown" with pytest.raises(UnknownNamingModule) as py_ex: - import_normalizers(explicit_normalizers("dlt.common.tests")) + import_normalizers(configured_normalizers("dlt.common.tests")) assert py_ex.value.naming_module == "dlt.common.tests" with pytest.raises(InvalidNamingType) as py_ex2: - import_normalizers(explicit_normalizers("dlt.pipeline.helpers")) + import_normalizers(configured_normalizers("dlt.pipeline.helpers")) assert py_ex2.value.naming_module == "dlt.pipeline" assert py_ex2.value.naming_class == "helpers" diff --git a/tests/common/schema/test_normalize_identifiers.py b/tests/common/schema/test_normalize_identifiers.py index f84d857e26..a1cb181525 100644 --- a/tests/common/schema/test_normalize_identifiers.py +++ b/tests/common/schema/test_normalize_identifiers.py @@ -271,12 +271,7 @@ def test_normalize_table_identifiers_table_reference() -> None: def test_update_normalizers() -> None: - schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") - schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] - # drop seen data - del schema.tables["issues"]["x-normalizer"] - del schema.tables["issues__labels"]["x-normalizer"] - del schema.tables["issues__assignees"]["x-normalizer"] + schema = make_issues_schema_for_normalizers_update() # save default hints in original form default_hints = 
schema._settings["default_hints"] @@ -307,8 +302,8 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.configuration.container import Container - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") - orig_schema = Schema.from_dict(eth_V9) + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") + orig_schema = Schema.from_dict(eth_V11) # save schema schema_storage_no_import.save_schema(orig_schema) @@ -317,7 +312,7 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non ) as caps: assert caps.naming_convention is sql_upper # creating a schema from dict keeps original normalizers - schema = Schema.from_dict(eth_V9) + schema = Schema.from_dict(eth_V11) assert_schema_identifiers_case(schema, str.lower) assert schema._normalizers_config["names"].endswith("snake_case") @@ -350,7 +345,7 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non ) norm_schema = Schema.from_dict( - deepcopy(eth_V9), remove_processing_hints=True, bump_version=False + deepcopy(eth_V11), remove_processing_hints=True, bump_version=False ) norm_schema.update_normalizers() assert_schema_identifiers_case(norm_schema, str.upper) @@ -452,3 +447,50 @@ def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: assert schema.naming.break_path("A__B__!C") == ["A", "B", "!C"] row = list(schema.normalize_data_item({"bool": True}, "load_id", "a_table")) assert row[0] == (("a_table", None), {"bool": True}) + + +def test_update_schema_normalizer_props() -> None: + schema = make_issues_schema_for_normalizers_update() + schema_2 = make_issues_schema_for_normalizers_update() + # remove issues table + del schema_2._schema_tables["issues"] + schema_2.update_schema(schema) + + os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper" + # apply normalizers + schema_2.update_normalizers() + + # preserve schema_2 str + schema_2_str = schema_2.to_pretty_json() + + # make sure that normalizer props in original schema are preserved + schema._normalizers_config["allow_identifier_change_on_table_with_data"] = True + schema._normalizers_config["use_break_path_on_normalize"] = True + + # set some fake naming convention. 
during schema update it should not be used + os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper_X" + schema.update_schema(schema_2) + assert isinstance(schema.naming, sql_upper.NamingConvention) + assert_schema_identifiers_case(schema, str.upper) + # make sure norm setting still in schema + assert schema._normalizers_config["allow_identifier_change_on_table_with_data"] is True + assert schema._normalizers_config["use_break_path_on_normalize"] is True + # schema 2 not modified during the update + assert schema_2_str == schema_2.to_pretty_json() + + # make sure that explicit settings are passed + schema_2._normalizers_config["allow_identifier_change_on_table_with_data"] = False + schema_2._normalizers_config["use_break_path_on_normalize"] = False + schema.update_schema(schema_2) + assert schema._normalizers_config["allow_identifier_change_on_table_with_data"] is False + assert schema._normalizers_config["use_break_path_on_normalize"] is False + + +def make_issues_schema_for_normalizers_update() -> Schema: + schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema") + schema = Schema.from_dict(schema_dict) # type: ignore[arg-type] + # drop seen data + del schema.tables["issues"]["x-normalizer"] + del schema.tables["issues__labels"]["x-normalizer"] + del schema.tables["issues__assignees"]["x-normalizer"] + return schema diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 7124ca5c80..5cdd42e448 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -570,8 +570,8 @@ def test_update_preferred_types(schema: Schema) -> None: def test_default_table_resource() -> None: """Parent tables without `resource` set default to table name""" - eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") - tables = Schema.from_dict(eth_v5).tables + eth_v11 = load_yml_case("schemas/eth/ethereum_schema_v11") + tables = Schema.from_dict(eth_v11).tables assert tables["blocks"]["resource"] == "blocks" assert all([t.get("resource") is None for t in tables.values() if t.get("parent")]) @@ -737,7 +737,7 @@ def assert_new_schema_props_custom_normalizers(schema: Schema) -> None: def assert_is_new_schema(schema: Schema) -> None: assert schema.stored_version is None assert schema.stored_version_hash is None - assert schema.ENGINE_VERSION == 10 + assert schema.ENGINE_VERSION == 11 assert schema._stored_previous_hashes == [] assert schema.is_modified assert schema.is_new @@ -845,9 +845,9 @@ def test_group_tables_by_resource(schema: Schema) -> None: def test_remove_processing_hints() -> None: - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") # here tables contain processing hints - schema = Schema.from_dict(eth_V9) + schema = Schema.from_dict(eth_V11) assert "x-normalizer" in schema.tables["blocks"] # clone with hints removal, note that clone does not bump version @@ -867,16 +867,10 @@ def test_remove_processing_hints() -> None: assert "x-normalizer" not in to_json # load without hints - no_hints = schema.from_dict(eth_V9, remove_processing_hints=True, bump_version=False) + no_hints = schema.from_dict(eth_V11, remove_processing_hints=True, bump_version=False) assert no_hints.stored_version_hash == cloned.stored_version_hash # now load without hints but with version bump cloned._bump_version() - no_hints = schema.from_dict(eth_V9, remove_processing_hints=True) + no_hints = schema.from_dict(eth_V11, remove_processing_hints=True) assert 
no_hints.stored_version_hash == cloned.stored_version_hash - - -# def test_get_new_table_columns() -> None: -# pytest.fail(reason="must implement!") -# pass -# get_new_table_columns() diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 39f1ad3211..1577b51115 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -86,10 +86,10 @@ def test_infer_column_bumps_version() -> None: def test_preserve_version_on_load() -> None: - eth_v10: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v10") - version = eth_v10["version"] - version_hash = eth_v10["version_hash"] - schema = Schema.from_dict(eth_v10) # type: ignore[arg-type] + eth_v11: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v11") + version = eth_v11["version"] + version_hash = eth_v11["version_hash"] + schema = Schema.from_dict(eth_v11) # type: ignore[arg-type] # version should not be bumped assert version_hash == schema._stored_version_hash assert version_hash == schema.version_hash @@ -98,8 +98,8 @@ def test_preserve_version_on_load() -> None: @pytest.mark.parametrize("remove_defaults", [True, False]) def test_version_preserve_on_reload(remove_defaults: bool) -> None: - eth_v8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8") - schema = Schema.from_dict(eth_v8) # type: ignore[arg-type] + eth_v11: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v11") + schema = Schema.from_dict(eth_v11) # type: ignore[arg-type] to_save_dict = schema.to_dict(remove_defaults=remove_defaults) assert schema.stored_version == to_save_dict["version"] diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 0dcf2930de..2818ea9622 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -3,7 +3,7 @@ import yaml from dlt.common import json -from dlt.common.schema.normalizers import explicit_normalizers +from dlt.common.schema.normalizers import configured_normalizers from dlt.common.schema.schema import Schema from dlt.common.storages.exceptions import ( InStorageSchemaModified, @@ -304,7 +304,7 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No def test_save_store_schema(storage: SchemaStorage) -> None: - d_n = explicit_normalizers() + d_n = configured_normalizers() d_n["names"] = "tests.common.normalizers.custom_normalizers" schema = Schema("column_event", normalizers=d_n) assert schema.is_new @@ -357,16 +357,16 @@ def test_save_initial_import_schema(ie_storage: LiveSchemaStorage) -> None: ie_storage.load_schema("ethereum") # save initial import schema where processing hints are removed - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") - schema = Schema.from_dict(eth_V9) + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") + schema = Schema.from_dict(eth_V11) ie_storage.save_import_schema_if_not_exists(schema) # should be available now eth = ie_storage.load_schema("ethereum") assert "x-normalizer" not in eth.tables["blocks"] # won't overwrite initial schema - del eth_V9["tables"]["blocks__uncles"] - schema = Schema.from_dict(eth_V9) + del eth_V11["tables"]["blocks__uncles"] + schema = Schema.from_dict(eth_V11) ie_storage.save_import_schema_if_not_exists(schema) # should be available now eth = ie_storage.load_schema("ethereum") diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py index a1334ba1da..5366d8b06f 100644 --- 
a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -218,9 +218,9 @@ def assert_package_info( def prepare_eth_import_folder(storage: SchemaStorage) -> Schema: - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") # remove processing hints before installing as import schema # ethereum schema is a "dirty" schema with processing hints - eth = Schema.from_dict(eth_V9, remove_processing_hints=True) + eth = Schema.from_dict(eth_V11, remove_processing_hints=True) storage._export_schema(eth, storage.config.import_schema_path) return eth diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index e3098a1a77..9eeded1229 100644 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -26,6 +26,7 @@ get_exception_trace, get_exception_trace_chain, update_dict_nested, + removeprefix, ) @@ -440,3 +441,11 @@ def _function_test(a, *, b=None): except Exception as exc: assert str(exc) == "wrong type" assert is_typeerror_due_to_wrong_call(exc, function_typeerror_exc) is False + + +def test_removeprefix() -> None: + assert removeprefix("a_data", "a_") == "data" + assert removeprefix("a_data", "a_data") == "" + assert removeprefix("a_data", "a_data_1") == "a_data" + assert removeprefix("", "a_data_1") == "" + assert removeprefix("a_data", "") == "a_data" diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index f3ebb02b46..6899d8d5fe 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -111,7 +111,7 @@ def test_doc() -> TTestRecord: def test_validate_schema_cases() -> None: with open( - "tests/common/cases/schemas/eth/ethereum_schema_v10.yml", mode="r", encoding="utf-8" + "tests/common/cases/schemas/eth/ethereum_schema_v11.yml", mode="r", encoding="utf-8" ) as f: schema_dict: TStoredSchema = yaml.safe_load(f) diff --git a/tests/common/utils.py b/tests/common/utils.py index 9b5e6bccce..a0760ffe86 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -19,11 +19,11 @@ def IMPORTED_VERSION_HASH_ETH_V10() -> str: # for import schema tests, change when upgrading the schema version - eth_V10 = load_yml_case("schemas/eth/ethereum_schema_v10") - assert eth_V10["version_hash"] == "veEmgbCPXCIiqyfabeQWwz6UIQ2liETv7LLMpyktCos=" + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") + assert eth_V11["version_hash"] == "XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI=" # remove processing hints before installing as import schema # ethereum schema is a "dirty" schema with processing hints - eth = Schema.from_dict(eth_V10, remove_processing_hints=True) + eth = Schema.from_dict(eth_V11, remove_processing_hints=True) return eth.stored_version_hash diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index d224088f8b..e20260bfe7 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -1,6 +1,6 @@ version: 18 -version_hash: veEmgbCPXCIiqyfabeQWwz6UIQ2liETv7LLMpyktCos= -engine_version: 10 +version_hash: XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI= +engine_version: 11 name: ethereum tables: _dlt_loads: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 5dc4304a63..a14b4a9602 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -112,9 +112,9 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == 
s.name # the schema in the associated file has this hash - eth_v9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_v11 = load_yml_case("schemas/eth/ethereum_schema_v11") # source removes processing hints so we do - reference_schema = Schema.from_dict(eth_v9, remove_processing_hints=True) + reference_schema = Schema.from_dict(eth_v11, remove_processing_hints=True) assert schema.stored_version_hash == reference_schema.stored_version_hash diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 30df12ae17..725872b621 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -219,8 +219,74 @@ def some_data(created_at=dlt.sources.incremental("created_at")): assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] +def test_pandas_index_as_dedup_key() -> None: + from dlt.common.libs.pandas import pandas_to_arrow, pandas as pd + + some_data, p = _make_dedup_pipeline("pandas") + + # no index + no_index_r = some_data.with_name(new_name="no_index") + p.run(no_index_r) + p.run(no_index_r) + data_ = p._dataset().no_index.arrow() + assert data_.schema.names == ["created_at", "id"] + assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] + + # unnamed index: explicitly converted + unnamed_index_r = some_data.with_name(new_name="unnamed_index").add_map( + lambda df: pandas_to_arrow(df, preserve_index=True) + ) + # use it (as named in the arrow table) to deduplicate + unnamed_index_r.incremental.primary_key = "__index_level_0__" + p.run(unnamed_index_r) + p.run(unnamed_index_r) + data_ = p._dataset().unnamed_index.arrow() + assert data_.schema.names == ["created_at", "id", "index_level_0"] + # indexes 2 and 3 are removed from the second batch because they were in the previous batch + # and the created_at overlapped, so they got deduplicated + assert data_["index_level_0"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] + + def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: + df_.index = pd.RangeIndex(start=0, stop=len(df_), step=1, name="order_id") + return df_ + + # named index explicitly converted + named_index_r = some_data.with_name(new_name="named_index").add_map( + lambda df: pandas_to_arrow(_make_named_index(df), preserve_index=True) + ) + # use it (as named in the arrow table) to deduplicate + named_index_r.incremental.primary_key = "order_id" + p.run(named_index_r) + p.run(named_index_r) + data_ = p._dataset().named_index.arrow() + assert data_.schema.names == ["created_at", "id", "order_id"] + assert data_["order_id"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] + + # named index without explicit conversion: the default conversion drops the index + named_index_impl_r = some_data.with_name(new_name="named_index_impl").add_map( + lambda df: _make_named_index(df) + ) + p.run(named_index_impl_r) + p.run(named_index_impl_r) + data_ = p._dataset().named_index_impl.arrow() + assert data_.schema.names == ["created_at", "id"] + assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] + + @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_unique_rows_by_hash_are_deduplicated(item_type: TestDataItemFormat) -> None: + some_data, p = _make_dedup_pipeline(item_type) + p.run(some_data()) + p.run(some_data()) + + with p.sql_client() as c: + with c.execute_query("SELECT created_at, id FROM some_data ORDER BY created_at, id") as cur: + rows = cur.fetchall() + print(rows) + assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] + + +def _make_dedup_pipeline(item_type: TestDataItemFormat): data1 = [
{"created_at": 1, "id": "a"}, {"created_at": 2, "id": "b"}, @@ -235,7 +301,6 @@ def test_unique_rows_by_hash_are_deduplicated(item_type: TestDataItemFormat) -> {"created_at": 3, "id": "f"}, {"created_at": 4, "id": "g"}, ] - source_items1 = data_to_item_format(item_type, data1) source_items2 = data_to_item_format(item_type, data2) @@ -250,14 +315,7 @@ def some_data(created_at=dlt.sources.incremental("created_at")): pipeline_name=uniq_id(), destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")), ) - p.run(some_data()) - p.run(some_data()) - - with p.sql_client() as c: - with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: - rows = cur.fetchall() - - assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] + return some_data, p def test_nested_cursor_path() -> None: diff --git a/tests/libs/pyarrow/test_pyarrow_normalizer.py b/tests/libs/pyarrow/test_pyarrow_normalizer.py index 32ee5fdafc..c81d8cd974 100644 --- a/tests/libs/pyarrow/test_pyarrow_normalizer.py +++ b/tests/libs/pyarrow/test_pyarrow_normalizer.py @@ -5,12 +5,12 @@ from dlt.common.libs.pyarrow import normalize_py_arrow_item, NameNormalizationCollision from dlt.common.schema.utils import new_column, TColumnSchema -from dlt.common.schema.normalizers import explicit_normalizers, import_normalizers +from dlt.common.schema.normalizers import configured_normalizers, import_normalizers from dlt.common.destination import DestinationCapabilitiesContext def _normalize(table: pa.Table, columns: List[TColumnSchema]) -> pa.Table: - _, naming, _ = import_normalizers(explicit_normalizers()) + _, naming, _ = import_normalizers(configured_normalizers()) caps = DestinationCapabilitiesContext() columns_schema = {c["name"]: c for c in columns} return normalize_py_arrow_item(table, columns_schema, naming, caps) diff --git a/tests/load/clickhouse/test_clickhouse_configuration.py b/tests/load/clickhouse/test_clickhouse_configuration.py index ad33062f11..eabc3094bd 100644 --- a/tests/load/clickhouse/test_clickhouse_configuration.py +++ b/tests/load/clickhouse/test_clickhouse_configuration.py @@ -56,7 +56,8 @@ def test_clickhouse_configuration() -> None: def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: """Test experimental settings are set correctly for the session.""" - conn = client.sql_client.open_connection() + # with client.sql_client.open_connection() as conn: + conn = client.sql_client.native_connection cursor1 = conn.cursor() cursor2 = conn.cursor() @@ -69,3 +70,26 @@ def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: assert ("allow_experimental_lightweight_delete", "1") in res assert ("enable_http_compression", "1") in res assert ("date_time_input_format", "best_effort") in res + + +def test_client_has_dataset(client: ClickHouseClient) -> None: + # with client.sql_client as sql_client: + assert client.sql_client.has_dataset() + separator = client.config.dataset_table_separator + + def _assert_has_dataset() -> None: + assert not client.sql_client.has_dataset() + client.sql_client.create_dataset() + assert client.sql_client.has_dataset() + client.sql_client.drop_dataset() + assert not client.sql_client.has_dataset() + + try: + # change separator + client.config.dataset_table_separator = "_" + _assert_has_dataset() + + client.config.dataset_table_separator = "" + _assert_has_dataset() + finally: + client.config.dataset_table_separator = separator diff --git a/tests/load/conftest.py b/tests/load/conftest.py index 
76a7248e5b..c52fea607d 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -9,7 +9,7 @@ drop_pipeline, empty_schema, ) -from tests.utils import preserve_environ, patch_home_dir +from tests.utils import preserve_environ, patch_home_dir, autouse_test_storage @pytest.fixture(scope="function", params=DEFAULT_BUCKETS) diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index a9479a0bb9..49475ce43f 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -19,7 +19,7 @@ from dlt.pipeline.exceptions import PipelineStepFailed from tests.pipeline.utils import assert_table -from tests.utils import patch_home_dir, autouse_test_storage, TEST_STORAGE_ROOT +from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/filesystem/test_aws_credentials.py b/tests/load/filesystem/test_aws_credentials.py index b782e76b7e..1113b9b35d 100644 --- a/tests/load/filesystem/test_aws_credentials.py +++ b/tests/load/filesystem/test_aws_credentials.py @@ -9,7 +9,6 @@ from tests.common.configuration.utils import environment from tests.load.utils import ALL_FILESYSTEM_DRIVERS -from tests.utils import autouse_test_storage # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index d0a29d03d0..afcd9105a8 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -28,7 +28,6 @@ from tests.common.configuration.utils import environment from tests.common.storages.utils import TEST_SAMPLE_FILES, assert_sample_files from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET, WITH_GDRIVE_BUCKETS -from tests.utils import autouse_test_storage from tests.load.filesystem.utils import self_signed_cert diff --git a/tests/load/pipeline/conftest.py b/tests/load/pipeline/conftest.py index a2ba65494b..80c418ed22 100644 --- a/tests/load/pipeline/conftest.py +++ b/tests/load/pipeline/conftest.py @@ -1,2 +1,2 @@ -from tests.utils import autouse_test_storage, duckdb_pipeline_location +from tests.utils import duckdb_pipeline_location from tests.pipeline.utils import drop_dataset_from_env diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 2925bfac6f..8b6fc751d9 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -80,7 +80,7 @@ def test_merge_on_keys_in_schema( skip_if_not_supported(merge_strategy, p.destination) - with open("tests/common/cases/schemas/eth/ethereum_schema_v9.yml", "r", encoding="utf-8") as f: + with open("tests/common/cases/schemas/eth/ethereum_schema_v11.yml", "r", encoding="utf-8") as f: schema = dlt.Schema.from_dict(yaml.safe_load(f)) # make block uncles unseen to trigger filtering loader in loader for nested tables diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index 2a5b9ed296..962c501619 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -11,7 +11,7 @@ from dlt.common.pipeline import LoadInfo from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import DEFAULT_VALIDITY_COLUMN_NAMES -from dlt.common.normalizers.json.relational import DataItemNormalizer +from dlt.common.normalizers.json.helpers import 
get_row_hash from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention from dlt.common.time import ensure_pendulum_datetime, reduce_pendulum_datetime_precision from dlt.extract.resource import DltResource @@ -30,7 +30,6 @@ from tests.utils import TPythonTableFormat -get_row_hash = DataItemNormalizer.get_row_hash FROM, TO = DEFAULT_VALIDITY_COLUMN_NAMES diff --git a/tests/load/qdrant/utils.py b/tests/load/qdrant/utils.py index e96e06be87..8a3b37dd48 100644 --- a/tests/load/qdrant/utils.py +++ b/tests/load/qdrant/utils.py @@ -61,6 +61,5 @@ def has_collections(client): if has_collections(client): client.drop_storage() - p._wipe_working_folder() # deactivate context Container()[PipelineContext].deactivate() diff --git a/tests/load/redshift/test_redshift_client.py b/tests/load/redshift/test_redshift_client.py index b60c6a8956..ef0acb33a4 100644 --- a/tests/load/redshift/test_redshift_client.py +++ b/tests/load/redshift/test_redshift_client.py @@ -21,7 +21,7 @@ from dlt.destinations.impl.redshift.redshift import RedshiftClient, psycopg2 from tests.common.utils import COMMON_TEST_CASES_PATH -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage, skipifpypy +from tests.utils import TEST_STORAGE_ROOT, skipifpypy from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage # mark all tests as essential, do not remove diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 9f64722a1e..6f699436b3 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -36,7 +36,7 @@ from dlt.common.time import ensure_pendulum_datetime from tests.cases import table_update_and_row, assert_all_data_types_row -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage +from tests.utils import TEST_STORAGE_ROOT from tests.common.utils import load_json_case from tests.load.utils import ( TABLE_UPDATE, diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index f5a8d51baf..1a9c8a383b 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -10,6 +10,7 @@ from typing import List from functools import reduce +from dlt.common.storages.file_storage import FileStorage from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, @@ -18,7 +19,7 @@ MEMORY_BUCKET, ) from dlt.destinations import filesystem -from tests.utils import TEST_STORAGE_ROOT +from tests.utils import TEST_STORAGE_ROOT, clean_test_storage from dlt.common.destination.reference import TDestinationReferenceArg from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException from tests.load.utils import drop_pipeline_data @@ -48,8 +49,14 @@ def _expected_chunk_count(p: Pipeline) -> List[int]: return [_chunk_size(p), _total_records(p) - _chunk_size(p)] +# this also disables autouse_test_storage on function level which destroys some tests here @pytest.fixture(scope="session") -def populated_pipeline(request) -> Any: +def autouse_test_storage() -> FileStorage: + return clean_test_storage() + + +@pytest.fixture(scope="session") +def populated_pipeline(request, autouse_test_storage) -> Any: """fixture that returns a pipeline object populated with the example data""" destination_config = cast(DestinationTestConfiguration, request.param) diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 05c10a900f..ee48222da9 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ 
-22,7 +22,7 @@ from dlt.destinations.typing import TNativeConn from dlt.common.time import ensure_pendulum_datetime, to_py_datetime -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage +from tests.utils import TEST_STORAGE_ROOT from tests.load.utils import ( yield_client_with_storage, prepare_table, diff --git a/tests/load/weaviate/utils.py b/tests/load/weaviate/utils.py index b391c2fa38..b98b55fcfa 100644 --- a/tests/load/weaviate/utils.py +++ b/tests/load/weaviate/utils.py @@ -95,6 +95,5 @@ def schema_has_classes(client): if schema_has_classes(client): client.drop_storage() - p._wipe_working_folder() # deactivate context Container()[PipelineContext].deactivate() diff --git a/tests/pipeline/cases/github_pipeline/github_rev.py b/tests/pipeline/cases/github_pipeline/github_rev.py new file mode 100644 index 0000000000..4ebe3048f4 --- /dev/null +++ b/tests/pipeline/cases/github_pipeline/github_rev.py @@ -0,0 +1,26 @@ +import dlt + + +@dlt.source +def github(): + @dlt.resource( + table_name="issues__2", + primary_key="id", + ) + def load_issues(): + # yield data with path separators in identifiers + yield [ + { + "id": 100, + "issue__id": 10, + } + ] + + return load_issues + + +if __name__ == "__main__": + p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3") + github_source = github() + info = p.run(github_source) + print(info) diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index a3d8b489c9..fbd4d412b3 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -484,3 +484,59 @@ def test_scd2_pipeline_update(test_storage: FileStorage) -> None: assert len(issues_retired) == 1 assert issues_retired[0][0] == 6272 # print(pipeline.default_schema.to_pretty_yaml()) + + +def test_normalize_path_separator_legacy_behavior(test_storage: FileStorage) -> None: + """Pre 1.4.1, dlt normalized identifiers containing path separators into a single underscore; + this behavior must be preserved when the schema is updated.
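+ The schema migration records this by setting the use_break_path_on_normalize flag to False (asserted below).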
+ """ + shutil.copytree("tests/pipeline/cases/github_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True) + + # execute in test storage + with set_working_dir(TEST_STORAGE_ROOT): + # store dlt data in test storage (like patch_home_dir) + with custom_environ({DLT_DATA_DIR: dlt.current.run().data_dir}): + # save database outside of pipeline dir + with custom_environ( + {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} + ): + venv_dir = tempfile.mkdtemp() + # create virtual env with (0.3.0) before the current schema upgrade + with Venv.create(venv_dir, ["dlt[duckdb]==0.3.0"]) as venv: + venv._install_deps(venv.context, ["duckdb" + "==" + pkg_version("duckdb")]) + try: + print( + venv.run_script("../tests/pipeline/cases/github_pipeline/github_rev.py") + ) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + + venv = Venv.restore_current() + # load same data again + try: + print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_rev.py")) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + pipeline = dlt.attach(GITHUB_PIPELINE_NAME) + print(pipeline.default_schema.to_pretty_yaml()) + # migration set the backward compat flag + assert ( + pipeline.default_schema._normalizers_config["use_break_path_on_normalize"] + is False + ) + # make sure that schema didn't change + assert pipeline.default_schema.data_table_names() == ["issues_2"] + table_ = pipeline.default_schema.tables["issues_2"] + assert set(table_["columns"].keys()) == { + "id", + "issue_id", + "_dlt_id", + "_dlt_load_id", + } + # datasets must be the same + data_ = pipeline._dataset().issues_2.select("issue_id", "id").fetchall() + print(data_) diff --git a/tests/normalize/test_max_nesting.py b/tests/pipeline/test_max_nesting.py similarity index 100% rename from tests/normalize/test_max_nesting.py rename to tests/pipeline/test_max_nesting.py diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 3832bad81a..e58db64e5e 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1711,6 +1711,111 @@ def nested_resource(): assert pipeline.last_trace.last_normalize_info.row_counts["flattened_dict__values"] == 4 +def test_column_name_with_break_path() -> None: + """Tests how normalization behaves for names with break path ie __ + all the names must be idempotent + """ + pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking") + info = pipeline.run( + [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path" + ) + assert_load_info(info) + # table name was preserved + table = pipeline.default_schema.get_table("custom__path") + assert pipeline.default_schema.data_table_names() == ["custom__path"] + # column name was preserved + assert table["columns"]["example_custom_field__c"]["data_type"] == "text" + assert set(table["columns"]) == {"example_custom_field__c", "reg_c", "_dlt_id", "_dlt_load_id"} + + # get data + assert_data_table_counts(pipeline, {"custom__path": 1}) + # get data via dataset with dbapi + data_ = pipeline._dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() + assert data_ == [("custom", "c")] + + +def test_column_name_with_break_path_legacy() -> None: + """Tests how normalization behaves for names with break path ie __ + in legacy mode table and column names were normalized as single identifier + """ + 
os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False" + pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking") + info = pipeline.run( + [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path" + ) + assert_load_info(info) + # table name was contracted + table = pipeline.default_schema.get_table("custom_path") + assert pipeline.default_schema.data_table_names() == ["custom_path"] + # column name was contracted + assert table["columns"]["example_custom_field_c"]["data_type"] == "text" + assert set(table["columns"]) == {"example_custom_field_c", "reg_c", "_dlt_id", "_dlt_load_id"} + + # get data + assert_data_table_counts(pipeline, {"custom_path": 1}) + # get data via dataset with dbapi + data_ = pipeline._dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() + assert data_ == [("custom", "c")] + + +def test_column_hint_with_break_path() -> None: + """From v1.4.1 on, the name normalizer is idempotent on break paths""" + now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10")) + + @dlt.resource( + name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}] + ) + def flattened_dict(): + for delta in range(4): + yield { + "delta": delta, + "value": {"timestamp": now.timestamp() + delta}, + } + + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.run(flattened_dict()) + assert_load_info(info) + + assert pipeline.default_schema.data_table_names() == ["flattened__dict"] + table = pipeline.default_schema.get_table("flattened__dict") + assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} + assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" + + # make sure data is there + data_ = pipeline._dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() + assert data_ == [(0, now)] + + +def test_column_hint_with_break_path_legacy() -> None: + """Same as above but in legacy (pre 1.4.1) mode: the table name is contracted while the flattened column hint still applies""" + + os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False" + now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10")) + + @dlt.resource( + name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}] + ) + def flattened_dict(): + for delta in range(4): + yield { + "delta": delta, + "value": {"timestamp": now.timestamp() + delta}, + } + + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.run(flattened_dict()) + assert_load_info(info) + # table name contracted + assert pipeline.default_schema.data_table_names() == ["flattened_dict"] + table = pipeline.default_schema.get_table("flattened_dict") + # hint applied + assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} + assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" + # make sure data is there + data_ = pipeline._dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() + assert data_ == [(0, now)] + + def test_empty_rows_are_included() -> None: """Empty rows where all values are `None` or empty dicts create rows in the dataset with `NULL` in all columns From b4d807fc059591720f1ea14e73340e9a98041225 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 2 Dec 2024 16:26:01 +0100 Subject: [PATCH 2/2] bumps to version 1.4.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8afb332422..7377b03fde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@
[tool.poetry] name = "dlt" -version = "1.4.1a1" +version = "1.4.1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ]