From 09914a381331c4a89903acc167c7ab847aebf3fd Mon Sep 17 00:00:00 2001 From: Marcel Coetzee <34739235+Pipboyguy@users.noreply.github.com> Date: Sat, 30 Nov 2024 21:26:41 +0200 Subject: [PATCH 1/4] Support Spatial Types for PostGIS (#1927) * Add dependencies Signed-off-by: Marcel Coetzee * Add shapely dependency Signed-off-by: Marcel Coetzee * Move sample geodata to correct folder Signed-off-by: Marcel Coetzee * Make smaller Signed-off-by: Marcel Coetzee * Enhance PostgresTableBuilder test suite with geometry type handling. Signed-off-by: Marcel Coetzee * Add tests Signed-off-by: Marcel Coetzee * Add geometry columns with default SRID 4326. Signed-off-by: Marcel Coetzee * resource can't serialize shapely objects Signed-off-by: Marcel Coetzee * Expand geom test Signed-off-by: Marcel Coetzee * Comments Signed-off-by: Marcel Coetzee * Update lock file Signed-off-by: Marcel Coetzee * schema Signed-off-by: Marcel Coetzee * [fix](database): remove unused hex validation method Signed-off-by: Marcel Coetzee * Create custom insert job for geom types Signed-off-by: Marcel Coetzee * Remove hanging client parameter Signed-off-by: Marcel Coetzee * Add a TODO comment to address the issue in the splitting logic Signed-off-by: Marcel Coetzee * Remove unnecessary init override Signed-off-by: Marcel Coetzee * Add debugging points Signed-off-by: Marcel Coetzee * [test](database): add tests for geometry parsing in Postgres Signed-off-by: Marcel Coetzee * Correct row parsing in Postgres destination Signed-off-by: Marcel Coetzee * Yield from supermethod Signed-off-by: Marcel Coetzee * Add control flow for geom Signed-off-by: Marcel Coetzee * Add test * refactor geo parsing Signed-off-by: Marcel Coetzee * [fix](test): correct schema name in PostGIS geometry test Signed-off-by: Marcel Coetzee * Remove stale test Signed-off-by: Marcel Coetzee * Remove geopandas test until resolution Signed-off-by: Marcel Coetzee * Add docs and raise on malformed values Signed-off-by: Marcel Coetzee * Add postgis dependency to ci Signed-off-by: Marcel Coetzee * fix postgis image repo Signed-off-by: Marcel Coetzee * Add postgis to dbt runner Signed-off-by: Marcel Coetzee * Change snippet to py instead of python Signed-off-by: Marcel Coetzee * add postgis Signed-off-by: Marcel Coetzee * Remove unused geodata file * Remove unnecessary INSERT class Signed-off-by: Marcel Coetzee * Add WKB format handling Signed-off-by: Marcel Coetzee * Packaging Signed-off-by: Marcel Coetzee * Move import to local Signed-off-by: Marcel Coetzee * Comment Signed-off-by: Marcel Coetzee * postgis docs Signed-off-by: Marcel Coetzee * Update lockfile Signed-off-by: Marcel Coetzee * fix(deps): remove shapely dependency from postgis extra Signed-off-by: Marcel Coetzee * format Signed-off-by: Marcel Coetzee * feat(postgres): add support for CSV loading of geometry columns Signed-off-by: Marcel Coetzee * Remove wkb examples in docs Signed-off-by: Marcel Coetzee * format Signed-off-by: Marcel Coetzee --------- Signed-off-by: Marcel Coetzee --- .github/workflows/test_dbt_runner.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 4 +- .github/workflows/test_local_sources.yml | 4 +- dlt/common/libs/pyarrow.py | 9 +- dlt/destinations/impl/postgres/factory.py | 22 +- dlt/destinations/impl/postgres/postgres.py | 54 ++-- .../impl/postgres/postgres_adapter.py | 63 +++++ .../dlt-ecosystem/destinations/postgres.md | 52 +++- poetry.lock | 100 ++++++-- pyproject.toml | 2 + .../postgres/test_postgres_table_builder.py | 239 
+++++++++++++++++- tests/load/postgres/utils.py | 68 +++++ 13 files changed, 547 insertions(+), 74 deletions(-) create mode 100644 dlt/destinations/impl/postgres/postgres_adapter.py create mode 100644 tests/load/postgres/utils.py diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index 13810fbc0d..ad29909d9a 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -60,7 +60,7 @@ jobs: - name: Install dependencies # install dlt with postgres support - run: poetry install --no-interaction -E postgres --with sentry-sdk,dbt + run: poetry install --no-interaction -E postgres -E postgis --with sentry-sdk,dbt - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index df398e13ad..933248d994 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -78,7 +78,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 61bfe1551a..4947a46a3b 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -48,7 +48,7 @@ jobs: # Label used to access the service container postgres: # Docker Hub image - image: postgres + image: postgis/postgis # Provide the password for postgres env: POSTGRES_DB: dlt_data @@ -95,7 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/.github/workflows/test_local_sources.yml b/.github/workflows/test_local_sources.yml index 8a3ba2a670..39689f5c85 100644 --- a/.github/workflows/test_local_sources.yml +++ b/.github/workflows/test_local_sources.yml @@ -43,7 +43,7 @@ jobs: # Label used to access the service container postgres: # Docker Hub image - image: postgres + image: postgis/postgis # Provide the password for postgres env: POSTGRES_DB: dlt_data @@ -83,7 +83,7 @@ jobs: # TODO: which deps should we enable? 
- name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E sql_database --with sentry-sdk,pipeline,sources + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E sql_database --with sentry-sdk,pipeline,sources # run sources tests in load against configured destinations - run: poetry run pytest tests/load/sources diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 37268c0d2f..029cd75399 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -628,7 +628,14 @@ def row_tuples_to_arrow( " extracting an SQL VIEW that selects with cast." ) json_str_array = pa.array( - [None if s is None else json.dumps(s) if not issubclass(type(s), set) else json.dumps(list(s)) for s in columnar_known_types[field.name]] + [ + ( + None + if s is None + else json.dumps(s) if not issubclass(type(s), set) else json.dumps(list(s)) + ) + for s in columnar_known_types[field.name] + ] ) columnar_known_types[field.name] = json_str_array diff --git a/dlt/destinations/impl/postgres/factory.py b/dlt/destinations/impl/postgres/factory.py index bde0e35f3d..e0dc2836eb 100644 --- a/dlt/destinations/impl/postgres/factory.py +++ b/dlt/destinations/impl/postgres/factory.py @@ -1,19 +1,19 @@ import typing as t +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.data_writers.configuration import CsvFormatConfiguration -from dlt.common.destination import Destination, DestinationCapabilitiesContext from dlt.common.data_writers.escape import escape_postgres_identifier, escape_postgres_literal -from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.destination import Destination, DestinationCapabilitiesContext from dlt.common.destination.typing import PreparedTableSchema from dlt.common.exceptions import TerminalValueError from dlt.common.schema.typing import TColumnSchema, TColumnType from dlt.common.wei import EVM_DECIMAL_PRECISION - -from dlt.destinations.type_mapping import TypeMapperImpl from dlt.destinations.impl.postgres.configuration import ( PostgresCredentials, PostgresClientConfiguration, ) +from dlt.destinations.impl.postgres.postgres_adapter import GEOMETRY_HINT, SRID_HINT +from dlt.destinations.type_mapping import TypeMapperImpl if t.TYPE_CHECKING: from dlt.destinations.impl.postgres.postgres import PostgresClient @@ -55,6 +55,7 @@ class PostgresTypeMapper(TypeMapperImpl): "character varying": "text", "smallint": "bigint", "integer": "bigint", + "geometry": "text", } def to_db_integer_type(self, column: TColumnSchema, table: PreparedTableSchema = None) -> str: @@ -108,11 +109,18 @@ def to_db_datetime_type( def from_destination_type( self, db_type: str, precision: t.Optional[int] = None, scale: t.Optional[int] = None ) -> TColumnType: - if db_type == "numeric": - if (precision, scale) == self.capabilities.wei_precision: - return dict(data_type="wei") + if db_type == "numeric" and (precision, scale) == self.capabilities.wei_precision: + return dict(data_type="wei") + if db_type.startswith("geometry"): + return dict(data_type="text") return super().from_destination_type(db_type, precision, scale) + def to_destination_type(self, column: TColumnSchema, table: PreparedTableSchema) -> str: + if column.get(GEOMETRY_HINT): + srid = column.get(SRID_HINT, 4326) + return f"geometry(Geometry, {srid})" + return super().to_destination_type(column, table) + class 
postgres(Destination[PostgresClientConfiguration, "PostgresClient"]): spec = PostgresClientConfiguration diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 682f70da04..2459ee1dbe 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -2,9 +2,9 @@ from dlt.common import logger from dlt.common.data_writers.configuration import CsvFormatConfiguration +from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.exceptions import ( DestinationInvalidFileFormat, - DestinationTerminalException, ) from dlt.common.destination.reference import ( HasFollowupJobs, @@ -12,20 +12,16 @@ RunnableLoadJob, FollowupJobRequest, LoadJob, - TLoadJobState, ) -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.exceptions import TerminalValueError from dlt.common.schema import TColumnSchema, TColumnHint, Schema -from dlt.common.schema.typing import TColumnType, TTableFormat +from dlt.common.schema.typing import TColumnType from dlt.common.schema.utils import is_nullable_column from dlt.common.storages.file_storage import FileStorage - -from dlt.destinations.sql_jobs import SqlStagingCopyFollowupJob, SqlJobParams -from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration +from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient +from dlt.destinations.insert_job_client import InsertValuesJobClient from dlt.destinations.sql_client import SqlClientBase +from dlt.destinations.sql_jobs import SqlStagingCopyFollowupJob, SqlJobParams HINT_TO_POSTGRES_ATTR: Dict[TColumnHint, str] = {"unique": "UNIQUE"} @@ -43,15 +39,16 @@ def generate_sql( with sql_client.with_staging_dataset(): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) - # drop destination table - sql.append(f"DROP TABLE IF EXISTS {table_name};") - # moving staging table to destination schema - sql.append( - f"ALTER TABLE {staging_table_name} SET SCHEMA" - f" {sql_client.fully_qualified_dataset_name()};" + sql.extend( + ( + f"DROP TABLE IF EXISTS {table_name};", + ( + f"ALTER TABLE {staging_table_name} SET SCHEMA" + f" {sql_client.fully_qualified_dataset_name()};" + ), + f"CREATE TABLE {staging_table_name} (like {table_name} including all);", + ) ) - # recreate staging table - sql.append(f"CREATE TABLE {staging_table_name} (like {table_name} including all);") return sql @@ -111,8 +108,7 @@ def run(self) -> None: split_columns.append(norm_col) if norm_col in split_headers and is_nullable_column(col): split_null_headers.append(norm_col) - split_unknown_headers = set(split_headers).difference(split_columns) - if split_unknown_headers: + if split_unknown_headers := set(split_headers).difference(split_columns): raise DestinationInvalidFileFormat( "postgres", "csv", @@ -130,15 +126,8 @@ def run(self) -> None: qualified_table_name = sql_client.make_qualified_table_name(table_name) copy_sql = ( - "COPY %s (%s) FROM STDIN WITH (FORMAT CSV, DELIMITER '%s', NULL ''," - " %s ENCODING '%s')" - % ( - qualified_table_name, - headers, - sep, - null_headers, - csv_format.encoding, - ) + f"COPY {qualified_table_name} ({headers}) FROM STDIN WITH (FORMAT CSV, DELIMITER" + f" '{sep}', NULL '', {null_headers} ENCODING '{csv_format.encoding}')" ) 
with sql_client.begin_transaction(): with sql_client.native_connection.cursor() as cursor: @@ -173,15 +162,16 @@ def create_load_job( return job def _get_column_def_sql(self, c: TColumnSchema, table: PreparedTableSchema = None) -> str: - hints_str = " ".join( + hints_ = " ".join( self.active_hints.get(h, "") for h in self.active_hints.keys() if c.get(h, False) is True ) column_name = self.sql_client.escape_column_name(c["name"]) - return ( - f"{column_name} {self.type_mapper.to_destination_type(c,table)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" - ) + nullability = self._gen_not_null(c.get("nullable", True)) + column_type = self.type_mapper.to_destination_type(c, table) + + return f"{column_name} {column_type} {hints_} {nullability}" def _create_replace_followup_jobs( self, table_chain: Sequence[PreparedTableSchema] diff --git a/dlt/destinations/impl/postgres/postgres_adapter.py b/dlt/destinations/impl/postgres/postgres_adapter.py new file mode 100644 index 0000000000..11e86ec525 --- /dev/null +++ b/dlt/destinations/impl/postgres/postgres_adapter.py @@ -0,0 +1,63 @@ +from typing import Any, Optional + +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from dlt.destinations.utils import get_resource_for_adapter +from dlt.extract import DltResource + +GEOMETRY_HINT = "x-postgres-geometry" +SRID_HINT = "x-postgres-srid" + + +def postgres_adapter( + data: Any, + geometry: TColumnNames = None, + srid: Optional[int] = 4326, +) -> DltResource: + """Prepares data for the postgres destination by specifying which columns should + be cast to PostGIS geometry types. + + Args: + data (Any): The data to be transformed. It can be raw data or an instance + of DltResource. If raw data, the function wraps it into a DltResource + object. + geometry (TColumnNames, optional): Specify columns to cast to geometries. + It can be a single column name as a string, or a list of column names. + srid (int, optional): The Spatial Reference System Identifier (SRID) to be + used for the geometry columns. If not provided, SRID 4326 will be used. + + Returns: + DltResource: A resource with applied postgres-specific hints. + + Raises: + ValueError: If input for `geometry` is invalid, or if no geometry columns are specified. + + Examples: + >>> data = [{"town": "Null Island", "loc": "POINT(0 0)"}] + >>> postgres_adapter(data, geometry="loc", srid=4326) + [DltResource with hints applied] + """ + resource = get_resource_for_adapter(data) + + column_hints: TTableSchemaColumns = {} + + if geometry: + if isinstance(geometry, str): + geometry = [geometry] + if not isinstance(geometry, list): + raise ValueError( + "'geometry' must be a list of column names or a single column name as a string." 
+ ) + + for column_name in geometry: + column_hints[column_name] = { + "name": column_name, + GEOMETRY_HINT: True, # type: ignore[misc] + } + if srid is not None: + column_hints[column_name][SRID_HINT] = srid # type: ignore + + if not column_hints: + raise ValueError("A value for 'geometry' must be specified.") + else: + resource.apply_hints(columns=column_hints) + return resource diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index bb9aba9051..922b187a7e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -117,7 +117,57 @@ In the example above, `arrow_table` will be converted to CSV with **pyarrow** an ## Supported column hints `postgres` will create unique indexes for all columns with `unique` hints. This behavior **may be disabled**. -### Table and column identifiers +### Spatial types + +To enable GIS capabilities in your Postgres destination, use the `x-postgres-geometry` and `x-postgres-srid` hints for columns containing geometric data. +The `postgres_adapter` applies these hints conveniently, with a default SRID of `4326`. + +**Supported input formats:** + +- WKT (Well-Known Text) +- WKB (Well-Known Binary) as a hexadecimal string + +If you have geometry data in binary format, convert it to its hexadecimal representation before loading. + +**Example:** Using `postgres_adapter` with different geometry formats + +```py +from dlt.destinations.impl.postgres.postgres_adapter import postgres_adapter + +# Sample data with various geometry types +data_wkt = [ + {"type": "Point_wkt", "geom": "POINT (1 1)"}, + {"type": "Polygon_wkt", "geom": "POLYGON ((0 0, 1 0, 1 1, 0 1, 0 0))"}, + ] + +data_wkb_hex = [ + {"type": "Point_wkb_hex", "geom": "0101000000000000000000F03F000000000000F03F"}, + {"type": "LineString_wkb_hex", "geom": "01020000000300000000000000000000000000000000000000000000000000F03F000000000000F03F00000000000000400000000000000040"}, +] + + + +# Apply postgres_adapter to the 'geom' column with default SRID 4326 +resource_wkt = postgres_adapter(data_wkt, geometry="geom") +resource_wkb_hex = postgres_adapter(data_wkb_hex, geometry="geom") + +# If you need a different SRID +resource_wkt = postgres_adapter(data_wkt, geometry="geom", srid=3242) +``` + +Ensure that the PostGIS extension is enabled in your Postgres database: + +```sql +CREATE EXTENSION postgis; +``` + +This configuration allows `dlt` to map the `geom` column to the PostGIS `geometry` type for spatial queries and analyses. + +:::warning +The `LinearRing` geometry type isn't supported. +::: + +## Table and column identifiers Postgres supports both case-sensitive and case-insensitive identifiers. All unquoted and lowercase identifiers resolve case-insensitively in SQL statements. Case insensitive [naming conventions](../../general-usage/naming-convention.md#case-sensitive-and-insensitive-destinations) like the default **snake_case** will generate case-insensitive identifiers. Case sensitive (like **sql_cs_v1**) will generate case-sensitive identifiers that must be quoted in SQL statements. ## Additional destination options diff --git a/poetry.lock b/poetry.lock index 1bcff1de4a..9ae26bd04c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. 
[[package]] name = "about-time" @@ -13,13 +13,13 @@ files = [ [[package]] name = "adlfs" -version = "2024.7.0" +version = "2024.4.1" description = "Access Azure Datalake Gen1 with fsspec and dask" optional = true python-versions = ">=3.8" files = [ - {file = "adlfs-2024.7.0-py3-none-any.whl", hash = "sha256:2005c8e124fda3948f2a6abb2dbebb2c936d2d821acaca6afd61932edfa9bc07"}, - {file = "adlfs-2024.7.0.tar.gz", hash = "sha256:106995b91f0eb5e775bcd5957d180d9a14faef3271a063b1f65c66fd5ab05ddf"}, + {file = "adlfs-2024.4.1-py3-none-any.whl", hash = "sha256:acea94612ddacaa34ea8c6babcc95b8da6982f930cdade7a86fbd17382403e16"}, + {file = "adlfs-2024.4.1.tar.gz", hash = "sha256:75530a45447f358ae53c5c39c298b8d966dae684be84db899f63b94cd96fc000"}, ] [package.dependencies] @@ -4504,13 +4504,13 @@ files = [ [[package]] name = "ibis-framework" -version = "10.0.0.dev231" +version = "10.0.0.dev256" description = "The portable Python dataframe library" optional = true python-versions = "<4.0,>=3.10" files = [ - {file = "ibis_framework-10.0.0.dev231-py3-none-any.whl", hash = "sha256:8689cbcd55c3680bdb5fd51ff0d2a10260372c1b15661c123b0460087cfdbda2"}, - {file = "ibis_framework-10.0.0.dev231.tar.gz", hash = "sha256:199142243d1a6a0eba3bbbe0debba910fc8087dffe4eac9e3d61823f6988f421"}, + {file = "ibis_framework-10.0.0.dev256-py3-none-any.whl", hash = "sha256:d6f21278e6fd78920bbe986df2c871921142635cc4f7d5d2048cae26e307a3df"}, + {file = "ibis_framework-10.0.0.dev256.tar.gz", hash = "sha256:e9f97d8177fd88f4a3578be20519c1da79a6a7ffac678b46b790bfde67405930"}, ] [package.dependencies] @@ -4520,26 +4520,27 @@ db-dtypes = {version = ">=0.3,<2", optional = true, markers = "extra == \"bigque duckdb = {version = ">=0.10,<1.2", optional = true, markers = "extra == \"duckdb\""} google-cloud-bigquery = {version = ">=3,<4", optional = true, markers = "extra == \"bigquery\""} google-cloud-bigquery-storage = {version = ">=2,<3", optional = true, markers = "extra == \"bigquery\""} -numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"duckdb\" or extra == \"oracle\" or extra == \"polars\" or extra == \"pyspark\""} -pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or 
extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} parsy = ">=2,<3" psycopg2 = {version = ">=2.8.4,<3", optional = true, markers = "extra == \"postgres\" or extra == \"risingwave\""} -pyarrow = {version = ">=10.0.1,<19", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow = {version = ">=10.0.1,<19", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} pydata-google-auth = {version = ">=1.4.0,<2", optional = true, markers = "extra == \"bigquery\""} pyodbc = {version = ">=4.0.39,<6", optional = true, markers = "extra == \"mssql\""} python-dateutil = ">=2.8.2,<3" pytz = ">=2022.7" -rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == 
\"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} snowflake-connector-python = {version = ">=3.0.2,<3.3.0b1 || >3.3.0b1,<4", optional = true, markers = "extra == \"snowflake\""} -sqlglot = ">=23.4,<25.29" +sqlglot = ">=23.4,<25.30" toolz = ">=0.11,<2" typing-extensions = ">=4.3.0,<5" [package.extras] bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +databricks = ["databricks-sql-connector-core (>=4,<5)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] datafusion = ["datafusion (>=0.6,<43)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] decompiler = ["black (>=22.1.0,<25)"] deltalake = ["deltalake (>=0.9.0,<1)"] @@ -7453,18 +7454,18 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydata-google-auth" -version = "1.8.2" +version = "1.9.0" description = "PyData helpers for authenticating to Google APIs" optional = true -python-versions = "*" +python-versions = ">=3.9" files = [ - {file = "pydata-google-auth-1.8.2.tar.gz", hash = "sha256:547b6c0fbea657dcecd50887c5db8640ebec062a59a2b88e8ff8e53a04818303"}, - {file = "pydata_google_auth-1.8.2-py2.py3-none-any.whl", hash = "sha256:a9dce59af4a170ea60c4b2ebbc83ee1f74d34255a4f97b2469ae9a4a0dc98e99"}, + {file = "pydata-google-auth-1.9.0.tar.gz", hash = "sha256:2f546e88f007dfdb050087556eb46d6008e351386a7b368096797fae5df374f2"}, + {file = "pydata_google_auth-1.9.0-py2.py3-none-any.whl", hash = "sha256:e17a44ce8de5b48883667357c03595b85d80938bf1fb714d65bfac9a9f9c8add"}, ] [package.dependencies] -google-auth = {version = ">=1.25.0,<3.0dev", markers = "python_version >= \"3.6\""} -google-auth-oauthlib = {version = ">=0.4.0", markers = "python_version >= \"3.6\""} +google-auth = ">=1.25.0,<3.0dev" +google-auth-oauthlib = ">=0.4.0" setuptools = "*" [[package]] @@ -8820,6 +8821,64 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "shapely" +version = "2.0.6" +description = "Manipulation and analysis of 
geometric objects" +optional = false +python-versions = ">=3.7" +files = [ + {file = "shapely-2.0.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:29a34e068da2d321e926b5073539fd2a1d4429a2c656bd63f0bd4c8f5b236d0b"}, + {file = "shapely-2.0.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c84c3f53144febf6af909d6b581bc05e8785d57e27f35ebaa5c1ab9baba13b"}, + {file = "shapely-2.0.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ad2fae12dca8d2b727fa12b007e46fbc522148a584f5d6546c539f3464dccde"}, + {file = "shapely-2.0.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3304883bd82d44be1b27a9d17f1167fda8c7f5a02a897958d86c59ec69b705e"}, + {file = "shapely-2.0.6-cp310-cp310-win32.whl", hash = "sha256:3ec3a0eab496b5e04633a39fa3d5eb5454628228201fb24903d38174ee34565e"}, + {file = "shapely-2.0.6-cp310-cp310-win_amd64.whl", hash = "sha256:28f87cdf5308a514763a5c38de295544cb27429cfa655d50ed8431a4796090c4"}, + {file = "shapely-2.0.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5aeb0f51a9db176da9a30cb2f4329b6fbd1e26d359012bb0ac3d3c7781667a9e"}, + {file = "shapely-2.0.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9a7a78b0d51257a367ee115f4d41ca4d46edbd0dd280f697a8092dd3989867b2"}, + {file = "shapely-2.0.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f32c23d2f43d54029f986479f7c1f6e09c6b3a19353a3833c2ffb226fb63a855"}, + {file = "shapely-2.0.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b3dc9fb0eb56498912025f5eb352b5126f04801ed0e8bdbd867d21bdbfd7cbd0"}, + {file = "shapely-2.0.6-cp311-cp311-win32.whl", hash = "sha256:d93b7e0e71c9f095e09454bf18dad5ea716fb6ced5df3cb044564a00723f339d"}, + {file = "shapely-2.0.6-cp311-cp311-win_amd64.whl", hash = "sha256:c02eb6bf4cfb9fe6568502e85bb2647921ee49171bcd2d4116c7b3109724ef9b"}, + {file = "shapely-2.0.6-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cec9193519940e9d1b86a3b4f5af9eb6910197d24af02f247afbfb47bcb3fab0"}, + {file = "shapely-2.0.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83b94a44ab04a90e88be69e7ddcc6f332da7c0a0ebb1156e1c4f568bbec983c3"}, + {file = "shapely-2.0.6-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:537c4b2716d22c92036d00b34aac9d3775e3691f80c7aa517c2c290351f42cd8"}, + {file = "shapely-2.0.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98fea108334be345c283ce74bf064fa00cfdd718048a8af7343c59eb40f59726"}, + {file = "shapely-2.0.6-cp312-cp312-win32.whl", hash = "sha256:42fd4cd4834747e4990227e4cbafb02242c0cffe9ce7ef9971f53ac52d80d55f"}, + {file = "shapely-2.0.6-cp312-cp312-win_amd64.whl", hash = "sha256:665990c84aece05efb68a21b3523a6b2057e84a1afbef426ad287f0796ef8a48"}, + {file = "shapely-2.0.6-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:42805ef90783ce689a4dde2b6b2f261e2c52609226a0438d882e3ced40bb3013"}, + {file = "shapely-2.0.6-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6d2cb146191a47bd0cee8ff5f90b47547b82b6345c0d02dd8b25b88b68af62d7"}, + {file = "shapely-2.0.6-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3fdef0a1794a8fe70dc1f514440aa34426cc0ae98d9a1027fb299d45741c381"}, + {file = "shapely-2.0.6-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c665a0301c645615a107ff7f52adafa2153beab51daf34587170d85e8ba6805"}, + {file = "shapely-2.0.6-cp313-cp313-win32.whl", hash = "sha256:0334bd51828f68cd54b87d80b3e7cee93f249d82ae55a0faf3ea21c9be7b323a"}, + {file = 
"shapely-2.0.6-cp313-cp313-win_amd64.whl", hash = "sha256:d37d070da9e0e0f0a530a621e17c0b8c3c9d04105655132a87cfff8bd77cc4c2"}, + {file = "shapely-2.0.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:fa7468e4f5b92049c0f36d63c3e309f85f2775752e076378e36c6387245c5462"}, + {file = "shapely-2.0.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed5867e598a9e8ac3291da6cc9baa62ca25706eea186117034e8ec0ea4355653"}, + {file = "shapely-2.0.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81d9dfe155f371f78c8d895a7b7f323bb241fb148d848a2bf2244f79213123fe"}, + {file = "shapely-2.0.6-cp37-cp37m-win32.whl", hash = "sha256:fbb7bf02a7542dba55129062570211cfb0defa05386409b3e306c39612e7fbcc"}, + {file = "shapely-2.0.6-cp37-cp37m-win_amd64.whl", hash = "sha256:837d395fac58aa01aa544495b97940995211e3e25f9aaf87bc3ba5b3a8cd1ac7"}, + {file = "shapely-2.0.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c6d88ade96bf02f6bfd667ddd3626913098e243e419a0325ebef2bbd481d1eb6"}, + {file = "shapely-2.0.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8b3b818c4407eaa0b4cb376fd2305e20ff6df757bf1356651589eadc14aab41b"}, + {file = "shapely-2.0.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbc783529a21f2bd50c79cef90761f72d41c45622b3e57acf78d984c50a5d13"}, + {file = "shapely-2.0.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2423f6c0903ebe5df6d32e0066b3d94029aab18425ad4b07bf98c3972a6e25a1"}, + {file = "shapely-2.0.6-cp38-cp38-win32.whl", hash = "sha256:2de00c3bfa80d6750832bde1d9487e302a6dd21d90cb2f210515cefdb616e5f5"}, + {file = "shapely-2.0.6-cp38-cp38-win_amd64.whl", hash = "sha256:3a82d58a1134d5e975f19268710e53bddd9c473743356c90d97ce04b73e101ee"}, + {file = "shapely-2.0.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:392f66f458a0a2c706254f473290418236e52aa4c9b476a072539d63a2460595"}, + {file = "shapely-2.0.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:eba5bae271d523c938274c61658ebc34de6c4b33fdf43ef7e938b5776388c1be"}, + {file = "shapely-2.0.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7060566bc4888b0c8ed14b5d57df8a0ead5c28f9b69fb6bed4476df31c51b0af"}, + {file = "shapely-2.0.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b02154b3e9d076a29a8513dffcb80f047a5ea63c897c0cd3d3679f29363cf7e5"}, + {file = "shapely-2.0.6-cp39-cp39-win32.whl", hash = "sha256:44246d30124a4f1a638a7d5419149959532b99dfa25b54393512e6acc9c211ac"}, + {file = "shapely-2.0.6-cp39-cp39-win_amd64.whl", hash = "sha256:2b542d7f1dbb89192d3512c52b679c822ba916f93479fa5d4fc2fe4fa0b3c9e8"}, + {file = "shapely-2.0.6.tar.gz", hash = "sha256:997f6159b1484059ec239cacaa53467fd8b5564dabe186cd84ac2944663b0bf6"}, +] + +[package.dependencies] +numpy = ">=1.14,<3" + +[package.extras] +docs = ["matplotlib", "numpydoc (==1.1.*)", "sphinx", "sphinx-book-theme", "sphinx-remove-toctrees"] +test = ["pytest", "pytest-cov"] + [[package]] name = "shellingham" version = "1.5.4" @@ -10544,6 +10603,7 @@ lancedb = ["lancedb", "pyarrow", "tantivy"] motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] +postgis = ["psycopg2-binary", "psycopg2cffi"] postgres = ["psycopg2-binary", "psycopg2cffi"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] @@ -10558,4 +10618,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "749c79ead9b1a800cbe5d9c93650e2ede7e9bcb240d07ff2d1787d032a0f2fa6" +content-hash = 
"24e262ce6bb496fad6e587c76bb9ad60a2cc45a00f52e368b59978093e57b77c" diff --git a/pyproject.toml b/pyproject.toml index a1a71a1a6a..638653ffcf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,6 +119,7 @@ lancedb = ["lancedb", "pyarrow", "tantivy"] deltalake = ["deltalake", "pyarrow"] sql_database = ["sqlalchemy"] sqlalchemy = ["sqlalchemy", "alembic"] +postgis = ["psycopg2-binary", "psycopg2cffi"] [tool.poetry.scripts] dlt = "dlt.cli._dlt:_main" @@ -168,6 +169,7 @@ types-regex = "^2024.5.15.20240519" flake8-print = "^5.0.0" mimesis = "^7.0.0" ibis-framework = { version = ">=9.0.0", markers = "python_version >= '3.10'", optional = true, extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} +shapely = ">=2.0.6" [tool.poetry.group.sources] optional = true diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 4dac400f2a..e2ed0f0b2e 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -1,24 +1,35 @@ -import pytest from copy import deepcopy +from typing import Generator, Any, List + +import pytest import sqlfluff +import dlt from dlt.common.exceptions import TerminalValueError -from dlt.common.utils import uniq_id from dlt.common.schema import Schema, utils - +from dlt.common.typing import DictStrStr +from dlt.common.utils import uniq_id from dlt.destinations import postgres -from dlt.destinations.impl.postgres.postgres import PostgresClient from dlt.destinations.impl.postgres.configuration import ( PostgresClientConfiguration, PostgresCredentials, ) - +from dlt.destinations.impl.postgres.postgres import ( + PostgresClient, +) +from dlt.destinations.impl.postgres.postgres_adapter import ( + postgres_adapter, + SRID_HINT, + GEOMETRY_HINT, +) +from dlt.extract import DltResource from tests.cases import ( TABLE_UPDATE, TABLE_UPDATE_ALL_INT_PRECISIONS, - TABLE_UPDATE_ALL_TIMESTAMP_PRECISIONS, ) -from tests.load.utils import empty_schema +from tests.load.postgres.utils import generate_sample_geometry_records +from tests.load.utils import destinations_configs, DestinationTestConfiguration, sequence_generator +from tests.utils import assert_load_info # mark all tests as essential, do not remove pytestmark = pytest.mark.essential @@ -182,3 +193,217 @@ def test_create_dlt_table(client: PostgresClient) -> None: sqlfluff.parse(sql, dialect="postgres") qualified_name = client.sql_client.make_qualified_table_name("_dlt_version") assert f"CREATE TABLE IF NOT EXISTS {qualified_name}" in sql + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres"]), + ids=lambda x: x.name, +) +def test_adapter_geometry_hint_config( + destination_config: DestinationTestConfiguration, +) -> None: + @dlt.resource(columns=[{"name": "content", "data_type": "text"}]) + def some_data() -> Generator[DictStrStr, Any, None]: + yield from next(sequence_generator()) + + assert some_data.columns["content"] == {"name": "content", "data_type": "text"} # type: ignore[index] + + # Default SRID. + postgres_adapter(some_data, geometry=["content"]) + + assert some_data.columns["content"] == { # type: ignore + "name": "content", + "data_type": "text", + GEOMETRY_HINT: True, + SRID_HINT: 4326, + } + + # Nonstandard SRID. 
+ postgres_adapter(some_data, geometry="content", srid=8232) + + assert some_data.columns["content"] == { # type: ignore + "name": "content", + "data_type": "text", + GEOMETRY_HINT: True, + SRID_HINT: 8232, + } + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres"]), + ids=lambda x: x.name, +) +def test_geometry_types( + destination_config: DestinationTestConfiguration, +) -> None: + from shapely import wkt, wkb, LinearRing, Polygon # type: ignore + + @dlt.resource + def geodata_default_wkt(): + yield from generate_sample_geometry_records("wkt") + + @dlt.resource + def geodata_3857_wkt(): + yield from generate_sample_geometry_records("wkt") + + @dlt.resource + def geodata_2163_wkt(): + yield from generate_sample_geometry_records("wkt") + + @dlt.resource + def geodata_default_wkb_hex(): + yield from generate_sample_geometry_records("wkb_hex") + + @dlt.resource + def geodata_3857_wkb_hex(): + yield from generate_sample_geometry_records("wkb_hex") + + @dlt.resource + def geodata_2163_wkb_hex(): + yield from generate_sample_geometry_records("wkb_hex") + + @dlt.resource(file_format="csv") + def geodata_default_csv_wkt(): + yield from generate_sample_geometry_records("wkt") + + @dlt.resource(file_format="csv") + def geodata_3857_csv_wkt(): + yield from generate_sample_geometry_records("wkt") + + @dlt.resource(file_format="csv") + def geodata_2163_csv_wkt(): + yield from generate_sample_geometry_records("wkt") + + @dlt.resource(file_format="csv") + def geodata_default_csv_wkb_hex(): + yield from generate_sample_geometry_records("wkb_hex") + + @dlt.resource(file_format="csv") + def geodata_3857_csv_wkb_hex(): + yield from generate_sample_geometry_records("wkb_hex") + + @dlt.resource(file_format="csv") + def geodata_2163_csv_wkb_hex(): + yield from generate_sample_geometry_records("wkb_hex") + + @dlt.resource + def no_geodata(): + yield from [{"a": 1}, {"a": 2}] + + postgres_adapter(geodata_default_wkt, geometry=["geom"]) + postgres_adapter(geodata_3857_wkt, geometry=["geom"], srid=3857) + postgres_adapter(geodata_2163_wkt, geometry=["geom"], srid=2163) + postgres_adapter(geodata_default_wkb_hex, geometry=["geom"]) + postgres_adapter(geodata_3857_wkb_hex, geometry=["geom"], srid=3857) + postgres_adapter(geodata_2163_wkb_hex, geometry=["geom"], srid=2163) + postgres_adapter(geodata_default_csv_wkt, geometry=["geom"]) + postgres_adapter(geodata_3857_csv_wkt, geometry=["geom"], srid=3857) + postgres_adapter(geodata_2163_csv_wkt, geometry=["geom"], srid=2163) + postgres_adapter(geodata_default_csv_wkb_hex, geometry=["geom"]) + postgres_adapter(geodata_3857_csv_wkb_hex, geometry=["geom"], srid=3857) + postgres_adapter(geodata_2163_csv_wkb_hex, geometry=["geom"], srid=2163) + + @dlt.source + def geodata() -> List[DltResource]: + return [ + geodata_default_wkt, + geodata_3857_wkt, + geodata_2163_wkt, + geodata_default_wkb_hex, + geodata_3857_wkb_hex, + geodata_2163_wkb_hex, + no_geodata, + geodata_default_csv_wkt, + geodata_3857_csv_wkt, + geodata_2163_csv_wkt, + geodata_default_csv_wkb_hex, + geodata_3857_csv_wkb_hex, + geodata_2163_csv_wkb_hex, + ] + + pipeline = destination_config.setup_pipeline("test_geometry_types", dev_mode=True) + info = pipeline.run( + geodata(), + ) + assert_load_info(info) + + # Assert that types were read in as PostGIS geometry types + with pipeline.sql_client() as c: + with c.execute_query(f"""SELECT f_geometry_column +FROM geometry_columns +WHERE f_table_name in + ('geodata_default_wkb', 'geodata_3857_wkb', 
'geodata_2163_wkb', 'geodata_default_wkt', 'geodata_3857_wkt', + 'geodata_2163_wkt', 'geodata_default_wkb_hex', 'geodata_3857_wkb_hex', 'geodata_2163_wkb_hex', + 'geodata_default_csv_wkt', 'geodata_3857_csv_wkt', 'geodata_2163_csv_wkt', 'geodata_default_csv_wkb_hex', + 'geodata_3857_csv_wkb_hex', 'geodata_2163_csv_wkb_hex' + ) + AND f_table_schema = '{c.fully_qualified_dataset_name(escape=False)}'""") as cur: + records = cur.fetchall() + assert records + assert {record[0] for record in records} == {"geom"} + + # Verify round-trip integrity + for resource in [ + "geodata_default_wkt", + "geodata_3857_wkt", + "geodata_2163_wkt", + "geodata_default_wkb_hex", + "geodata_3857_wkb_hex", + "geodata_2163_wkb_hex", + "geodata_default_csv_wkt", + "geodata_3857_csv_wkt", + "geodata_2163_csv_wkt", + "geodata_default_csv_wkb_hex", + "geodata_3857_csv_wkb_hex", + "geodata_2163_csv_wkb_hex", + ]: + srid = 4326 if resource.startswith("geodata_default") else int(resource.split("_")[1]) + + query = f""" + SELECT type, ST_AsText(geom) as wkt, ST_SRID(geom) as srid, ST_AsBinary(geom) as wkb + FROM {c.make_qualified_table_name(resource)} + """ + + with c.execute_query(query) as cur: + results = cur.fetchall() + + def get_format(column_name): + if column_name.endswith("wkb_hex"): + return "wkb_hex" + return column_name.split("_")[-1] + + original_geometries = generate_sample_geometry_records(get_format(resource)) + + for result in results: + db_type, db_wkt, db_srid, db_wkb = result + orig_geom = next((g for g in original_geometries if g["type"] == db_type), None) + + assert orig_geom is not None, f"No matching original geometry found for {db_type}" + + assert ( + db_srid == srid + ), f"SRID mismatch for {db_type}: expected {srid}, got {db_srid}" + + if "Empty" in db_type: + assert wkt.loads(db_wkt).is_empty, f"Expected empty geometry for {db_type}" + else: + if "_wkt" in db_type: + orig_geom = wkt.loads(orig_geom["geom"]) + db_geom = wkt.loads(db_wkt) + elif "_wkb_hex" in db_type: + orig_geom = wkb.loads(bytes.fromhex(orig_geom["geom"])) + db_geom = wkb.loads(bytes(db_wkb)) + + tolerance = 1e-8 + if isinstance(orig_geom, LinearRing): + # LinearRing geometries are converted to Polygons for PostGIS compatibility. + db_geom = Polygon(orig_geom) + assert LinearRing(db_geom.exterior.coords).equals_exact( + orig_geom, tolerance + ), f"Geometry mismatch for {db_type}" + else: + assert orig_geom.equals_exact( # type: ignore[attr-defined] + db_geom, tolerance + ), f"Geometry mismatch for {db_type}" diff --git a/tests/load/postgres/utils.py b/tests/load/postgres/utils.py new file mode 100644 index 0000000000..b03a6b5096 --- /dev/null +++ b/tests/load/postgres/utils.py @@ -0,0 +1,68 @@ +from typing import List + +from shapely import ( # type: ignore + Point, + LineString, + Polygon, + MultiPoint, + MultiLineString, + MultiPolygon, + GeometryCollection, + LinearRing, +) +from shapely.wkb import dumps as wkb_dumps # type: ignore + +from dlt.common.typing import DictStrStr + + +def generate_sample_geometry_records(geometry_type: str) -> List[DictStrStr]: + """ + Generate sample geometry records including WKT and WKB representations. + + Returns: + A list of dictionaries, each containing a geometry type, + its Well-Known Text (WKT), and Well-Known Binary (WKB) representation. 
+ """ + geometries = [ + ("Point", Point(1, 1)), + ("LineString", LineString([(0, 0), (1, 1), (2, 2)])), + ("Polygon", Polygon([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)])), + ("MultiPoint", MultiPoint([(0, 0), (1, 1), (2, 2)])), + ("MultiLineString", MultiLineString([((0, 0), (1, 1)), ((2, 2), (3, 3))])), + ( + "MultiPolygon", + MultiPolygon( + [ + Polygon([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]), + Polygon([(2, 2), (3, 2), (3, 3), (2, 3), (2, 2)]), + ] + ), + ), + ( + "GeometryCollection", + GeometryCollection([Point(1, 1), LineString([(0, 0), (1, 1), (2, 2)])]), + ), + ( + "ComplexPolygon", + Polygon( + [(0, 0), (10, 0), (10, 10), (0, 10), (0, 0)], + [[(4, 4), (6, 4), (6, 6), (4, 6), (4, 4)]], + ), + ), + ("EmptyPoint", Point()), + ("EmptyLineString", LineString()), + ("EmptyPolygon", Polygon()), + ("EmptyMultiPoint", MultiPoint()), + ("EmptyMultiLineString", MultiLineString()), + ("EmptyMultiPolygon", MultiPolygon()), + ("EmptyGeometryCollection", GeometryCollection()), + ] + + # LinearRing only works with wkb types + if geometry_type == "wkb": + geometries += [("LinearRing", LinearRing([(0, 0), (1, 0), (1, 1), (0, 1), (0, 0)]))] + + return [ + {"type": f"{name}_{geometry_type}", "geom": getattr(geom, geometry_type)} + for name, geom in geometries + ] From 61c2ed96053bd02632b87e2c85fa940a91a9d03b Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Sat, 30 Nov 2024 14:45:29 -0500 Subject: [PATCH 2/4] Incremental table hints and incremental in resource decorator (#2033) * Incremental table hints and incremental in resource decorator * Extract incremental settings to a dict in table schema * Support passing incremental settings to @resource decorator * Fix type errors * Reset incremental from_hints when set in resource decorator * Column hint * Merge multiple hints * Test non column match * adds make_hints test * Accept TIncrementalconfig in make hints * bool only incremental hint * Test jsonpath simple field name --------- Co-authored-by: Marcin Rudolf --- dlt/common/destination/typing.py | 6 +- dlt/common/incremental/__init__.py | 0 dlt/{extract => common}/incremental/typing.py | 12 +- dlt/common/pipeline.py | 3 +- dlt/common/schema/typing.py | 5 +- dlt/common/schema/utils.py | 11 + dlt/common/typing.py | 7 + .../impl/bigquery/bigquery_adapter.py | 6 +- .../impl/lancedb/lancedb_adapter.py | 3 +- .../impl/qdrant/qdrant_adapter.py | 3 +- .../impl/weaviate/weaviate_adapter.py | 3 +- dlt/extract/decorators.py | 19 +- dlt/extract/extract.py | 7 +- dlt/extract/hints.py | 36 ++- dlt/extract/incremental/__init__.py | 74 +++++- dlt/extract/incremental/transform.py | 5 +- dlt/extract/items.py | 12 +- dlt/extract/resource.py | 113 ++++++--- dlt/extract/utils.py | 11 +- dlt/pipeline/pipeline.py | 3 +- dlt/sources/rest_api/typing.py | 7 +- tests/common/test_jsonpath.py | 43 ++++ tests/common/test_validation.py | 4 +- tests/extract/test_extract.py | 42 ++++ tests/extract/test_incremental.py | 229 +++++++++++++++++- 25 files changed, 577 insertions(+), 87 deletions(-) create mode 100644 dlt/common/incremental/__init__.py rename dlt/{extract => common}/incremental/typing.py (66%) create mode 100644 tests/common/test_jsonpath.py diff --git a/dlt/common/destination/typing.py b/dlt/common/destination/typing.py index 8cc08756cd..c79a2b0adc 100644 --- a/dlt/common/destination/typing.py +++ b/dlt/common/destination/typing.py @@ -1,6 +1,10 @@ from typing import Optional -from dlt.common.schema.typing import _TTableSchemaBase, TWriteDisposition, TTableReferenceParam +from dlt.common.schema.typing import ( + 
_TTableSchemaBase, + TWriteDisposition, + TTableReferenceParam, +) class PreparedTableSchema(_TTableSchemaBase, total=False): diff --git a/dlt/common/incremental/__init__.py b/dlt/common/incremental/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dlt/extract/incremental/typing.py b/dlt/common/incremental/typing.py similarity index 66% rename from dlt/extract/incremental/typing.py rename to dlt/common/incremental/typing.py index 7b7786b529..460e2f234b 100644 --- a/dlt/extract/incremental/typing.py +++ b/dlt/common/incremental/typing.py @@ -2,9 +2,7 @@ from typing import Any, Callable, List, Literal, Optional, Sequence, TypeVar, Union -from dlt.common.schema.typing import TColumnNames -from dlt.common.typing import TSortOrder -from dlt.extract.items import TTableHintTemplate +from dlt.common.typing import TSortOrder, TTableHintTemplate, TColumnNames TCursorValue = TypeVar("TCursorValue", bound=Any) LastValueFunc = Callable[[Sequence[TCursorValue]], Any] @@ -19,10 +17,12 @@ class IncrementalColumnState(TypedDict): class IncrementalArgs(TypedDict, total=False): cursor_path: str - initial_value: Optional[str] - last_value_func: Optional[LastValueFunc[str]] + initial_value: Optional[Any] + last_value_func: Optional[Union[LastValueFunc[str], Literal["min", "max"]]] + """Last value callable or name of built in function""" primary_key: Optional[TTableHintTemplate[TColumnNames]] - end_value: Optional[str] + end_value: Optional[Any] row_order: Optional[TSortOrder] allow_external_schedulers: Optional[bool] lag: Optional[Union[float, int]] + on_cursor_value_missing: Optional[OnCursorValueMissing] diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index dba1036f85..9d3d5792ea 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -48,7 +48,6 @@ ) from dlt.common.schema import Schema from dlt.common.schema.typing import ( - TColumnNames, TColumnSchema, TWriteDispositionConfig, TSchemaContract, @@ -56,7 +55,7 @@ from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.time import ensure_pendulum_datetime, precise_time -from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize +from dlt.common.typing import DictStrAny, REPattern, StrAny, SupportsHumanize, TColumnNames from dlt.common.jsonpath import delete_matches, TAnyJsonPath from dlt.common.data_writers.writers import TLoaderFileFormat from dlt.common.utils import RowCounts, merge_row_counts diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index ed6c1c6d78..c8f5de03ed 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -19,7 +19,7 @@ from dlt.common.data_types import TDataType from dlt.common.normalizers.typing import TNormalizersConfig -from dlt.common.typing import TSortOrder, TAnyDateTime, TLoaderFileFormat +from dlt.common.typing import TSortOrder, TAnyDateTime, TLoaderFileFormat, TColumnNames try: from pydantic import BaseModel as _PydanticBaseModel @@ -132,8 +132,6 @@ class TColumnPropInfo(NamedTuple): "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double" ] TTypeDetectionFunc = Callable[[Type[Any], Any], Optional[TDataType]] -TColumnNames = Union[str, Sequence[str]] -"""A string representing a column name or a list of""" class TColumnType(TypedDict, total=False): @@ -166,6 +164,7 @@ class TColumnSchema(TColumnSchemaBase, total=False): variant: Optional[bool] hard_delete: Optional[bool] dedup_sort: 
Optional[TSortOrder] + incremental: Optional[bool] TTableSchemaColumns = Dict[str, TColumnSchema] diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index e2e1f959dc..038abdc4d0 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -547,6 +547,17 @@ def merge_diff(table: TTableSchema, table_diff: TPartialTableSchema) -> TPartial * table hints are added or replaced from diff * nothing gets deleted """ + + incremental_a_col = get_first_column_name_with_prop( + table, "incremental", include_incomplete=True + ) + if incremental_a_col: + incremental_b_col = get_first_column_name_with_prop( + table_diff, "incremental", include_incomplete=True + ) + if incremental_b_col: + table["columns"][incremental_a_col].pop("incremental") + # add new columns when all checks passed updated_columns = merge_columns(table["columns"], table_diff["columns"]) table.update(table_diff) diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 94edb57194..a3364d1b07 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -29,6 +29,7 @@ Iterator, Generator, NamedTuple, + Sequence, ) from typing_extensions import ( @@ -112,6 +113,8 @@ class SecretSentinel: TSecretStrValue = Annotated[str, SecretSentinel] +TColumnNames = Union[str, Sequence[str]] +"""A string representing a column name or a list of""" TDataItem: TypeAlias = Any """A single data item as extracted from data source""" TDataItems: TypeAlias = Union[TDataItem, List[TDataItem]] @@ -126,6 +129,10 @@ class SecretSentinel: TLoaderFileFormat = Literal["jsonl", "typed-jsonl", "insert_values", "parquet", "csv", "reference"] """known loader file formats""" +TDynHintType = TypeVar("TDynHintType") +TFunHintTemplate = Callable[[TDataItem], TDynHintType] +TTableHintTemplate = Union[TDynHintType, TFunHintTemplate[TDynHintType]] + class ConfigValueSentinel(NamedTuple): """Class to create singleton sentinel for config and secret injected value""" diff --git a/dlt/destinations/impl/bigquery/bigquery_adapter.py b/dlt/destinations/impl/bigquery/bigquery_adapter.py index 5f6a1fab85..05b26530d9 100644 --- a/dlt/destinations/impl/bigquery/bigquery_adapter.py +++ b/dlt/destinations/impl/bigquery/bigquery_adapter.py @@ -4,10 +4,8 @@ from dlt.common.destination import PreparedTableSchema from dlt.common.pendulum import timezone -from dlt.common.schema.typing import ( - TColumnNames, - TTableSchemaColumns, -) +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.typing import TColumnNames from dlt.destinations.utils import get_resource_for_adapter from dlt.extract import DltResource from dlt.extract.items import TTableHintTemplate diff --git a/dlt/destinations/impl/lancedb/lancedb_adapter.py b/dlt/destinations/impl/lancedb/lancedb_adapter.py index 4314dd703f..d192168d0a 100644 --- a/dlt/destinations/impl/lancedb/lancedb_adapter.py +++ b/dlt/destinations/impl/lancedb/lancedb_adapter.py @@ -1,6 +1,7 @@ from typing import Any, Dict -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.typing import TColumnNames from dlt.destinations.utils import get_resource_for_adapter from dlt.extract import DltResource from dlt.extract.items import TTableHintTemplate diff --git a/dlt/destinations/impl/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py index bbc2d719a8..5a5a44965c 100644 --- a/dlt/destinations/impl/qdrant/qdrant_adapter.py +++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py @@ -1,6 +1,7 @@ from 
typing import Any -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.typing import TColumnNames from dlt.extract import DltResource from dlt.destinations.utils import get_resource_for_adapter diff --git a/dlt/destinations/impl/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py index 0ca9047528..329d13c493 100644 --- a/dlt/destinations/impl/weaviate/weaviate_adapter.py +++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py @@ -1,6 +1,7 @@ from typing import Dict, Any, Literal, Set, get_args -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.typing import TColumnNames from dlt.extract import DltResource, resource as make_resource from dlt.destinations.utils import get_resource_for_adapter diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 63140e8f78..f8703e1452 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -32,7 +32,6 @@ from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION from dlt.common.schema.schema import Schema from dlt.common.schema.typing import ( - TColumnNames, TFileFormat, TWriteDisposition, TWriteDispositionConfig, @@ -43,7 +42,8 @@ ) from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage -from dlt.common.typing import AnyFun, ParamSpec, Concatenate, TDataItem, TDataItems +from dlt.common.typing import AnyFun, ParamSpec, Concatenate, TDataItem, TDataItems, TColumnNames + from dlt.common.utils import get_callable_name, get_module_name, is_inner_callable from dlt.extract.hints import make_hints @@ -70,6 +70,7 @@ TSourceFunParams, ) from dlt.extract.resource import DltResource, TUnboundDltResource, TDltResourceImpl +from dlt.extract.incremental import TIncrementalConfig @configspec @@ -446,6 +447,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, + incremental: Optional[TIncrementalConfig] = None, _impl_cls: Type[TDltResourceImpl] = DltResource, # type: ignore[assignment] ) -> TDltResourceImpl: ... @@ -468,6 +470,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, + incremental: Optional[TIncrementalConfig] = None, _impl_cls: Type[TDltResourceImpl] = DltResource, # type: ignore[assignment] ) -> Callable[[Callable[TResourceFunParams, Any]], TDltResourceImpl]: ... @@ -490,6 +493,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, + incremental: Optional[TIncrementalConfig] = None, _impl_cls: Type[TDltResourceImpl] = DltResource, # type: ignore[assignment] standalone: Literal[True] = True, ) -> Callable[ @@ -515,6 +519,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, + incremental: Optional[TIncrementalConfig] = None, _impl_cls: Type[TDltResourceImpl] = DltResource, # type: ignore[assignment] ) -> TDltResourceImpl: ... 
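Context for the hunks above: they add an `incremental` argument to the `@dlt.resource` overloads, typed as `TIncrementalConfig` (either an `Incremental[Any]` instance or a plain `IncrementalArgs` dict). A minimal usage sketch, assuming a dlt build that includes this patch and mirroring only the behavior exercised by the tests added later in this series (resource and field names below are illustrative):

    import dlt

    # cursor declared on the decorator; the resource body needs no incremental argument
    @dlt.resource(incremental=dlt.sources.incremental("updated_at", initial_value=100))
    def events():
        yield [{"updated_at": i} for i in range(90, 110)]

    # equivalent dict form, mirroring the IncrementalArgs keys
    @dlt.resource(incremental={"cursor_path": "updated_at", "initial_value": 100})
    def events_dict():
        yield [{"updated_at": i} for i in range(90, 110)]

    # with the default last_value_func (max), rows below initial_value are filtered out on iteration
    print(list(events()))

Both spellings resolve to the same hint: the dict form is converted with `Incremental.ensure_instance()` inside `make_hints()`, as shown in the hints.py hunks below.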
@@ -536,6 +541,7 @@ def resource( selected: bool = True, spec: Type[BaseConfiguration] = None, parallelized: bool = False, + incremental: Optional[TIncrementalConfig] = None, _impl_cls: Type[TDltResourceImpl] = DltResource, # type: ignore[assignment] standalone: bool = False, data_from: TUnboundDltResource = None, @@ -632,6 +638,7 @@ def make_resource(_name: str, _section: str, _data: Any) -> TDltResourceImpl: table_format=table_format, file_format=file_format, references=references, + incremental=incremental, ) resource = _impl_cls.from_data( @@ -643,6 +650,10 @@ def make_resource(_name: str, _section: str, _data: Any) -> TDltResourceImpl: cast(DltResource, data_from), True, ) + + if incremental: + # Reset the flag to allow overriding by incremental argument + resource.incremental._from_hints = False # If custom nesting level was specified then # we need to add it to table hints so that # later in normalizer dlt/common/normalizers/json/relational.py @@ -681,7 +692,7 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltResourceImpl: return _wrap def decorator( - f: Callable[TResourceFunParams, Any] + f: Callable[TResourceFunParams, Any], ) -> Callable[TResourceFunParams, TDltResourceImpl]: if not callable(f): if data_from: @@ -1023,7 +1034,7 @@ def get_source() -> DltSource: def defer( - f: Callable[TDeferredFunParams, TBoundItems] + f: Callable[TDeferredFunParams, TBoundItems], ) -> Callable[TDeferredFunParams, TDeferred[TBoundItems]]: @wraps(f) def _wrap(*args: Any, **kwargs: Any) -> TDeferred[TBoundItems]: diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index e65f6cf0d0..25c3a0dbae 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -2,7 +2,7 @@ from collections.abc import Sequence as C_Sequence from copy import copy import itertools -from typing import Iterator, List, Dict, Any, Optional +from typing import Iterator, List, Dict, Any, Optional, Mapping import yaml from dlt.common.configuration.container import Container @@ -17,13 +17,12 @@ WithStepInfo, reset_resource_state, ) -from dlt.common.typing import DictStrAny +from dlt.common.typing import DictStrAny, TColumnNames from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.schema import Schema, utils from dlt.common.schema.typing import ( TAnySchemaColumns, - TColumnNames, TSchemaContract, TTableFormat, TWriteDispositionConfig, @@ -39,7 +38,7 @@ from dlt.extract.decorators import SourceInjectableContext, SourceSchemaInjectableContext from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints -from dlt.extract.incremental import IncrementalResourceWrapper +from dlt.extract.incremental import IncrementalResourceWrapper, Incremental from dlt.extract.pipe_iterator import PipeIterator from dlt.extract.source import DltSource from dlt.extract.resource import DltResource diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 5daabd0c6a..000e5c4cdb 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -1,10 +1,9 @@ -from typing import TypedDict, cast, Any, Optional, Dict, Sequence, Mapping +from typing import TypedDict, cast, Any, Optional, Dict, Sequence, Mapping, Union from typing_extensions import Self from dlt.common import logger from dlt.common.schema.typing import ( C_DLT_ID, - TColumnNames, TColumnProp, TFileFormat, TPartialTableSchema, @@ -28,7 +27,7 @@ new_column, new_table, ) -from dlt.common.typing import TDataItem +from dlt.common.typing import TDataItem, TColumnNames from dlt.common.time import 
ensure_pendulum_datetime from dlt.common.utils import clone_dict_nested from dlt.common.normalizers.json.relational import DataItemNormalizer @@ -37,7 +36,7 @@ DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, ) -from dlt.extract.incremental import Incremental +from dlt.extract.incremental import Incremental, TIncrementalConfig from dlt.extract.items import TFunHintTemplate, TTableHintTemplate, TableNameMeta, ValidateItem from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint from dlt.extract.validation import create_item_validator @@ -86,6 +85,7 @@ def make_hints( table_format: TTableHintTemplate[TTableFormat] = None, file_format: TTableHintTemplate[TFileFormat] = None, references: TTableHintTemplate[TTableReferenceParam] = None, + incremental: TIncrementalConfig = None, ) -> TResourceHints: """A convenience function to create resource hints. Accepts both static and dynamic hints based on data. @@ -119,6 +119,8 @@ def make_hints( if validator: new_template["validator"] = validator DltResourceHints.validate_dynamic_hints(new_template) + if incremental is not None: # TODO: Validate + new_template["incremental"] = Incremental.ensure_instance(incremental) return new_template @@ -204,6 +206,10 @@ def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTab for k, v in table_template.items() if k not in NATURAL_CALLABLES } # type: ignore + if "incremental" in table_template: + incremental = table_template["incremental"] + if isinstance(incremental, Incremental) and incremental is not Incremental.EMPTY: + resolved_template["incremental"] = incremental table_schema = self._create_table_schema(resolved_template, self.name) migrate_complex_types(table_schema, warn=True) validate_dict_ignoring_xkeys( @@ -221,7 +227,7 @@ def apply_hints( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - incremental: Incremental[Any] = None, + incremental: TIncrementalConfig = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, additional_table_hints: Optional[Dict[str, TTableHintTemplate[Any]]] = None, table_format: TTableHintTemplate[TTableFormat] = None, @@ -360,7 +366,7 @@ def apply_hints( # set properties that can't be passed to make_hints if incremental is not None: - t["incremental"] = incremental + t["incremental"] = Incremental.ensure_instance(incremental) self._set_hints(t, create_table_variant) return self @@ -506,6 +512,22 @@ def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: "row_key": False, } + @staticmethod + def _merge_incremental_column_hint(dict_: Dict[str, Any]) -> None: + incremental = dict_.pop("incremental") + if incremental is None: + return + col_name = incremental.get_cursor_column_name() + if not col_name: + # cursor cannot resolve to a single column, no hint added + return + incremental_col = dict_["columns"].get(col_name) + if not incremental_col: + incremental_col = {"name": col_name} + + incremental_col["incremental"] = True + dict_["columns"][col_name] = incremental_col + @staticmethod def _create_table_schema(resource_hints: TResourceHints, resource_name: str) -> TTableSchema: """Creates table schema from resource hints and resource name. 
Resource hints are resolved @@ -518,6 +540,8 @@ def _create_table_schema(resource_hints: TResourceHints, resource_name: str) -> "disposition": resource_hints["write_disposition"] } # wrap in dict DltResourceHints._merge_write_disposition_dict(resource_hints) # type: ignore[arg-type] + if "incremental" in resource_hints: + DltResourceHints._merge_incremental_column_hint(resource_hints) # type: ignore[arg-type] dict_ = cast(TTableSchema, resource_hints) dict_["resource"] = resource_name return dict_ diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 69af0d68a6..28d33bb71f 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -1,6 +1,6 @@ import os from datetime import datetime # noqa: I251 -from typing import Generic, ClassVar, Any, Optional, Type, Dict, Union +from typing import Generic, ClassVar, Any, Optional, Type, Dict, Union, Literal, Tuple from typing_extensions import get_args import inspect @@ -9,7 +9,7 @@ from dlt.common import logger from dlt.common.exceptions import MissingDependencyException from dlt.common.pendulum import pendulum -from dlt.common.jsonpath import compile_path +from dlt.common.jsonpath import compile_path, extract_simple_field_name from dlt.common.typing import ( TDataItem, TDataItems, @@ -19,8 +19,8 @@ get_generic_type_argument_from_instance, is_optional_type, is_subclass, + TColumnNames, ) -from dlt.common.schema.typing import TColumnNames from dlt.common.configuration import configspec, ConfigurationValueError from dlt.common.configuration.specs import BaseConfiguration from dlt.common.pipeline import resource_state @@ -29,17 +29,19 @@ coerce_value, py_type_to_sc_type, ) +from dlt.common.utils import without_none from dlt.extract.exceptions import IncrementalUnboundError from dlt.extract.incremental.exceptions import ( IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing, ) -from dlt.extract.incremental.typing import ( +from dlt.common.incremental.typing import ( IncrementalColumnState, TCursorValue, LastValueFunc, OnCursorValueMissing, + IncrementalArgs, ) from dlt.extract.items import SupportsPipe, TTableHintTemplate, ItemTransform from dlt.extract.incremental.transform import ( @@ -123,7 +125,7 @@ def __init__( self, cursor_path: str = None, initial_value: Optional[TCursorValue] = None, - last_value_func: Optional[LastValueFunc[TCursorValue]] = max, + last_value_func: Optional[Union[LastValueFunc[TCursorValue], Literal["min", "max"]]] = max, primary_key: Optional[TTableHintTemplate[TColumnNames]] = None, end_value: Optional[TCursorValue] = None, row_order: Optional[TSortOrder] = None, @@ -135,6 +137,16 @@ def __init__( if cursor_path: compile_path(cursor_path) self.cursor_path = cursor_path + if isinstance(last_value_func, str): + if last_value_func == "min": + last_value_func = min + elif last_value_func == "max": + last_value_func = max + else: + raise ValueError( + f"Unknown last_value_func '{last_value_func}' passed as string. Provide a" + " callable to use a custom function." 
+ ) self.last_value_func = last_value_func self.initial_value = initial_value """Initial value of last_value""" @@ -247,6 +259,10 @@ def copy(self) -> "Incremental[TCursorValue]": # merge creates a copy return self.merge(self) + def get_cursor_column_name(self) -> Optional[str]: + """Return the name of the cursor column if the cursor path resolves to a single column""" + return extract_simple_field_name(self.cursor_path) + def on_resolved(self) -> None: compile_path(self.cursor_path) if self.end_value is not None and self.initial_value is None: @@ -491,6 +507,12 @@ def can_close(self) -> bool: and self.start_out_of_range ) + @classmethod + def ensure_instance(cls, value: "TIncrementalConfig") -> "Incremental[TCursorValue]": + if isinstance(value, Incremental): + return value + return cls(**value) + def __str__(self) -> str: return ( f"Incremental at 0x{id(self):x} for resource {self.resource_name} with cursor path:" @@ -511,7 +533,6 @@ def _get_transformer(self, items: TDataItems) -> IncrementalTransform: def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: if rows is None: return rows - transformer = self._get_transformer(rows) if isinstance(rows, list): rows = [ @@ -556,6 +577,8 @@ def _check_duplicate_cursor_threshold( Incremental.EMPTY = Incremental[Any]() Incremental.EMPTY.__is_resolved__ = True +TIncrementalConfig = Union[Incremental[Any], IncrementalArgs] + class IncrementalResourceWrapper(ItemTransform[TDataItem]): placement_affinity: ClassVar[float] = 1 # stick to end @@ -595,6 +618,34 @@ def get_incremental_arg(sig: inspect.Signature) -> Optional[inspect.Parameter]: break return incremental_param + @staticmethod + def inject_implicit_incremental_arg( + incremental: Optional[Union[Incremental[Any], "IncrementalResourceWrapper"]], + sig: inspect.Signature, + func_args: Tuple[Any], + func_kwargs: Dict[str, Any], + fallback: Optional[Incremental[Any]] = None, + ) -> Tuple[Tuple[Any], Dict[str, Any], Optional[Incremental[Any]]]: + """Inject the incremental instance into function arguments + if the function has an incremental argument without default in its signature and it is not already set in the arguments. + + Returns: + Tuple of the new args, kwargs and the incremental instance that was injected (if any) + """ + if isinstance(incremental, IncrementalResourceWrapper): + incremental = incremental.incremental + if not incremental: + if not fallback: + return func_args, func_kwargs, None + incremental = fallback + incremental_param = IncrementalResourceWrapper.get_incremental_arg(sig) + if incremental_param: + bound_args = sig.bind_partial(*func_args, **func_kwargs) + if not bound_args.arguments.get(incremental_param.name): + bound_args.arguments[incremental_param.name] = incremental + return bound_args.args, bound_args.kwargs, incremental + return func_args, func_kwargs, None + def wrap(self, sig: inspect.Signature, func: TFun) -> TFun: """Wrap the callable to inject an `Incremental` object configured for the resource.""" incremental_param = self.get_incremental_arg(sig) @@ -666,12 +717,14 @@ def incremental(self) -> Optional[Incremental[Any]]: return self._incremental def set_incremental( - self, incremental: Optional[Incremental[Any]], from_hints: bool = False + self, incremental: Optional[TIncrementalConfig], from_hints: bool = False ) -> None: """Sets the incremental. 
If incremental was set from_hints, it can only be changed in the same manner""" if self._from_hints and not from_hints: # do not accept incremental if apply hints were used return + if incremental is not None: + incremental = Incremental.ensure_instance(incremental) self._from_hints = from_hints self._incremental = incremental @@ -710,6 +763,12 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: return self._incremental(item, meta) +def incremental_config_to_instance(cfg: TIncrementalConfig) -> Incremental[Any]: + if isinstance(cfg, Incremental): + return cfg + return Incremental(**cfg) + + __all__ = [ "Incremental", "IncrementalResourceWrapper", @@ -717,6 +776,7 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: "IncrementalCursorPathMissing", "IncrementalPrimaryKeyMissing", "IncrementalUnboundError", + "TIncrementalConfig", "LastValueFunc", "TCursorValue", ] diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 842c8aebe8..22b1194b51 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -5,7 +5,7 @@ from dlt.common.utils import digest128 from dlt.common.json import json from dlt.common.pendulum import pendulum -from dlt.common.typing import TDataItem +from dlt.common.typing import TDataItem, TColumnNames from dlt.common.jsonpath import find_values, compile_path, extract_simple_field_name from dlt.extract.incremental.exceptions import ( IncrementalCursorInvalidCoercion, @@ -13,10 +13,9 @@ IncrementalPrimaryKeyMissing, IncrementalCursorPathHasValueNone, ) -from dlt.extract.incremental.typing import TCursorValue, LastValueFunc, OnCursorValueMissing +from dlt.common.incremental.typing import TCursorValue, LastValueFunc, OnCursorValueMissing from dlt.extract.utils import resolve_column_value from dlt.extract.items import TTableHintTemplate -from dlt.common.schema.typing import TColumnNames try: from dlt.common.libs import pyarrow diff --git a/dlt/extract/items.py b/dlt/extract/items.py index d721e8094e..888787e6b7 100644 --- a/dlt/extract/items.py +++ b/dlt/extract/items.py @@ -19,7 +19,14 @@ ) from concurrent.futures import Future -from dlt.common.typing import TAny, TDataItem, TDataItems +from dlt.common.typing import ( + TAny, + TDataItem, + TDataItems, + TTableHintTemplate, + TFunHintTemplate, + TDynHintType, +) TDecompositionStrategy = Literal["none", "scc"] @@ -27,9 +34,6 @@ TAwaitableDataItems = Awaitable[TDataItems] TPipedDataItems = Union[TDataItems, TDeferredDataItems, TAwaitableDataItems] -TDynHintType = TypeVar("TDynHintType") -TFunHintTemplate = Callable[[TDataItem], TDynHintType] -TTableHintTemplate = Union[TDynHintType, TFunHintTemplate[TDynHintType]] if TYPE_CHECKING: TItemFuture = Future[TPipedDataItems] diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index c6ca1660f4..42e3905162 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -11,6 +11,7 @@ Union, Any, Optional, + Mapping, ) from typing_extensions import TypeVar, Self @@ -28,6 +29,7 @@ pipeline_state, ) from dlt.common.utils import flatten_list_or_items, get_callable_name, uniq_id +from dlt.common.schema.typing import TTableSchema from dlt.extract.utils import wrap_async_iterator, wrap_parallel_iterator from dlt.extract.items import ( @@ -42,7 +44,7 @@ ) from dlt.extract.pipe_iterator import ManagedPipeIterator from dlt.extract.pipe import Pipe, TPipeStep -from dlt.extract.hints import DltResourceHints, HintsMeta, TResourceHints +from
dlt.extract.hints import DltResourceHints, HintsMeta, TResourceHints, make_hints from dlt.extract.incremental import Incremental, IncrementalResourceWrapper from dlt.extract.exceptions import ( InvalidTransformerDataTypeGeneratorFunctionRequired, @@ -442,35 +444,60 @@ def add_step( self._pipe.insert_step(item_transform, insert_at) return self + def _remove_incremental_step(self) -> None: + step_no = self._pipe.find(Incremental, IncrementalResourceWrapper) + if step_no >= 0: + self._pipe.remove_step(step_no) + + def set_incremental( + self, + new_incremental: Union[Incremental[Any], IncrementalResourceWrapper], + from_hints: bool = False, + ) -> Optional[Union[Incremental[Any], IncrementalResourceWrapper]]: + """Set/replace the incremental transform for the resource. + + Args: + new_incremental: The Incremental instance/hint to set or replace + from_hints: If the incremental is set from hints. Defaults to False. + """ + if new_incremental is Incremental.EMPTY: + new_incremental = None + incremental = self.incremental + if incremental is not None: + # if isinstance(new_incremental, Mapping): + # new_incremental = Incremental.ensure_instance(new_incremental) + + if isinstance(new_incremental, IncrementalResourceWrapper): + # Completely replace the wrapper + self._remove_incremental_step() + self.add_step(new_incremental) + elif isinstance(incremental, IncrementalResourceWrapper): + incremental.set_incremental(new_incremental, from_hints=from_hints) + else: + self._remove_incremental_step() + # re-add the step + incremental = None + if incremental is None: + # if there's no wrapper add incremental as a transform + if new_incremental: + if not isinstance(new_incremental, IncrementalResourceWrapper): + new_incremental = Incremental.ensure_instance(new_incremental) + self.add_step(new_incremental) + return new_incremental + def _set_hints( self, table_schema_template: TResourceHints, create_table_variant: bool = False ) -> None: super()._set_hints(table_schema_template, create_table_variant) # validators and incremental apply only to resource hints if not create_table_variant: - incremental = self.incremental # try to late assign incremental if table_schema_template.get("incremental") is not None: - new_incremental = table_schema_template["incremental"] - # remove incremental if empty - if new_incremental is Incremental.EMPTY: - new_incremental = None - - if incremental is not None: - if isinstance(incremental, IncrementalResourceWrapper): - # replace in wrapper - incremental.set_incremental(new_incremental, from_hints=True) - else: - step_no = self._pipe.find(Incremental) - self._pipe.remove_step(step_no) - # re-add the step - incremental = None - - if incremental is None: - # if there's no wrapper add incremental as a transform - incremental = new_incremental # type: ignore - if new_incremental: - self.add_step(new_incremental) + incremental = self.set_incremental( + table_schema_template["incremental"], from_hints=True + ) + else: + incremental = self.incremental if incremental: primary_key = table_schema_template.get("primary_key", incremental.primary_key) @@ -480,10 +507,25 @@ def _set_hints( if table_schema_template.get("validator") is not None: self.validator = table_schema_template["validator"] + def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTableSchema: + incremental: Optional[Union[Incremental[Any], IncrementalResourceWrapper]] = ( + self.incremental + ) + if incremental and "incremental" not in self._hints: + if isinstance(incremental, 
IncrementalResourceWrapper): + incremental = incremental.incremental + if incremental: + self._hints["incremental"] = incremental + + table_schema = super().compute_table_schema(item, meta) + + return table_schema + def bind(self: TDltResourceImpl, *args: Any, **kwargs: Any) -> TDltResourceImpl: """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" if self._args_bound: raise TypeError(f"Parametrized resource {self.name} is not callable") + orig_gen = self._pipe.gen gen = self._pipe.bind_gen(*args, **kwargs) if isinstance(gen, DltResource): @@ -599,14 +641,14 @@ def _eject_config(self) -> bool: if not self._pipe.is_empty and not self._args_bound: orig_gen = getattr(self._pipe.gen, "__GEN__", None) if orig_gen: - step_no = self._pipe.find(IncrementalResourceWrapper) - if step_no >= 0: - self._pipe.remove_step(step_no) + self._remove_incremental_step() self._pipe.replace_gen(orig_gen) return True return False - def _inject_config(self) -> "DltResource": + def _inject_config( + self, incremental_from_hints_override: Optional[bool] = None + ) -> "DltResource": """Wraps the pipe generation step in incremental and config injection wrappers and adds pipe step with Incremental transform. """ @@ -618,8 +660,17 @@ def _inject_config(self) -> "DltResource": sig = inspect.signature(gen) if IncrementalResourceWrapper.should_wrap(sig): incremental = IncrementalResourceWrapper(self._hints.get("primary_key")) + if incr_hint := self._hints.get("incremental"): + incremental.set_incremental( + incr_hint, + from_hints=( + incremental_from_hints_override + if incremental_from_hints_override is not None + else True + ), + ) incr_f = incremental.wrap(sig, gen) - self.add_step(incremental) + self.set_incremental(incremental) else: incr_f = gen resource_sections = (known_sections.SOURCES, self.section, self.name) @@ -649,6 +700,12 @@ def _clone( if self._pipe and not self._pipe.is_empty: pipe = pipe._clone(new_name=new_name, with_parent=with_parent) # incremental and parent are already in the pipe (if any) + + incremental = self.incremental + if isinstance(incremental, IncrementalResourceWrapper): + incremental_from_hints: Optional[bool] = incremental._from_hints + else: + incremental_from_hints = None r_ = self.__class__( pipe, self._clone_hints(self._hints), @@ -661,7 +718,7 @@ def _clone( # this makes sure that a take config values from a right section and wrapper has a separated # instance in the pipeline if r_._eject_config(): - r_._inject_config() + r_._inject_config(incremental_from_hints_override=incremental_from_hints) return r_ def _get_config_section_context(self) -> ConfigSectionContext: diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 55a8b0b8c4..68570d0995 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -22,8 +22,15 @@ from dlt.common.data_writers import TDataItemFormat from dlt.common.exceptions import MissingDependencyException from dlt.common.pipeline import reset_resource_state -from dlt.common.schema.typing import TColumnNames, TAnySchemaColumns, TTableSchemaColumns -from dlt.common.typing import AnyFun, DictStrAny, TDataItem, TDataItems, TAnyFunOrGenerator +from dlt.common.schema.typing import TAnySchemaColumns, TTableSchemaColumns +from dlt.common.typing import ( + AnyFun, + DictStrAny, + TDataItem, + TDataItems, + TAnyFunOrGenerator, + TColumnNames, +) from dlt.common.utils import get_callable_name from dlt.extract.exceptions import ( diff --git a/dlt/pipeline/pipeline.py 
b/dlt/pipeline/pipeline.py index a9f07d417e..70d160ea67 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -38,7 +38,6 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.runtime import signals, apply_runtime_config from dlt.common.schema.typing import ( - TColumnNames, TSchemaTables, TTableFormat, TWriteDispositionConfig, @@ -47,7 +46,7 @@ ) from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.exceptions import LoadPackageNotFound -from dlt.common.typing import ConfigValue, TFun, TSecretStrValue, is_optional_type +from dlt.common.typing import ConfigValue, TFun, TSecretStrValue, is_optional_type, TColumnNames from dlt.common.runners import pool_runner as runner from dlt.common.storages import ( LiveSchemaStorage, diff --git a/dlt/sources/rest_api/typing.py b/dlt/sources/rest_api/typing.py index ccef828b1a..c48e54de4a 100644 --- a/dlt/sources/rest_api/typing.py +++ b/dlt/sources/rest_api/typing.py @@ -15,7 +15,7 @@ from dlt.common.schema.typing import ( TAnySchemaColumns, ) -from dlt.extract.incremental.typing import IncrementalArgs +from dlt.common.incremental.typing import IncrementalArgs from dlt.extract.items import TTableHintTemplate from dlt.extract.hints import TResourceHintsBase from dlt.sources.helpers.rest_client.auth import AuthConfigBase, TApiKeyLocation @@ -23,9 +23,8 @@ from dataclasses import dataclass, field from dlt.common import jsonpath -from dlt.common.typing import TSortOrder +from dlt.common.typing import TSortOrder, TColumnNames from dlt.common.schema.typing import ( - TColumnNames, TTableFormat, TAnySchemaColumns, TWriteDispositionConfig, @@ -33,7 +32,7 @@ ) from dlt.extract.items import TTableHintTemplate -from dlt.extract.incremental.typing import LastValueFunc +from dlt.common.incremental.typing import LastValueFunc from dlt.extract.resource import DltResource from requests import Session diff --git a/tests/common/test_jsonpath.py b/tests/common/test_jsonpath.py new file mode 100644 index 0000000000..c4e9fbc664 --- /dev/null +++ b/tests/common/test_jsonpath.py @@ -0,0 +1,43 @@ +import pytest + +from dlt.common import jsonpath as jp + + +@pytest.mark.parametrize("compiled", [True, False]) +@pytest.mark.parametrize( + "path, expected", + [ + ("col_a", "col_a"), + ("'col.a'", "col.a"), + ("'$col_a'", "$col_a"), + ("'col|a'", "col|a"), + ], +) +def test_extract_simple_field_name_positive(path, expected, compiled): + if compiled: + path = jp.compile_path(path) + + result = jp.extract_simple_field_name(path) + assert result == expected + + +@pytest.mark.parametrize("compiled", [True, False]) +@pytest.mark.parametrize( + "path", + [ + "$.col_a", + "$.col_a.items", + "$.col_a.items[0]", + "$.col_a.items[*]", + "col_a|col_b", + ], +) +def test_extract_simple_field_name_negative(path, compiled): + if compiled: + path = jp.compile_path(path) + + result = jp.extract_simple_field_name(path) + assert result is None + + +# TODO: Test all jsonpath utils diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index 3f8ccfc20f..f3ebb02b46 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -19,13 +19,13 @@ from dlt.common import Decimal, jsonpath from dlt.common.exceptions import DictValidationException from dlt.common.schema.typing import ( - TColumnNames, TStoredSchema, TColumnSchema, TWriteDispositionConfig, ) from dlt.common.schema.utils import simple_regex_validator -from dlt.common.typing import DictStrStr, StrStr, TDataItem, TSortOrder 
+from dlt.common.typing import DictStrStr, StrStr, TDataItem, TSortOrder, TColumnNames + from dlt.common.validation import validate_dict, validate_dict_ignoring_xkeys diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index dbec417f97..9343449aed 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -213,6 +213,48 @@ def with_table_hints(): extract_step.extract(source, 20, 1) +def test_extract_hints_mark_incremental(extract_step: Extract) -> None: + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "TRUE" + + @dlt.resource(columns=[{"name": "id", "data_type": "bigint"}], primary_key="id") + def with_table_hints(): + # yield a regular dataset first, simulate backfil + yield [{"id": id_, "pk": "A"} for id_ in range(1, 10)] + + # get the resource + resource = dlt.current.source().resources[dlt.current.resource_name()] + table = resource.compute_table_schema() + # also there we see the hints + assert table["columns"]["id"]["primary_key"] is True + assert table["columns"]["id"]["data_type"] == "bigint" + + # start emitting incremental + yield dlt.mark.with_hints( + [{"id": id_, "pk": "A", "created_at": id_ + 10} for id_ in range(100, 110)], + make_hints(incremental=dlt.sources.incremental("created_at", initial_value=105)), + ) + + # get the resource + resource = dlt.current.source().resources[dlt.current.resource_name()] + assert resource.incremental.cursor_path == "created_at" # type: ignore[attr-defined] + assert resource.incremental.primary_key == "id" + # we are able to add the incremental to the pipe. but it won't + # join actually executing pipe which is a clone of a (partial) pipe of the resource + assert isinstance(resource._pipe._steps[1], dlt.sources.incremental) + # NOTE: this results in unbounded exception + # assert resource.incremental.last_value == 299 + table = resource.compute_table_schema() + assert table["columns"]["created_at"]["incremental"] is not None + + yield [{"id": id_, "pk": "A", "created_at": id_ + 10} for id_ in range(110, 120)] + + source = DltSource(dlt.Schema("hintable"), "module", [with_table_hints]) + extract_step.extract(source, 20, 1) + # make sure incremental is in the source schema + table = source.schema.get_table("with_table_hints") + assert table["columns"]["created_at"]["incremental"] is not None + + def test_extract_metrics_on_exception_no_flush(extract_step: Extract) -> None: @dlt.resource def letters(): diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 7ce4228b6c..30df12ae17 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -5,7 +5,7 @@ from datetime import datetime, date # noqa: I251 from itertools import chain, count from time import sleep -from typing import Any, Optional +from typing import Any, Optional, Literal, Sequence, Dict from unittest import mock import duckdb @@ -1468,10 +1468,13 @@ def test_apply_hints_incremental(item_type: TestDataItemFormat) -> None: data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] source_items = data_to_item_format(item_type, data) + should_have_arg = True + @dlt.resource def some_data(created_at: Optional[dlt.sources.incremental[int]] = None): # make sure that incremental from apply_hints is here - if created_at is not None: + if should_have_arg: + assert created_at is not None assert created_at.cursor_path == "created_at" assert created_at.last_value_func is max yield source_items @@ -1505,6 +1508,7 @@ def some_data(created_at: Optional[dlt.sources.incremental[int]] = 
None): assert list(r) == [] # remove incremental + should_have_arg = False r.apply_hints(incremental=dlt.sources.incremental.EMPTY) assert r.incremental is not None assert r.incremental.incremental is None @@ -1515,6 +1519,7 @@ def some_data(created_at: Optional[dlt.sources.incremental[int]] = None): # as above but we provide explicit incremental when creating resource p = p.drop() + should_have_arg = True r = some_data(created_at=dlt.sources.incremental("created_at", last_value_func=min)) # hints have precedence, as expected r.apply_hints(incremental=dlt.sources.incremental("created_at", last_value_func=max)) @@ -3568,3 +3573,223 @@ def some_data( call for call in logger_spy.call_args_list if "Large number of records" in call.args[0] ] assert len(warning_calls) == 1 + + +def _resource_for_table_hint( + hint_type: Literal[ + "default_arg", "explicit_arg", "apply_hints", "default_arg_override", "decorator" + ], + data: Sequence[Dict[str, Any]], + incremental_arg: dlt.sources.incremental[Any], + incremental_arg_default: dlt.sources.incremental[Any] = None, +) -> DltResource: + if incremental_arg is None and incremental_arg_default is None: + raise ValueError("One of the incremental arguments must be provided.") + + decorator_arg = None + if hint_type == "default_arg": + default_arg = incremental_arg_default + override_arg = None + elif hint_type == "default_arg_override": + default_arg = incremental_arg_default + override_arg = incremental_arg + elif hint_type == "decorator": + default_arg = None + override_arg = None + decorator_arg = incremental_arg_default + else: + default_arg = None + override_arg = incremental_arg + + @dlt.resource(incremental=decorator_arg) + def some_data( + updated_at: dlt.sources.incremental[Any] = default_arg, + ) -> Any: + yield data_to_item_format("object", data) + + if override_arg is None: + return some_data() + + if hint_type == "apply_hints": + rs = some_data() + rs.apply_hints(incremental=override_arg) + return rs + + return some_data(updated_at=override_arg) + + +@pytest.mark.parametrize( + "hint_type", ["default_arg", "explicit_arg", "apply_hints", "default_arg_override", "decorator"] +) +@pytest.mark.parametrize( + "incremental_settings", + [ + { + "last_value_func": "min", + "row_order": "desc", + "on_cursor_value_missing": "include", + }, + {"last_value_func": "max", "on_cursor_value_missing": "raise"}, + ], +) +def test_incremental_table_hint_datetime_column( + hint_type: Literal[ + "default_arg", + "explicit_arg", + "default_arg_override", + "apply_hints", + "decorator", + ], + incremental_settings: Dict[str, Any], +) -> None: + initial_value_override = pendulum.now() + initial_value_default = pendulum.now().subtract(seconds=10) + rs = _resource_for_table_hint( + hint_type, + [{"updated_at": pendulum.now().add(seconds=i)} for i in range(1, 12)], + dlt.sources.incremental( + "updated_at", initial_value=initial_value_override, **incremental_settings + ), + dlt.sources.incremental( + "updated_at", initial_value=initial_value_default, **incremental_settings + ), + ) + + pipeline = dlt.pipeline(pipeline_name=uniq_id()) + pipeline.extract(rs) + + table_schema = pipeline.default_schema.tables["some_data"] + + assert table_schema["columns"]["updated_at"]["incremental"] is True + + +def incremental_instance_or_dict(use_dict: bool, **kwargs): + if use_dict: + return kwargs + return dlt.sources.incremental(**kwargs) + + +@pytest.mark.parametrize("use_dict", [False, True]) +def test_incremental_in_resource_decorator(use_dict: bool) -> None: + # Incremental set 
in decorator, without any arguments + @dlt.resource( + incremental=incremental_instance_or_dict( + use_dict, cursor_path="value", initial_value=5, last_value_func=min + ) + ) + def no_incremental_arg(): + yield [{"value": i} for i in range(10)] + + result = list(no_incremental_arg()) + # filtering is applied + assert result == [{"value": i} for i in range(0, 6)] + + # Apply hints overrides the decorator settings + rs = no_incremental_arg() + rs.apply_hints( + incremental=incremental_instance_or_dict( + use_dict, cursor_path="value", initial_value=3, last_value_func=max + ) + ) + result = list(rs) + assert result == [{"value": i} for i in range(3, 10)] + + @dlt.resource( + incremental=incremental_instance_or_dict( + use_dict, cursor_path="value", initial_value=5, last_value_func=min + ) + ) + def with_optional_incremental_arg(incremental: Optional[dlt.sources.incremental[int]] = None): + assert incremental is not None + yield [{"value": i} for i in range(10)] + + # Decorator settings are used + result = list(with_optional_incremental_arg()) + assert result == [{"value": i} for i in range(0, 6)] + + +@pytest.mark.parametrize("use_dict", [False, True]) +def test_incremental_in_resource_decorator_default_arg(use_dict: bool) -> None: + @dlt.resource( + incremental=incremental_instance_or_dict( + use_dict, cursor_path="value", initial_value=5, last_value_func=min + ) + ) + def with_default_incremental_arg( + incremental: dlt.sources.incremental[int] = dlt.sources.incremental( + "value", initial_value=3, last_value_func=min + ) + ): + assert incremental.last_value == initial_value + assert incremental.last_value_func == last_value_func + yield [{"value": i} for i in range(10)] + + last_value_func = max + initial_value = 4 + # Explicit argument overrides the default and decorator argument + result = list( + with_default_incremental_arg( + incremental=dlt.sources.incremental( + "value", initial_value=initial_value, last_value_func=last_value_func + ) + ) + ) + assert result == [{"value": i} for i in range(4, 10)] + + # Decorator param overrides function default arg + last_value_func = min + initial_value = 5 + result = list(with_default_incremental_arg()) + assert result == [{"value": i} for i in range(0, 6)] + + +@pytest.mark.parametrize("use_dict", [False, True]) +def test_incremental_table_hint_merged_columns(use_dict: bool) -> None: + @dlt.resource( + incremental=incremental_instance_or_dict( + use_dict, cursor_path="col_a", initial_value=3, last_value_func=min + ) + ) + def some_data(): + yield [{"col_a": i, "foo": i + 2, "col_b": i + 1, "bar": i + 3} for i in range(10)] + + pipeline = dlt.pipeline(pipeline_name=uniq_id()) + pipeline.extract(some_data()) + + table_schema = pipeline.default_schema.tables["some_data"] + assert table_schema["columns"]["col_a"]["incremental"] is True + + rs = some_data() + rs.apply_hints( + incremental=incremental_instance_or_dict( + use_dict, cursor_path="col_b", initial_value=5, last_value_func=max + ) + ) + + pipeline.extract(rs) + + table_schema_2 = pipeline.default_schema.tables["some_data"] + + # Only one column should have the hint + assert "incremental" not in table_schema_2["columns"]["col_a"] + assert table_schema_2["columns"]["col_b"]["incremental"] is True + + +@pytest.mark.parametrize("use_dict", [True, False]) +def test_incremental_column_hint_cursor_is_not_column(use_dict: bool): + @dlt.resource( + incremental=incremental_instance_or_dict( + use_dict, cursor_path="col_a|col_b", initial_value=3, last_value_func=min + ) + ) + def some_data(): + yield 
[{"col_a": i, "foo": i + 2, "col_b": i + 1, "bar": i + 3} for i in range(10)] + + pipeline = dlt.pipeline(pipeline_name=uniq_id()) + + pipeline.extract(some_data()) + + table_schema = pipeline.default_schema.tables["some_data"] + + for col in table_schema["columns"].values(): + assert "incremental" not in col From f4faa836df37cf810b2eb5b8ba754aa80f946719 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Mon, 2 Dec 2024 16:24:57 +0100 Subject: [PATCH 3/4] #2087 allows double underscores in identifiers (#2098) * removes astunparse and aiohttp * allows for built-in ast unparse if present * uses break path for normalization to allow names containing path separators, migrates old schema to enable compat mode with old behavior * adds removeprefix util * updates docs * bumps dlt to version 1.4.1 * linter fixes * fixes tests * fixes and tests saving pandas indexes * fixes sqllite read interface tests * updates docs --- dlt/cli/deploy_command_helpers.py | 13 +- dlt/cli/source_detection.py | 5 +- dlt/common/destination/reference.py | 1 - dlt/common/libs/pandas.py | 5 +- dlt/common/normalizers/json/helpers.py | 141 + dlt/common/normalizers/json/relational.py | 172 +- dlt/common/normalizers/naming/naming.py | 2 + dlt/common/normalizers/typing.py | 2 + dlt/common/reflection/utils.py | 14 +- dlt/common/schema/configuration.py | 2 + dlt/common/schema/migrations.py | 7 +- dlt/common/schema/normalizers.py | 7 +- dlt/common/schema/schema.py | 26 +- dlt/common/schema/typing.py | 2 +- dlt/common/utils.py | 5 + dlt/destinations/dataset.py | 6 +- .../impl/clickhouse/sql_client.py | 6 +- .../impl/filesystem/filesystem.py | 3 +- dlt/extract/extractors.py | 14 +- dlt/normalize/worker.py | 5 +- dlt/reflection/script_visitor.py | 9 +- dlt/sources/sql_database/arrow_helpers.py | 5 +- .../dlt-ecosystem/destinations/filesystem.md | 2 +- .../verified-sources/arrow-pandas.md | 2 + .../docs/general-usage/naming-convention.md | 39 + mypy.ini | 2 +- poetry.lock | 110 +- pyproject.toml | 5 +- .../cases/schemas/eth/ethereum_schema_v11.yml | 394 +++ .../cases/schemas/github/issues.schema.json | 2404 ++++++++--------- .../normalizers/test_json_relational.py | 10 +- .../normalizers/test_naming_snake_case.py | 8 + .../common/schema/test_import_normalizers.py | 36 +- .../schema/test_normalize_identifiers.py | 62 +- tests/common/schema/test_schema.py | 20 +- tests/common/schema/test_versioning.py | 12 +- tests/common/storages/test_schema_storage.py | 12 +- tests/common/storages/utils.py | 4 +- tests/common/test_utils.py | 9 + tests/common/test_validation.py | 2 +- tests/common/utils.py | 6 +- .../cases/eth_source/ethereum.schema.yaml | 4 +- tests/extract/test_decorators.py | 4 +- tests/extract/test_incremental.py | 76 +- tests/libs/pyarrow/test_pyarrow_normalizer.py | 4 +- .../test_clickhouse_configuration.py | 26 +- tests/load/conftest.py | 2 +- tests/load/duckdb/test_duckdb_client.py | 2 +- tests/load/filesystem/test_aws_credentials.py | 1 - .../load/filesystem/test_filesystem_common.py | 1 - tests/load/pipeline/conftest.py | 2 +- tests/load/pipeline/test_merge_disposition.py | 2 +- tests/load/pipeline/test_scd2.py | 3 +- tests/load/qdrant/utils.py | 1 - tests/load/redshift/test_redshift_client.py | 2 +- tests/load/test_job_client.py | 2 +- tests/load/test_read_interfaces.py | 11 +- tests/load/test_sql_client.py | 2 +- tests/load/weaviate/utils.py | 1 - .../cases/github_pipeline/github_rev.py | 26 + tests/pipeline/test_dlt_versions.py | 56 + .../test_max_nesting.py | 0 tests/pipeline/test_pipeline.py | 105 + 63 files changed, 2203 
insertions(+), 1721 deletions(-) create mode 100644 dlt/common/normalizers/json/helpers.py create mode 100644 tests/common/cases/schemas/eth/ethereum_schema_v11.yml create mode 100644 tests/pipeline/cases/github_pipeline/github_rev.py rename tests/{normalize => pipeline}/test_max_nesting.py (100%) diff --git a/dlt/cli/deploy_command_helpers.py b/dlt/cli/deploy_command_helpers.py index b508b32226..e3719fbe38 100644 --- a/dlt/cli/deploy_command_helpers.py +++ b/dlt/cli/deploy_command_helpers.py @@ -5,7 +5,6 @@ from yaml import Dumper from itertools import chain from typing import List, Optional, Sequence, Tuple, Any, Dict -from astunparse import unparse # optional dependencies import pipdeptree @@ -23,7 +22,7 @@ from dlt.common.git import get_origin, get_repo, Repo from dlt.common.configuration.specs.runtime_configuration import get_default_pipeline_name from dlt.common.typing import StrAny -from dlt.common.reflection.utils import evaluate_node_literal +from dlt.common.reflection.utils import evaluate_node_literal, ast_unparse from dlt.common.pipeline import LoadInfo, TPipelineState, get_dlt_repos_dir from dlt.common.storages import FileStorage from dlt.common.utils import set_working_dir @@ -313,7 +312,7 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio if f_r_value is None: fmt.warning( "The value of `dev_mode` in call to `dlt.pipeline` cannot be" - f" determined from {unparse(f_r_node).strip()}. We assume that you know" + f" determined from {ast_unparse(f_r_node).strip()}. We assume that you know" " what you are doing :)" ) if f_r_value is True: @@ -331,8 +330,8 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio raise CliCommandInnerException( "deploy", "The value of 'pipelines_dir' argument in call to `dlt_pipeline` cannot be" - f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" - " be found. Pass it directly with --pipelines-dir option.", + f" determined from {ast_unparse(p_d_node).strip()}. Pipeline working dir" + " will be found. Pass it directly with --pipelines-dir option.", ) p_n_node = call_args.arguments.get("pipeline_name") @@ -342,8 +341,8 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio raise CliCommandInnerException( "deploy", "The value of 'pipeline_name' argument in call to `dlt_pipeline` cannot be" - f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" - " be found. Pass it directly with --pipeline-name option.", + f" determined from {ast_unparse(p_d_node).strip()}. Pipeline working dir" + " will be found. 
Pass it directly with --pipeline-name option.", ) pipelines.append((pipeline_name, pipelines_dir)) diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index f4e9b3e050..7067f8b896 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -1,11 +1,10 @@ import ast import inspect -from astunparse import unparse from typing import Dict, Tuple, Set, List from dlt.common.configuration import is_secret_hint from dlt.common.configuration.specs import BaseConfiguration -from dlt.common.reflection.utils import creates_func_def_name_node +from dlt.common.reflection.utils import creates_func_def_name_node, ast_unparse from dlt.common.typing import is_optional_type from dlt.sources import SourceReference @@ -65,7 +64,7 @@ def find_source_calls_to_replace( for calls in visitor.known_sources_resources_calls.values(): for call in calls: transformed_nodes.append( - (call.func, ast.Name(id=pipeline_name + "_" + unparse(call.func))) + (call.func, ast.Name(id=pipeline_name + "_" + ast_unparse(call.func))) ) return transformed_nodes diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index d1024eb28c..e27f99cde7 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -81,7 +81,6 @@ DataFrame = Any ArrowTable = Any IbisBackend = Any - else: DataFrame = Any ArrowTable = Any diff --git a/dlt/common/libs/pandas.py b/dlt/common/libs/pandas.py index a165ea8747..35cfe623bb 100644 --- a/dlt/common/libs/pandas.py +++ b/dlt/common/libs/pandas.py @@ -8,8 +8,9 @@ raise MissingDependencyException("dlt Pandas Helpers", ["pandas"]) -def pandas_to_arrow(df: pandas.DataFrame) -> Any: +def pandas_to_arrow(df: pandas.DataFrame, preserve_index: bool = False) -> Any: """Converts pandas to arrow or raises an exception if pyarrow is not installed""" from dlt.common.libs.pyarrow import pyarrow as pa - return pa.Table.from_pandas(df) + # NOTE: None preserves named indexes but ignores unnamed + return pa.Table.from_pandas(df, preserve_index=preserve_index) diff --git a/dlt/common/normalizers/json/helpers.py b/dlt/common/normalizers/json/helpers.py new file mode 100644 index 0000000000..96c9ab4954 --- /dev/null +++ b/dlt/common/normalizers/json/helpers.py @@ -0,0 +1,141 @@ +""" +Cached helper methods for all operations that are called often +""" +from functools import lru_cache +from typing import Any, Dict, List, Optional, Tuple, cast + +from dlt.common.json import json +from dlt.common.destination.utils import resolve_merge_strategy +from dlt.common.normalizers.naming import NamingConvention +from dlt.common.normalizers.typing import TRowIdType +from dlt.common.normalizers.utils import DLT_ID_LENGTH_BYTES +from dlt.common.schema import Schema +from dlt.common.schema.typing import TColumnSchema, C_DLT_ID, DLT_NAME_PREFIX +from dlt.common.schema.utils import ( + get_columns_names_with_prop, + get_first_column_name_with_prop, + is_nested_table, +) +from dlt.common.utils import digest128 + + +@lru_cache(maxsize=None) +def shorten_fragments(naming: NamingConvention, *idents: str) -> str: + return naming.shorten_fragments(*idents) + + +@lru_cache(maxsize=None) +def normalize_table_identifier(schema: Schema, naming: NamingConvention, table_name: str) -> str: + if schema._normalizers_config.get("use_break_path_on_normalize", True): + return naming.normalize_tables_path(table_name) + else: + return naming.normalize_table_identifier(table_name) + + +@lru_cache(maxsize=None) +def normalize_identifier(schema: Schema, naming: 
NamingConvention, identifier: str) -> str: + if schema._normalizers_config.get("use_break_path_on_normalize", True): + return naming.normalize_path(identifier) + else: + return naming.normalize_identifier(identifier) + + +@lru_cache(maxsize=None) +def get_table_nesting_level( + schema: Schema, table_name: str, default_nesting: int = 1000 +) -> Optional[int]: + """gets table nesting level, will inherit from parent if not set""" + + table = schema.tables.get(table_name) + if ( + table + and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))) is not None + ): + return max_nesting + return default_nesting + + +@lru_cache(maxsize=None) +def get_primary_key(schema: Schema, table_name: str) -> List[str]: + if table_name not in schema.tables: + return [] + table = schema.get_table(table_name) + return get_columns_names_with_prop(table, "primary_key", include_incomplete=True) + + +@lru_cache(maxsize=None) +def is_nested_type( + schema: Schema, + table_name: str, + field_name: str, + _r_lvl: int, +) -> bool: + """For those paths the nested objects should be left in place. + Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster + """ + + # nesting level is counted backwards + # is we have traversed to or beyond the calculated nesting level, we detect a nested type + if _r_lvl <= 0: + return True + + column: TColumnSchema = None + table = schema.tables.get(table_name) + if table: + column = table["columns"].get(field_name) + if column is None or "data_type" not in column: + data_type = schema.get_preferred_type(field_name) + else: + data_type = column["data_type"] + + return data_type == "json" + + +@lru_cache(maxsize=None) +def get_nested_row_id_type(schema: Schema, table_name: str) -> Tuple[TRowIdType, bool]: + """Gets type of row id to be added to nested table and if linking information should be added""" + if table := schema.tables.get(table_name): + merge_strategy = resolve_merge_strategy(schema.tables, table) + if merge_strategy not in ("upsert", "scd2") and not is_nested_table(table): + return "random", False + else: + # table will be created, use standard linking + pass + return "row_hash", True + + +@lru_cache(maxsize=None) +def get_root_row_id_type(schema: Schema, table_name: str) -> TRowIdType: + if table := schema.tables.get(table_name): + merge_strategy = resolve_merge_strategy(schema.tables, table) + if merge_strategy == "upsert": + return "key_hash" + elif merge_strategy == "scd2": + x_row_version_col = get_first_column_name_with_prop( + schema.get_table(table_name), + "x-row-version", + include_incomplete=True, + ) + if x_row_version_col == schema.naming.normalize_identifier(C_DLT_ID): + return "row_hash" + return "random" + + +def get_row_hash(row: Dict[str, Any], subset: Optional[List[str]] = None) -> str: + """Returns hash of row. + + Hash includes column names and values and is ordered by column name. + Excludes dlt system columns. + Can be used as deterministic row identifier. 
+ """ + row_filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)} + if subset is not None: + row_filtered = {k: v for k, v in row.items() if k in subset} + row_str = json.dumps(row_filtered, sort_keys=True) + return digest128(row_str, DLT_ID_LENGTH_BYTES) + + +def get_nested_row_hash(parent_row_id: str, nested_table: str, list_idx: int) -> str: + # create deterministic unique id of the nested row taking into account that all lists are ordered + # and all nested tables must be lists + return digest128(f"{parent_row_id}_{nested_table}_{list_idx}", DLT_ID_LENGTH_BYTES) diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index c5338192a0..e365017125 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -1,34 +1,27 @@ -from functools import lru_cache from typing import Dict, List, Mapping, Optional, Sequence, Tuple, cast, TypedDict, Any -from dlt.common.destination.utils import resolve_merge_strategy -from dlt.common.json import json -from dlt.common.normalizers.exceptions import InvalidJsonNormalizer -from dlt.common.normalizers.typing import TJSONNormalizer, TRowIdType -from dlt.common.normalizers.utils import generate_dlt_id, DLT_ID_LENGTH_BYTES +from dlt.common.normalizers.exceptions import InvalidJsonNormalizer +from dlt.common.normalizers.typing import TJSONNormalizer +from dlt.common.normalizers.utils import generate_dlt_id from dlt.common.typing import DictStrAny, TDataItem, StrAny from dlt.common.schema import Schema from dlt.common.schema.typing import ( C_DLT_ID, C_DLT_LOAD_ID, - TColumnSchema, TColumnName, TSimpleRegex, - DLT_NAME_PREFIX, ) from dlt.common.schema.utils import ( column_name_validator, - get_columns_names_with_prop, - get_first_column_name_with_prop, - has_column_with_prop, is_nested_table, ) -from dlt.common.utils import digest128, update_dict_nested +from dlt.common.utils import update_dict_nested from dlt.common.normalizers.json import ( TNormalizedRowIterator, wrap_in_dict, DataItemNormalizer as DataItemNormalizerBase, ) +from dlt.common.normalizers.json import helpers from dlt.common.validation import validate_dict @@ -103,18 +96,18 @@ def _flatten( def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) -> None: for k, v in dict_row.items(): if k.strip(): - norm_k = self._normalize_identifier(self.schema, k) + norm_k = helpers.normalize_identifier(self.schema, self.naming, k) else: # for empty keys in the data use _ norm_k = self.EMPTY_KEY_IDENTIFIER # if norm_k != k: # print(f"{k} -> {norm_k}") nested_name = ( - norm_k if path == () else self._shorten_fragments(self.schema, *path, norm_k) + norm_k if path == () else helpers.shorten_fragments(self.naming, *path, norm_k) ) # for lists and dicts we must check if type is possibly nested if isinstance(v, (dict, list)): - if not self._is_nested_type(self.schema, table, nested_name, __r_lvl): + if not helpers.is_nested_type(self.schema, table, nested_name, __r_lvl): # TODO: if schema contains table {table}__{nested_name} then convert v into single element list if isinstance(v, dict): # flatten the dict more @@ -122,7 +115,8 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] 
= ()) - else: # pass the list to out_rec_list out_rec_list[ - path + (self._normalize_table_identifier(self.schema, k),) + path + + (helpers.normalize_table_identifier(self.schema, self.naming, k),) ] = v continue else: @@ -134,26 +128,6 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) - norm_row_dicts(dict_row, _r_lvl) return out_rec_row, out_rec_list - @staticmethod - def get_row_hash(row: Dict[str, Any], subset: Optional[List[str]] = None) -> str: - """Returns hash of row. - - Hash includes column names and values and is ordered by column name. - Excludes dlt system columns. - Can be used as deterministic row identifier. - """ - row_filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)} - if subset is not None: - row_filtered = {k: v for k, v in row.items() if k in subset} - row_str = json.dumps(row_filtered, sort_keys=True) - return digest128(row_str, DLT_ID_LENGTH_BYTES) - - @staticmethod - def _get_nested_row_hash(parent_row_id: str, nested_table: str, list_idx: int) -> str: - # create deterministic unique id of the nested row taking into account that all lists are ordered - # and all nested tables must be lists - return digest128(f"{parent_row_id}_{nested_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - def _link_row(self, row: DictStrAny, parent_row_id: str, list_idx: int) -> DictStrAny: assert parent_row_id row[self.c_dlt_parent_id] = parent_row_id @@ -175,20 +149,20 @@ def _add_row_id( is_root: bool = False, ) -> str: if is_root: # root table - row_id_type = self._get_root_row_id_type(self.schema, table) + row_id_type = helpers.get_root_row_id_type(self.schema, table) if row_id_type in ("key_hash", "row_hash"): subset = None if row_id_type == "key_hash": - subset = self._get_primary_key(self.schema, table) + subset = helpers.get_primary_key(self.schema, table) # base hash on `dict_row` instead of `flattened_row` # so changes in nested tables lead to new row id - row_id = self.get_row_hash(dict_row, subset=subset) + row_id = helpers.get_row_hash(dict_row, subset=subset) else: row_id = generate_dlt_id() else: # nested table - row_id_type, is_nested = self._get_nested_row_id_type(self.schema, table) + row_id_type, is_nested = helpers.get_nested_row_id_type(self.schema, table) if row_id_type == "row_hash": - row_id = DataItemNormalizer._get_nested_row_hash(parent_row_id, table, pos) + row_id = helpers.get_nested_row_hash(parent_row_id, table, pos) # link to parent table if is_nested: self._link_row(flattened_row, parent_row_id, pos) @@ -227,7 +201,7 @@ def _normalize_list( parent_row_id: Optional[str] = None, _r_lvl: int = 0, ) -> TNormalizedRowIterator: - table = self._shorten_fragments(self.schema, *parent_path, *ident_path) + table = helpers.shorten_fragments(self.naming, *parent_path, *ident_path) for idx, v in enumerate(seq): if isinstance(v, dict): @@ -251,7 +225,7 @@ def _normalize_list( wrap_v = wrap_in_dict(self.c_value, v) DataItemNormalizer._extend_row(extend, wrap_v) self._add_row_id(table, wrap_v, wrap_v, parent_row_id, idx) - yield (table, self._shorten_fragments(self.schema, *parent_path)), wrap_v + yield (table, helpers.shorten_fragments(self.naming, *parent_path)), wrap_v def _normalize_row( self, @@ -264,8 +238,8 @@ def _normalize_row( _r_lvl: int = 0, is_root: bool = False, ) -> TNormalizedRowIterator: - schema = self.schema - table = self._shorten_fragments(schema, *parent_path, *ident_path) + naming = self.naming + table = helpers.shorten_fragments(naming, *parent_path, *ident_path) # flatten current row and 
extract all lists to recur into flattened_row, lists = self._flatten(table, dict_row, _r_lvl) # always extend row @@ -280,7 +254,7 @@ def _normalize_row( # yield parent table first should_descend = yield ( - (table, self._shorten_fragments(schema, *parent_path)), + (table, helpers.shorten_fragments(naming, *parent_path)), flattened_row, ) if should_descend is False: @@ -361,8 +335,10 @@ def normalize_data_item( # identify load id if loaded data must be processed after loading incrementally row[self.c_dlt_load_id] = load_id # get table name and nesting level - root_table_name = self._normalize_table_identifier(self.schema, table_name) - max_nesting = self._get_table_nesting_level(self.schema, root_table_name, self.max_nesting) + root_table_name = helpers.normalize_table_identifier(self.schema, self.naming, table_name) + max_nesting = helpers.get_table_nesting_level( + self.schema, root_table_name, self.max_nesting + ) yield from self._normalize_row( row, @@ -426,103 +402,3 @@ def _normalize_prop( "./normalizers/json/config", validator_f=column_name_validator(schema.naming), ) - - # - # Cached helper methods for all operations that are called often - # - @staticmethod - @lru_cache(maxsize=None) - def _shorten_fragments(schema: Schema, *idents: str) -> str: - return schema.naming.shorten_fragments(*idents) - - @staticmethod - @lru_cache(maxsize=None) - def _normalize_table_identifier(schema: Schema, table_name: str) -> str: - return schema.naming.normalize_table_identifier(table_name) - - @staticmethod - @lru_cache(maxsize=None) - def _normalize_identifier(schema: Schema, identifier: str) -> str: - return schema.naming.normalize_path(identifier) - - @staticmethod - @lru_cache(maxsize=None) - def _get_table_nesting_level( - schema: Schema, table_name: str, default_nesting: int = 1000 - ) -> Optional[int]: - """gets table nesting level, will inherit from parent if not set""" - - table = schema.tables.get(table_name) - if ( - table - and (max_nesting := cast(int, table.get("x-normalizer", {}).get("max_nesting"))) - is not None - ): - return max_nesting - return default_nesting - - @staticmethod - @lru_cache(maxsize=None) - def _get_primary_key(schema: Schema, table_name: str) -> List[str]: - if table_name not in schema.tables: - return [] - table = schema.get_table(table_name) - return get_columns_names_with_prop(table, "primary_key", include_incomplete=True) - - @staticmethod - @lru_cache(maxsize=None) - def _is_nested_type( - schema: Schema, - table_name: str, - field_name: str, - _r_lvl: int, - ) -> bool: - """For those paths the nested objects should be left in place. 
- Cache perf: max_nesting < _r_lvl: ~2x faster, full check 10x faster - """ - - # nesting level is counted backwards - # is we have traversed to or beyond the calculated nesting level, we detect a nested type - if _r_lvl <= 0: - return True - - column: TColumnSchema = None - table = schema.tables.get(table_name) - if table: - column = table["columns"].get(field_name) - if column is None or "data_type" not in column: - data_type = schema.get_preferred_type(field_name) - else: - data_type = column["data_type"] - - return data_type == "json" - - @staticmethod - @lru_cache(maxsize=None) - def _get_nested_row_id_type(schema: Schema, table_name: str) -> Tuple[TRowIdType, bool]: - """Gets type of row id to be added to nested table and if linking information should be added""" - if table := schema.tables.get(table_name): - merge_strategy = resolve_merge_strategy(schema.tables, table) - if merge_strategy not in ("upsert", "scd2") and not is_nested_table(table): - return "random", False - else: - # table will be created, use standard linking - pass - return "row_hash", True - - @staticmethod - @lru_cache(maxsize=None) - def _get_root_row_id_type(schema: Schema, table_name: str) -> TRowIdType: - if table := schema.tables.get(table_name): - merge_strategy = resolve_merge_strategy(schema.tables, table) - if merge_strategy == "upsert": - return "key_hash" - elif merge_strategy == "scd2": - x_row_version_col = get_first_column_name_with_prop( - schema.get_table(table_name), - "x-row-version", - include_incomplete=True, - ) - if x_row_version_col == schema.naming.normalize_identifier(C_DLT_ID): - return "row_hash" - return "random" diff --git a/dlt/common/normalizers/naming/naming.py b/dlt/common/normalizers/naming/naming.py index 5ae5847963..9953d25913 100644 --- a/dlt/common/normalizers/naming/naming.py +++ b/dlt/common/normalizers/naming/naming.py @@ -45,6 +45,8 @@ def make_path(self, *identifiers: str) -> str: def break_path(self, path: str) -> Sequence[str]: """Breaks path into sequence of identifiers""" + # TODO: this is no longer needed if we modify all naming convention to do not contract + # underscores then also normalize_path will not be needed return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] def normalize_path(self, path: str) -> str: diff --git a/dlt/common/normalizers/typing.py b/dlt/common/normalizers/typing.py index 9840f3a4d2..16ad097fde 100644 --- a/dlt/common/normalizers/typing.py +++ b/dlt/common/normalizers/typing.py @@ -18,5 +18,7 @@ class TJSONNormalizer(TypedDict, total=False): class TNormalizersConfig(TypedDict, total=False): names: str allow_identifier_change_on_table_with_data: Optional[bool] + use_break_path_on_normalize: Optional[bool] + """Post 1.4.0 to allow table and column names that contain table separators""" detections: Optional[List[str]] json: TJSONNormalizer diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index cbf38a7327..c612c5a4f1 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -1,7 +1,13 @@ import ast import inspect -import astunparse -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, Callable + +try: + import astunparse + + ast_unparse: Callable[[ast.AST], str] = astunparse.unparse +except ImportError: + ast_unparse = ast.unparse # type: ignore[attr-defined, unused-ignore] from dlt.common.typing import AnyFun @@ -25,7 +31,7 @@ def get_literal_defaults(node: Union[ast.FunctionDef, 
ast.AsyncFunctionDef]) -> literal_defaults: Dict[str, str] = {} for arg, default in zip(reversed(args), reversed(defaults)): if default: -            literal_defaults[str(arg.arg)] = astunparse.unparse(default).strip() +            literal_defaults[str(arg.arg)] = ast_unparse(default).strip() return literal_defaults @@ -99,7 +105,7 @@ def rewrite_python_script( script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # replace node value -        script_lines.append(astunparse.unparse(t_value).strip()) +        script_lines.append(ast_unparse(t_value).strip()) last_line = node.end_lineno - 1 last_offset = node.end_col_offset diff --git a/dlt/common/schema/configuration.py b/dlt/common/schema/configuration.py index e64dd57494..72f79026da 100644 --- a/dlt/common/schema/configuration.py +++ b/dlt/common/schema/configuration.py @@ -14,3 +14,5 @@ class SchemaConfiguration(BaseConfiguration): naming: Optional[TNamingConventionReferenceArg] = None  # Union[str, NamingConvention] json_normalizer: Optional[DictStrAny] = None allow_identifier_change_on_table_with_data: Optional[bool] = None +    use_break_path_on_normalize: Optional[bool] = None +    """Post 1.4.0 to allow table and column names that contain table separators""" diff --git a/dlt/common/schema/migrations.py b/dlt/common/schema/migrations.py index d9e758f204..06eb35c0f6 100644 --- a/dlt/common/schema/migrations.py +++ b/dlt/common/schema/migrations.py @@ -29,13 +29,13 @@ def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> schema_dict["excludes"] = [] from_engine = 2 if from_engine == 2 and to_engine > 2: -        from dlt.common.schema.normalizers import import_normalizers, explicit_normalizers +        from dlt.common.schema.normalizers import import_normalizers, configured_normalizers # current version of the schema current = cast(TStoredSchema, schema_dict) # add default normalizers and root hash propagation # use explicit None to get default settings. ignore any naming conventions -        normalizers = explicit_normalizers(naming=None, json_normalizer=None) +        normalizers = configured_normalizers(naming=None, json_normalizer=None) current["normalizers"], _, _ = import_normalizers(normalizers, normalizers) current["normalizers"]["json"]["config"] = { "propagation": {"root": {"_dlt_id": "_dlt_root_id"}} @@ -169,6 +169,9 @@ def migrate_filters(group: str, filters: List[str]) -> None: json_config.pop("generate_dlt_id", None) from_engine = 10 +    if from_engine == 10 and to_engine > 10: +        schema_dict["normalizers"]["use_break_path_on_normalize"] = False +        from_engine = 11 schema_dict["engine_version"] = from_engine if from_engine != to_engine: diff --git a/dlt/common/schema/normalizers.py b/dlt/common/schema/normalizers.py index 9b2a37e708..8f42e90596 100644 --- a/dlt/common/schema/normalizers.py +++ b/dlt/common/schema/normalizers.py @@ -40,13 +40,14 @@ def _section_for_schema(kwargs: Dict[str, Any]) -> Tuple[str, ...]: @with_config(spec=SchemaConfiguration, sections=_section_for_schema)  # type: ignore[call-overload] -def explicit_normalizers( +def configured_normalizers( naming: TNamingConventionReferenceArg = dlt.config.value, json_normalizer: TJSONNormalizer = dlt.config.value, allow_identifier_change_on_table_with_data: bool = None, +    use_break_path_on_normalize: Optional[bool] = None, schema_name: Optional[str] = None, ) -> TNormalizersConfig: -    """Gets explicitly configured normalizers without any defaults or capabilities injection. 
If `naming` is a module or a type it will get converted into string form via import. If `schema_name` is present, a section ("sources", schema_name, "schema") is used to inject the config @@ -57,6 +58,8 @@ def explicit_normalizers( norm_conf["allow_identifier_change_on_table_with_data"] = ( allow_identifier_change_on_table_with_data ) + if use_break_path_on_normalize is not None: + norm_conf["use_break_path_on_normalize"] = use_break_path_on_normalize return norm_conf diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 0dbeda93cf..d6031a08fa 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -57,7 +57,7 @@ SchemaCorruptedException, TableIdentifiersFrozen, ) -from dlt.common.schema.normalizers import import_normalizers, explicit_normalizers +from dlt.common.schema.normalizers import import_normalizers, configured_normalizers from dlt.common.schema.exceptions import DataValidationError from dlt.common.validation import validate_dict @@ -439,7 +439,8 @@ def update_schema(self, schema: "Schema") -> None: """Updates this schema from an incoming schema. Normalizes identifiers after updating normalizers.""" # pass normalizer config self._settings = deepcopy(schema.settings) - self._configure_normalizers(schema._normalizers_config) + # make shallow copy of normalizer settings + self._configure_normalizers(copy(schema._normalizers_config)) self._compile_settings() # update all tables for table in schema.tables.values(): @@ -753,7 +754,7 @@ def update_normalizers(self) -> None: Default hints, preferred data types and normalize configs (ie. column propagation) are normalized as well. Regexes are included as long as textual parts can be extracted from an expression. """ - self._configure_normalizers(explicit_normalizers(schema_name=self._schema_name)) + self._configure_normalizers(configured_normalizers(schema_name=self._schema_name)) self._compile_settings() def will_update_normalizers(self) -> bool: @@ -761,7 +762,7 @@ def will_update_normalizers(self) -> bool: # import desired modules _, to_naming, _ = import_normalizers( - explicit_normalizers(schema_name=self._schema_name), self._normalizers_config + configured_normalizers(schema_name=self._schema_name), self._normalizers_config ) return type(to_naming) is not type(self.naming) # noqa @@ -1106,13 +1107,13 @@ def _verify_identifiers(table: TTableSchema, norm_table: TTableSchema) -> None: else: return self._schema_tables - def _renormalize_schema_identifiers( + def _replace_and_apply_naming( self, normalizers_config: TNormalizersConfig, to_naming: NamingConvention, from_naming: NamingConvention, ) -> None: - """Normalizes all identifiers in the schema in place""" + """Normalizes all identifiers in the schema in place according to `to_naming`""" self._schema_tables = self._verify_update_normalizers( normalizers_config, to_naming, from_naming ) @@ -1140,10 +1141,19 @@ def _renormalize_schema_identifiers( def _configure_normalizers(self, explicit_normalizers: TNormalizersConfig) -> None: """Gets naming and item normalizer from schema yaml, config providers and destination capabilities and applies them to schema.""" + # preserve current schema settings if not explicitly set in `explicit_normalizers` + if explicit_normalizers and self._normalizers_config: + for prop_ in [ + "use_break_path_on_normalize", + "allow_identifier_change_on_table_with_data", + ]: + if prop_ in self._normalizers_config and prop_ not in explicit_normalizers: + explicit_normalizers[prop_] = self._normalizers_config[prop_] # 
type: ignore[literal-required] + normalizers_config, to_naming, item_normalizer_class = import_normalizers( explicit_normalizers, self._normalizers_config ) - self._renormalize_schema_identifiers(normalizers_config, to_naming, self.naming) + self._replace_and_apply_naming(normalizers_config, to_naming, self.naming) # data item normalization function self.data_item_normalizer = item_normalizer_class(self) self.data_item_normalizer.extend_schema() @@ -1174,7 +1184,7 @@ def _reset_schema(self, name: str, normalizers: TNormalizersConfig = None) -> No self._add_standard_hints() # configure normalizers, including custom config if present if not normalizers: - normalizers = explicit_normalizers(schema_name=self._schema_name) + normalizers = configured_normalizers(schema_name=self._schema_name) self._configure_normalizers(normalizers) # add version tables self._add_standard_tables() diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index c8f5de03ed..6f5d6213c9 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -28,7 +28,7 @@ # current version of schema engine -SCHEMA_ENGINE_VERSION = 10 +SCHEMA_ENGINE_VERSION = 11 # dlt tables VERSION_TABLE_NAME = "_dlt_version" diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 3ff23c9bae..58e1dbd824 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -647,3 +647,8 @@ def is_typeerror_due_to_wrong_call(exc: Exception, func: AnyFun) -> bool: func_name = func.__name__ message = str(exc) return message.__contains__(f"{func_name}()") + + +removeprefix = getattr( + str, "removeprefix", lambda s_, p_: s_[len(p_) :] if s_.startswith(p_) else s_ +) diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py index 411c876c19..27a7f5a7af 100644 --- a/dlt/destinations/dataset.py +++ b/dlt/destinations/dataset.py @@ -3,12 +3,8 @@ from contextlib import contextmanager from dlt import version - from dlt.common.json import json - -from dlt.common.normalizers.naming.naming import NamingConvention from dlt.common.exceptions import MissingDependencyException - from dlt.common.destination import AnyDestination from dlt.common.destination.reference import ( SupportsReadableRelation, @@ -109,7 +105,7 @@ def query(self) -> Any: return self._provided_query table_name = self.sql_client.make_qualified_table_name( - self.schema.naming.normalize_path(self._table_name) + self.schema.naming.normalize_tables_path(self._table_name) ) maybe_limit_clause_1 = "" diff --git a/dlt/destinations/impl/clickhouse/sql_client.py b/dlt/destinations/impl/clickhouse/sql_client.py index 00f35da082..a6c4ee0458 100644 --- a/dlt/destinations/impl/clickhouse/sql_client.py +++ b/dlt/destinations/impl/clickhouse/sql_client.py @@ -28,6 +28,7 @@ from dlt.common import logger from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.typing import DictStrAny +from dlt.common.utils import removeprefix from dlt.destinations.exceptions import ( DatabaseUndefinedRelation, @@ -88,9 +89,8 @@ def has_dataset(self) -> bool: sentinel_table = self.config.dataset_sentinel_table_name all_ds_tables = self._list_tables() if self.dataset_name: - return sentinel_table in [ - t.split(self.config.dataset_table_separator)[1] for t in all_ds_tables - ] + prefix = self.dataset_name + self.config.dataset_table_separator + return sentinel_table in [removeprefix(t, prefix) for t in all_ds_tables] else: # if no dataset specified we look for sentinel table return sentinel_table in all_ds_tables diff --git 
a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 0cf63b3ac9..1739c87fb3 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -37,7 +37,7 @@ TPipelineStateDoc, load_package as current_load_package, ) -from dlt.destinations.sql_client import DBApiCursor, WithSqlClient, SqlClientBase +from dlt.destinations.sql_client import WithSqlClient, SqlClientBase from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( FollowupJobRequest, @@ -63,7 +63,6 @@ from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations import path_utils from dlt.destinations.fs_client import FSClientBase -from dlt.destinations.dataset import ReadableDBAPIDataset from dlt.destinations.utils import verify_schema_merge_disposition INIT_FILE_NAME = "init" diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 41d3035a9f..03f8a31462 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -18,6 +18,8 @@ TTableSchemaColumns, TPartialTableSchema, ) +from dlt.common.normalizers.json import helpers as normalize_helpers + from dlt.extract.hints import HintsMeta, TResourceHints from dlt.extract.resource import DltResource from dlt.extract.items import DataItemWithMeta, TableNameMeta @@ -141,7 +143,9 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No self._write_to_dynamic_table(resource, items, meta) def write_empty_items_file(self, table_name: str) -> None: - table_name = self.naming.normalize_table_identifier(table_name) + table_name = normalize_helpers.normalize_table_identifier( + self.schema, self.naming, table_name + ) self.item_storage.write_empty_items_file(self.load_id, self.schema.name, table_name, None) def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[str]: @@ -151,10 +155,12 @@ def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[s table_name = meta.table_name else: table_name = resource.table_name # type: ignore[assignment] - return self.naming.normalize_table_identifier(table_name) + return normalize_helpers.normalize_table_identifier(self.schema, self.naming, table_name) def _get_dynamic_table_name(self, resource: DltResource, item: TDataItem) -> str: - return self.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) + return normalize_helpers.normalize_table_identifier( + self.schema, self.naming, resource._table_name_hint_fun(item) + ) def _write_item( self, @@ -322,7 +328,7 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No ) for tbl in ( ( - # 1. Convert pandas frame(s) to arrow Table + # 1. 
Convert pandas frame(s) to arrow Table, remove indexes because we store pandas_to_arrow(item) if (pandas and isinstance(item, pandas.DataFrame)) else item diff --git a/dlt/normalize/worker.py b/dlt/normalize/worker.py index 53a856f7d0..5eccdf5433 100644 --- a/dlt/normalize/worker.py +++ b/dlt/normalize/worker.py @@ -20,6 +20,7 @@ ParsedLoadJobFileName, ) from dlt.common.schema import TSchemaUpdate, Schema +from dlt.common.normalizers.json import helpers as normalize_helpers from dlt.normalize.configuration import NormalizeConfiguration from dlt.normalize.exceptions import NormalizeJobFailed @@ -218,8 +219,8 @@ def _gather_metrics_and_close( parsed_file_name = ParsedLoadJobFileName.parse(extracted_items_file) # normalize table name in case the normalization changed # NOTE: this is the best we can do, until a full lineage information is in the schema - root_table_name = schema.naming.normalize_table_identifier( - parsed_file_name.table_name + root_table_name = normalize_helpers.normalize_table_identifier( + schema, schema.naming, parsed_file_name.table_name ) root_tables.add(root_table_name) root_table = stored_schema["tables"].get(root_table_name, {"name": root_table_name}) diff --git a/dlt/reflection/script_visitor.py b/dlt/reflection/script_visitor.py index f4a5569ed0..c49fed20ab 100644 --- a/dlt/reflection/script_visitor.py +++ b/dlt/reflection/script_visitor.py @@ -1,10 +1,9 @@ import inspect import ast -import astunparse from ast import NodeVisitor from typing import Any, Dict, List -from dlt.common.reflection.utils import find_outer_func_def +from dlt.common.reflection.utils import find_outer_func_def, ast_unparse import dlt.reflection.names as n @@ -68,9 +67,9 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: for deco in node.decorator_list: # decorators can be function calls, attributes or names if isinstance(deco, (ast.Name, ast.Attribute)): - alias_name = astunparse.unparse(deco).strip() + alias_name = ast_unparse(deco).strip() elif isinstance(deco, ast.Call): - alias_name = astunparse.unparse(deco.func).strip() + alias_name = ast_unparse(deco.func).strip() else: raise ValueError( self.source_segment(deco), type(deco), "Unknown decorator form" @@ -87,7 +86,7 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: def visit_Call(self, node: ast.Call) -> Any: if self._curr_pass == 2: # check if this is a call to any of known functions - alias_name = astunparse.unparse(node.func).strip() + alias_name = ast_unparse(node.func).strip() fn = self.func_aliases.get(alias_name) if not fn: # try a fallback to "run" function that may be called on pipeline or source diff --git a/dlt/sources/sql_database/arrow_helpers.py b/dlt/sources/sql_database/arrow_helpers.py index 1f72205a2a..1de9dffc87 100644 --- a/dlt/sources/sql_database/arrow_helpers.py +++ b/dlt/sources/sql_database/arrow_helpers.py @@ -4,9 +4,6 @@ from dlt.common.configuration import with_config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.libs.pyarrow import ( - row_tuples_to_arrow as _row_tuples_to_arrow, -) @with_config @@ -20,6 +17,8 @@ def row_tuples_to_arrow( is always the case if run within the pipeline. This will generate arrow schema compatible with the destination. 
Otherwise generic capabilities are used """ +    from dlt.common.libs.pyarrow import row_tuples_to_arrow as _row_tuples_to_arrow + return _row_tuples_to_arrow( rows, caps or DestinationCapabilitiesContext.generic_capabilities(), columns, tz ) diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index aa0a5fe68a..9b243b9429 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -181,7 +181,7 @@ bucket_url = "abfss://@.dfs.core.windows.n You can use `az`, `abfss`, `azure` and `abfs` url schemes. -If you need to use a custom host to account your storage account you can set it up like below: +If you need to use a custom host for your storage account, you can set it up like below: ```toml [destination.filesystem.credentials] # The storage account name is always required diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md index 11d4382a22..fa5cf7b128 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/arrow-pandas.md @@ -39,6 +39,8 @@ pipeline = dlt.pipeline("orders_pipeline", destination="snowflake") pipeline.run(df, table_name="orders") ``` +Note that Pandas indexes are not saved by default (starting from `dlt` version 1.4.1). If for some reason you need the index in the destination, +use `Table.from_pandas` with `preserve_index` set to True to explicitly convert the dataframe into an arrow table. A `pyarrow` table can be loaded in the same way: diff --git a/docs/website/docs/general-usage/naming-convention.md b/docs/website/docs/general-usage/naming-convention.md index f1766d1797..c10ac3e3d0 100644 --- a/docs/website/docs/general-usage/naming-convention.md +++ b/docs/website/docs/general-usage/naming-convention.md @@ -69,6 +69,45 @@ Note that many destinations are exclusively case-insensitive, of which some pres ### Identifier shortening Identifier shortening happens during normalization. `dlt` takes the maximum length of the identifier from the destination capabilities and will trim the identifiers that are too long. The default shortening behavior generates short deterministic hashes of the source identifiers and places them in the middle of the destination identifier. This (with a high probability) avoids shortened identifier collisions. +### Compound (flattened) identifiers +`dlt` combines several identifiers in order to name nested tables and flattened columns. For example: +```json +{ + "column": + { + "value": 1 + } +} +``` +generates the flattened column name `column__value`, where `__` is the path separator (in **snake case**). Each component in the combined identifier is normalized +separately and shortened as a whole. + +:::note +A combined identifier is also a valid single identifier. Starting from +`dlt` versions above 1.4.0, normalization is fully idempotent and a normalized +`column__value` will still be `column__value`. +::: + +:::caution +Previously, double underscores were contracted into a single underscore. That +prevented using data loaded by `dlt` as a data source without identifier modifications. 
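+As a minimal illustration of the new idempotent behavior (a sketch only, assuming the snake case convention class exposed at `dlt.common.normalizers.naming.snake_case`; not an official documented example):
+```py
+from dlt.common.normalizers.naming.snake_case import NamingConvention
+
+naming = NamingConvention()
+# post-1.4.0: the path separator is preserved, so an already normalized name round-trips unchanged
+assert naming.normalize_path("column__value") == "column__value"
+```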
`dlt` maintains backward compatibility for version >1.4.0 as follows: + +* All schemas stored locally or at destination will be migrated to backward compatible mode by setting a flag `use_break_path_on_normalize` ie.: +```yaml +normalizers: + names: dlt.common.normalizers.names.snake_case + use_break_path_on_normalize: true + json: + module: dlt.common.normalizers.json.relational +``` +* Backward compatible behavior may be explicitly enabled by setting +`SCHEMA__USE_BREAK_PATH_ON_NORMALIZE` to `TRUE` or via `config.toml`: +```toml +[schema] +use_break_path_on_normalize=true +``` +::: + ### 🚧 [WIP] Name convention changes are lossy `dlt` does not store the source identifiers in the schema so when the naming convention changes (or we increase the maximum identifier length), it is not able to generate a fully correct set of new identifiers. Instead, it will re-normalize already normalized identifiers. We are currently working to store the full identifier lineage - source identifiers will be stored and mapped to the destination in the schema. diff --git a/mypy.ini b/mypy.ini index eee4db6126..769e84b13a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -134,4 +134,4 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-time_machine.*] -ignore_missing_imports = True \ No newline at end of file +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index 9ae26bd04c..732ba0e219 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -13,13 +13,13 @@ files = [ [[package]] name = "adlfs" -version = "2024.4.1" +version = "2024.7.0" description = "Access Azure Datalake Gen1 with fsspec and dask" optional = true python-versions = ">=3.8" files = [ - {file = "adlfs-2024.4.1-py3-none-any.whl", hash = "sha256:acea94612ddacaa34ea8c6babcc95b8da6982f930cdade7a86fbd17382403e16"}, - {file = "adlfs-2024.4.1.tar.gz", hash = "sha256:75530a45447f358ae53c5c39c298b8d966dae684be84db899f63b94cd96fc000"}, + {file = "adlfs-2024.7.0-py3-none-any.whl", hash = "sha256:2005c8e124fda3948f2a6abb2dbebb2c936d2d821acaca6afd61932edfa9bc07"}, + {file = "adlfs-2024.7.0.tar.gz", hash = "sha256:106995b91f0eb5e775bcd5957d180d9a14faef3271a063b1f65c66fd5ab05ddf"}, ] [package.dependencies] @@ -3900,106 +3900,6 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = 
"sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, - {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, - {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, - {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, - {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, - {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, - {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, - {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, - {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, - {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, - {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, - {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, - {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, - {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, - {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, - {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, - {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, - {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, - {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, - {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, - {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, - {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, - {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, - {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, - {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, - {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = 
"sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, - {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, - {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, - {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, - {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, - {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = "sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, - {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, - {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, - {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, - {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, - {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, - {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, - {file = 
"google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, - {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, - {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, - {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, - {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, - {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, - {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, - {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, - {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, - {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, - {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, - {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, - {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, - {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = 
"sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, - {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -10618,4 +10518,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "24e262ce6bb496fad6e587c76bb9ad60a2cc45a00f52e368b59978093e57b77c" +content-hash = "c0607d05ab37a1a6addf3ae7264bf5972cb6ce6e46df1dcdc2da3cff72e5008e" diff --git a/pyproject.toml b/pyproject.toml index 638653ffcf..8afb332422 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "1.4.1a0" +version = "1.4.1a1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ] @@ -40,7 +40,7 @@ click = ">=7.1" requirements-parser = ">=0.5.0" setuptools = ">=65.6.0" humanize = ">=4.4.0" -astunparse = ">=1.6.3" +astunparse = { "version" = ">=1.6.3", "python" = "<3.9"} gitpython = ">=3.1.29" pytz = ">=2022.6" giturlparse = ">=0.10.0" @@ -89,7 +89,6 @@ alembic = {version = ">1.10.0", optional = true} paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } -aiohttp = { version = ">=3.9", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v11.yml b/tests/common/cases/schemas/eth/ethereum_schema_v11.yml new file mode 100644 index 0000000000..fd6717c614 --- /dev/null +++ b/tests/common/cases/schemas/eth/ethereum_schema_v11.yml @@ -0,0 +1,394 @@ +version: 18 +version_hash: XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI= +engine_version: 11 +name: ethereum +tables: + _dlt_loads: + columns: + load_id: + nullable: false + data_type: text + schema_name: + nullable: true + data_type: text + status: + nullable: false + data_type: bigint + inserted_at: + nullable: false + data_type: timestamp + schema_version_hash: + nullable: true + data_type: text + write_disposition: skip + description: Created by DLT. Tracks completed loads + schema_contract: {} + resource: _dlt_loads + _dlt_version: + columns: + version: + nullable: false + data_type: bigint + engine_version: + nullable: false + data_type: bigint + inserted_at: + nullable: false + data_type: timestamp + schema_name: + nullable: false + data_type: text + version_hash: + nullable: false + data_type: text + schema: + nullable: false + data_type: text + write_disposition: skip + description: Created by DLT. 
Tracks schema updates + schema_contract: {} + resource: _dlt_version + blocks: + description: Ethereum blocks + x-annotation: this will be preserved on save + write_disposition: append + filters: + includes: [] + excludes: [] + columns: + _dlt_load_id: + nullable: false + description: load id coming from the extractor + data_type: text + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + number: + nullable: false + primary_key: true + data_type: bigint + parent_hash: + nullable: true + data_type: text + hash: + nullable: false + cluster: true + unique: true + data_type: text + base_fee_per_gas: + nullable: false + data_type: wei + difficulty: + nullable: false + data_type: wei + extra_data: + nullable: true + data_type: text + gas_limit: + nullable: false + data_type: bigint + gas_used: + nullable: false + data_type: bigint + logs_bloom: + nullable: true + data_type: binary + miner: + nullable: true + data_type: text + mix_hash: + nullable: true + data_type: text + nonce: + nullable: true + data_type: text + receipts_root: + nullable: true + data_type: text + sha3_uncles: + nullable: true + data_type: text + size: + nullable: true + data_type: bigint + state_root: + nullable: false + data_type: text + timestamp: + nullable: false + unique: true + sort: true + data_type: timestamp + total_difficulty: + nullable: true + data_type: wei + transactions_root: + nullable: false + data_type: text + schema_contract: {} + resource: blocks + x-normalizer: + seen-data: true + blocks__transactions: + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + block_number: + nullable: false + primary_key: true + data_type: bigint + merge_key: true + transaction_index: + nullable: false + primary_key: true + data_type: bigint + hash: + nullable: false + unique: true + data_type: text + block_hash: + nullable: false + cluster: true + data_type: text + block_timestamp: + nullable: false + sort: true + data_type: timestamp + chain_id: + nullable: true + data_type: text + from: + nullable: true + data_type: text + gas: + nullable: true + data_type: bigint + gas_price: + nullable: true + data_type: bigint + input: + nullable: true + data_type: text + max_fee_per_gas: + nullable: true + data_type: wei + max_priority_fee_per_gas: + nullable: true + data_type: wei + nonce: + nullable: true + data_type: bigint + r: + nullable: true + data_type: text + s: + nullable: true + data_type: text + status: + nullable: true + data_type: bigint + to: + nullable: true + data_type: text + type: + nullable: true + data_type: text + v: + nullable: true + data_type: bigint + value: + nullable: false + data_type: wei + eth_value: + nullable: true + data_type: decimal + x-normalizer: + seen-data: true + write_disposition: append + resource: blocks__transactions + blocks__transactions__logs: + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + address: + nullable: false + data_type: text + block_timestamp: + nullable: false + sort: true + data_type: timestamp + block_hash: + nullable: false + cluster: true + data_type: text + block_number: + nullable: false + primary_key: true + merge_key: true + data_type: bigint + transaction_index: + nullable: false + primary_key: true + merge_key: true + data_type: bigint + log_index: + nullable: false + primary_key: true + data_type: bigint + data: + nullable: true + data_type: text + removed: + nullable: true + data_type: bool + transaction_hash: + nullable: false + data_type: text + x-normalizer: 
+ seen-data: true + write_disposition: append + resource: blocks__transactions__logs + blocks__transactions__logs__topics: + parent: blocks__transactions__logs + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__transactions__access_list: + parent: blocks__transactions + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + address: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__transactions__access_list__storage_keys: + parent: blocks__transactions__access_list + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true + blocks__uncles: + parent: blocks + columns: + _dlt_parent_id: + nullable: false + data_type: text + parent_key: true + _dlt_list_idx: + nullable: false + data_type: bigint + _dlt_id: + nullable: false + unique: true + data_type: text + row_key: true + _dlt_root_id: + nullable: false + root_key: true + data_type: text + value: + nullable: true + data_type: text + x-normalizer: + seen-data: true +settings: + default_hints: + not_null: + - re:^_dlt_id$ + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + unique: + - _dlt_id + cluster: + - block_hash + partition: + - block_timestamp + root_key: + - _dlt_root_id + row_key: + - _dlt_id + parent_key: + - _dlt_parent_id + preferred_types: + timestamp: timestamp + block_timestamp: timestamp + schema_contract: {} +normalizers: + names: dlt.common.normalizers.names.snake_case + json: + module: dlt.common.normalizers.json.relational + config: + propagation: + root: + _dlt_id: _dlt_root_id + tables: + blocks: + timestamp: block_timestamp + hash: block_hash +previous_hashes: +- oHfYGTI2GHOxuzwVz6+yvMilXUvHYhxrxkanC2T6MAI= +- C5An8WClbavalXDdNSqXbdI7Swqh/mTWMcwWKCF//EE= +- yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE= + diff --git a/tests/common/cases/schemas/github/issues.schema.json b/tests/common/cases/schemas/github/issues.schema.json index 4c4f5425ae..5a1b0c6f84 100644 --- a/tests/common/cases/schemas/github/issues.schema.json +++ b/tests/common/cases/schemas/github/issues.schema.json @@ -1,1322 +1,1100 @@ { - "version": 2, - "version_hash": "IeCTkq8epwbjSy1O3jdkPPUkTPCt4hLj6RYo8uZ02JI=", - "engine_version": 5, - "name": "event", - "tables": { - "_dlt_version": { - "name": "_dlt_version", - "columns": { - "version": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "version", - "data_type": "bigint", - "nullable": false - }, - "engine_version": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "engine_version", - "data_type": "bigint", - "nullable": false - }, - "inserted_at": { - 
"partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "inserted_at", - "data_type": "timestamp", - "nullable": false - }, - "schema_name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema_name", - "data_type": "text", - "nullable": false - }, - "version_hash": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "version_hash", - "data_type": "text", - "nullable": false - }, - "schema": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema", - "data_type": "text", - "nullable": false - } + "version": 3, + "version_hash": "o6olKmaCAQVWDWR4eT4aZ1V/RiH+003516xq7Zrva+Q=", + "engine_version": 11, + "name": "event", + "tables": { + "_dlt_version": { + "columns": { + "version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "engine_version": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": false + }, + "schema_name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "version_hash": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "schema": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + } + }, + "write_disposition": "skip", + "description": "Created by DLT. Tracks schema updates", + "schema_contract": {}, + "resource": "_dlt_version" }, - "write_disposition": "skip", - "description": "Created by DLT. 
Tracks schema updates" - }, - "_dlt_loads": { - "name": "_dlt_loads", - "columns": { - "load_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "load_id", - "data_type": "text", - "nullable": false - }, - "schema_name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "schema_name", - "data_type": "text", - "nullable": true - }, - "status": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "status", - "data_type": "bigint", - "nullable": false - }, - "inserted_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "inserted_at", - "data_type": "timestamp", - "nullable": false - } + "_dlt_loads": { + "columns": { + "load_id": { + "data_type": "text", + "nullable": false + }, + "schema_name": { + "data_type": "text", + "nullable": true + }, + "status": { + "data_type": "bigint", + "nullable": false + }, + "inserted_at": { + "data_type": "timestamp", + "nullable": false + }, + "schema_version_hash": { + "data_type": "text", + "nullable": true + } + }, + "write_disposition": "skip", + "resource": "_dlt_loads", + "description": "Created by DLT. Tracks completed loads", + "schema_contract": {} }, - "write_disposition": "skip", - "description": "Created by DLT. Tracks completed loads" - }, - "issues": { - "name": "issues", - "columns": { - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "repository_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "repository_url", - "data_type": "text", - "nullable": true - }, - "labels_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "labels_url", - "data_type": "text", - "nullable": true - }, - "comments_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "comments_url", - "data_type": "text", - "nullable": true - }, - "events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "events_url", - "data_type": "text", - "nullable": true - }, - "html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "html_url", - "data_type": "text", - "nullable": true - }, - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "node_id", - "data_type": "text", - "nullable": true - }, - "number": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "number", - "data_type": "bigint", - "nullable": true - }, - "title": { - "partition": false, - 
"cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "title", - "data_type": "text", - "nullable": true - }, - "user__login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__login", - "data_type": "text", - "nullable": true - }, - "user__id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__id", - "data_type": "bigint", - "nullable": true - }, - "user__node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__node_id", - "data_type": "text", - "nullable": true - }, - "user__avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__avatar_url", - "data_type": "text", - "nullable": true - }, - "user__gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__gravatar_id", - "data_type": "text", - "nullable": true - }, - "user__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__url", - "data_type": "text", - "nullable": true - }, - "user__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__html_url", - "data_type": "text", - "nullable": true - }, - "user__followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__followers_url", - "data_type": "text", - "nullable": true - }, - "user__following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__following_url", - "data_type": "text", - "nullable": true - }, - "user__gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__gists_url", - "data_type": "text", - "nullable": true - }, - "user__starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__starred_url", - "data_type": "text", - "nullable": true - }, - "user__subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__subscriptions_url", - "data_type": "text", - "nullable": true - }, - "user__organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__organizations_url", - "data_type": "text", - "nullable": true - }, - "user__repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__repos_url", - "data_type": "text", - "nullable": true - }, - "user__events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__events_url", - "data_type": "text", - "nullable": 
true - }, - "user__received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__received_events_url", - "data_type": "text", - "nullable": true - }, - "user__type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__type", - "data_type": "text", - "nullable": true - }, - "user__site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "user__site_admin", - "data_type": "bool", - "nullable": true - }, - "state": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "state", - "data_type": "text", - "nullable": true - }, - "locked": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "locked", - "data_type": "bool", - "nullable": true - }, - "assignee__login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__login", - "data_type": "text", - "nullable": true - }, - "assignee__id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__id", - "data_type": "bigint", - "nullable": true - }, - "assignee__node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__node_id", - "data_type": "text", - "nullable": true - }, - "assignee__avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__avatar_url", - "data_type": "text", - "nullable": true - }, - "assignee__gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__gravatar_id", - "data_type": "text", - "nullable": true - }, - "assignee__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__url", - "data_type": "text", - "nullable": true - }, - "assignee__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__html_url", - "data_type": "text", - "nullable": true - }, - "assignee__followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__followers_url", - "data_type": "text", - "nullable": true - }, - "assignee__following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__following_url", - "data_type": "text", - "nullable": true - }, - "assignee__gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__gists_url", - "data_type": "text", - "nullable": true - }, - "assignee__starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, 
- "foreign_key": false, - "name": "assignee__starred_url", - "data_type": "text", - "nullable": true - }, - "assignee__subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__subscriptions_url", - "data_type": "text", - "nullable": true - }, - "assignee__organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__organizations_url", - "data_type": "text", - "nullable": true - }, - "assignee__repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__repos_url", - "data_type": "text", - "nullable": true - }, - "assignee__events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__events_url", - "data_type": "text", - "nullable": true - }, - "assignee__received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__received_events_url", - "data_type": "text", - "nullable": true - }, - "assignee__type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__type", - "data_type": "text", - "nullable": true - }, - "assignee__site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "assignee__site_admin", - "data_type": "bool", - "nullable": true - }, - "comments": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "comments", - "data_type": "bigint", - "nullable": true - }, - "created_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "created_at", - "data_type": "timestamp", - "nullable": true - }, - "updated_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "updated_at", - "data_type": "timestamp", - "nullable": true - }, - "closed_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "closed_at", - "data_type": "timestamp", - "nullable": true - }, - "author_association": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "author_association", - "data_type": "text", - "nullable": true - }, - "body": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "body", - "data_type": "text", - "nullable": true - }, - "reactions__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__url", - "data_type": "text", - "nullable": true - }, - "reactions__total_count": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__total_count", - "data_type": "bigint", - "nullable": true - }, - 
"reactions___1": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions___1", - "data_type": "bigint", - "nullable": true - }, - "reactions__laugh": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__laugh", - "data_type": "bigint", - "nullable": true - }, - "reactions__hooray": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__hooray", - "data_type": "bigint", - "nullable": true - }, - "reactions__confused": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__confused", - "data_type": "bigint", - "nullable": true - }, - "reactions__heart": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__heart", - "data_type": "bigint", - "nullable": true - }, - "reactions__rocket": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__rocket", - "data_type": "bigint", - "nullable": true - }, - "reactions__eyes": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "reactions__eyes", - "data_type": "bigint", - "nullable": true - }, - "timeline_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "timeline_url", - "data_type": "text", - "nullable": true - }, - "state_reason": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "state_reason", - "data_type": "text", - "nullable": true - }, - "_dlt_load_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_load_id", - "data_type": "text", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false - }, - "draft": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "draft", - "data_type": "bool", - "nullable": true - }, - "pull_request__url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__url", - "data_type": "text", - "nullable": true - }, - "pull_request__html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__html_url", - "data_type": "text", - "nullable": true - }, - "pull_request__diff_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__diff_url", - "data_type": "text", - "nullable": true - }, - "pull_request__patch_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": 
"pull_request__patch_url", - "data_type": "text", - "nullable": true - }, - "pull_request__merged_at": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "pull_request__merged_at", - "data_type": "timestamp", - "nullable": true - } + "issues": { + "columns": { + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "repository_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "labels_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "comments_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "number": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "title": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "user__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + 
"user__gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "user__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "state": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "locked": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "assignee__login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "assignee__node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__gists_url": { + "partition": false, + "cluster": false, + "unique": 
false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "assignee__site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "comments": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "created_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "updated_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "closed_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + }, + "author_association": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "body": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "reactions__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "reactions__total_count": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions___1": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__laugh": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__hooray": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__confused": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + 
"data_type": "bigint", + "nullable": true + }, + "reactions__heart": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__rocket": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "reactions__eyes": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "timeline_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "state_reason": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "_dlt_load_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + }, + "draft": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "pull_request__url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__diff_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__patch_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "pull_request__merged_at": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "timestamp", + "nullable": true + } + }, + "write_disposition": "append", + "schema_contract": {}, + "x-normalizer": { + "seen-data": true + }, + "resource": "issues" }, - "write_disposition": "append" - }, - "issues__labels": { - "name": "issues__labels", - "columns": { - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "node_id", - "data_type": "text", - "nullable": true - }, - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "name": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "name", - "data_type": "text", - "nullable": true - }, - "color": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "color", - "data_type": "text", - 
"nullable": true - }, - "default": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "default", - "data_type": "bool", - "nullable": true - }, - "description": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "description", - "data_type": "text", - "nullable": true - }, - "_dlt_parent_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": true, - "name": "_dlt_parent_id", - "data_type": "text", - "nullable": false - }, - "_dlt_list_idx": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_list_idx", - "data_type": "bigint", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false + "issues__labels": { + "columns": { + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "name": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "color": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "default": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "description": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "parent_key": true + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + } + }, + "parent": "issues", + "x-normalizer": { + "seen-data": true } }, - "parent": "issues" - }, - "issues__assignees": { - "name": "issues__assignees", - "columns": { - "login": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "login", - "data_type": "text", - "nullable": true - }, - "id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "id", - "data_type": "bigint", - "nullable": true - }, - "node_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": 
"node_id", - "data_type": "text", - "nullable": true - }, - "avatar_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "avatar_url", - "data_type": "text", - "nullable": true - }, - "gravatar_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "gravatar_id", - "data_type": "text", - "nullable": true - }, - "url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "url", - "data_type": "text", - "nullable": true - }, - "html_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "html_url", - "data_type": "text", - "nullable": true - }, - "followers_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "followers_url", - "data_type": "text", - "nullable": true - }, - "following_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "following_url", - "data_type": "text", - "nullable": true - }, - "gists_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "gists_url", - "data_type": "text", - "nullable": true - }, - "starred_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "starred_url", - "data_type": "text", - "nullable": true - }, - "subscriptions_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "subscriptions_url", - "data_type": "text", - "nullable": true - }, - "organizations_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "organizations_url", - "data_type": "text", - "nullable": true - }, - "repos_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "repos_url", - "data_type": "text", - "nullable": true - }, - "events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "events_url", - "data_type": "text", - "nullable": true - }, - "received_events_url": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "received_events_url", - "data_type": "text", - "nullable": true - }, - "type": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "type", - "data_type": "text", - "nullable": true - }, - "site_admin": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "site_admin", - "data_type": "bool", - "nullable": true - }, - "_dlt_parent_id": { - "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": true, - "name": "_dlt_parent_id", - "data_type": "text", - "nullable": false - }, - "_dlt_list_idx": { 
- "partition": false, - "cluster": false, - "unique": false, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_list_idx", - "data_type": "bigint", - "nullable": false - }, - "_dlt_id": { - "partition": false, - "cluster": false, - "unique": true, - "sort": false, - "primary_key": false, - "foreign_key": false, - "name": "_dlt_id", - "data_type": "text", - "nullable": false + "issues__assignees": { + "columns": { + "login": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": true + }, + "node_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "avatar_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "gravatar_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "html_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "followers_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "following_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "gists_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "starred_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "subscriptions_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "organizations_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "repos_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "received_events_url": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "type": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": true + }, + "site_admin": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bool", + "nullable": true + }, + "_dlt_parent_id": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + 
"primary_key": false, + "data_type": "text", + "nullable": false, + "parent_key": true + }, + "_dlt_list_idx": { + "partition": false, + "cluster": false, + "unique": false, + "sort": false, + "primary_key": false, + "data_type": "bigint", + "nullable": false + }, + "_dlt_id": { + "partition": false, + "cluster": false, + "unique": true, + "sort": false, + "primary_key": false, + "data_type": "text", + "nullable": false, + "row_key": true + } + }, + "parent": "issues", + "x-normalizer": { + "seen-data": true } - }, - "parent": "issues" - } - }, - "settings": { - "detections": [ - "timestamp", - "iso_timestamp", - "iso_date" - ], - "default_hints": { - "not_null": [ - "_dlt_id", - "_dlt_root_id", - "_dlt_parent_id", - "_dlt_list_idx", - "_dlt_load_id" - ], - "foreign_key": [ - "_dlt_parent_id" + } + }, + "settings": { + "detections": [ + "timestamp", + "iso_timestamp", + "iso_date" ], - "unique": [ - "_dlt_id" - ] - } - }, - "normalizers": { - "names": "dlt.common.normalizers.names.snake_case", - "json": { - "module": "dlt.common.normalizers.json.relational" - } + "default_hints": { + "not_null": [ + "_dlt_id", + "_dlt_root_id", + "_dlt_parent_id", + "_dlt_list_idx", + "_dlt_load_id" + ], + "unique": [ + "_dlt_id" + ], + "row_key": [ + "_dlt_id" + ], + "parent_key": [ + "_dlt_parent_id" + ] + }, + "schema_contract": {} + }, + "normalizers": { + "names": "dlt.common.normalizers.names.snake_case", + "json": { + "module": "dlt.common.normalizers.json.relational" + } + }, + "previous_hashes": [ + "IeCTkq8epwbjSy1O3jdkPPUkTPCt4hLj6RYo8uZ02JI=" + ] } -} diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 1553cea04f..35bc80add2 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -6,14 +6,12 @@ from dlt.common.utils import digest128, uniq_id from dlt.common.schema import Schema from dlt.common.schema.utils import new_table - +from dlt.common.normalizers.utils import DLT_ID_LENGTH_BYTES from dlt.common.normalizers.json.relational import ( RelationalNormalizerConfigPropagation, DataItemNormalizer as RelationalNormalizer, - DLT_ID_LENGTH_BYTES, ) - -# _flatten, _get_child_row_hash, _normalize_row, normalize_data_item, +from dlt.common.normalizers.json import helpers as normalize_helpers from tests.utils import create_schema_with_name @@ -420,7 +418,7 @@ def test_list_in_list() -> None: schema.update_table(path_table) assert "zen__webpath" in schema.tables # clear cache with json paths - schema.data_item_normalizer._is_nested_type.cache_clear() # type: ignore[attr-defined] + normalize_helpers.is_nested_type.cache_clear() rows = list(schema.normalize_data_item(chats, "1762162.1212", "zen")) # both lists are json types now @@ -890,7 +888,7 @@ def test_caching_perf(norm: RelationalNormalizer) -> None: table["x-normalizer"] = {} start = time() for _ in range(100000): - norm._is_nested_type(norm.schema, "test", "field", 0) + normalize_helpers.is_nested_type(norm.schema, "test", "field", 0) # norm._get_table_nesting_level(norm.schema, "test") print(f"{time() - start}") diff --git a/tests/common/normalizers/test_naming_snake_case.py b/tests/common/normalizers/test_naming_snake_case.py index ee4f43e7f0..e03de65696 100644 --- a/tests/common/normalizers/test_naming_snake_case.py +++ b/tests/common/normalizers/test_naming_snake_case.py @@ -50,6 +50,14 @@ def test_normalize_path(naming_unlimited: NamingConvention) -> None: assert naming_unlimited.normalize_path("Small Love Potion") 
== "small_love_potion"
     assert naming_unlimited.normalize_path("Small Love Potion") == "small_love_potion"
 
+    # paths with non normalized underscores
+    # NOTE: empty idents created during break path are removed so underscores are contracted
+    assert (
+        naming_unlimited.normalize_path("Small___Love____Potion_____x")
+        == "small___love__potion___x"
+    )
+    assert naming_unlimited.normalize_path("small___love__potion___x") == "small___love__potion___x"
+
 
 
 def test_normalize_non_alpha_single_underscore() -> None:
     assert SnakeCaseNamingConvention.RE_NON_ALPHANUMERIC.sub("_", "-=!*") == "_"
diff --git a/tests/common/schema/test_import_normalizers.py b/tests/common/schema/test_import_normalizers.py
index a1e3d775f0..d444259946 100644
--- a/tests/common/schema/test_import_normalizers.py
+++ b/tests/common/schema/test_import_normalizers.py
@@ -16,7 +16,7 @@
 )
 from dlt.common.schema.normalizers import (
     DEFAULT_NAMING_NAMESPACE,
-    explicit_normalizers,
+    configured_normalizers,
     import_normalizers,
     naming_from_reference,
     serialize_reference,
@@ -26,25 +26,25 @@ def test_explicit_normalizers() -> None:
-    config = explicit_normalizers()
+    config = configured_normalizers()
     assert config["names"] is None
     assert config["json"] is None
 
     # pass explicit
-    config = explicit_normalizers("direct", {"module": "custom"})
+    config = configured_normalizers("direct", {"module": "custom"})
     assert config["names"] == "direct"
     assert config["json"] == {"module": "custom"}
 
     # pass modules and types, make sure normalizer config is serialized
-    config = explicit_normalizers(direct)
+    config = configured_normalizers(direct)
     assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention"
 
-    config = explicit_normalizers(direct.NamingConvention)
+    config = configured_normalizers(direct.NamingConvention)
     assert config["names"] == f"{DEFAULT_NAMING_NAMESPACE}.direct.NamingConvention"
 
     # use environ
     os.environ["SCHEMA__NAMING"] = "direct"
     os.environ["SCHEMA__JSON_NORMALIZER"] = '{"module": "custom"}'
-    config = explicit_normalizers()
+    config = configured_normalizers()
     assert config["names"] == "direct"
     assert config["json"] == {"module": "custom"}
@@ -54,7 +54,7 @@ def test_explicit_normalizers_caps_ignored() -> None:
     destination_caps = DestinationCapabilitiesContext.generic_capabilities()
     destination_caps.naming_convention = "direct"
     with Container().injectable_context(destination_caps):
-        config = explicit_normalizers()
+        config = configured_normalizers()
         assert config["names"] is None
@@ -121,7 +121,7 @@ def test_naming_from_reference() -> None:
 
 
 def test_import_normalizers() -> None:
-    config, naming, json_normalizer = import_normalizers(explicit_normalizers())
+    config, naming, json_normalizer = import_normalizers(configured_normalizers())
     assert isinstance(naming, snake_case.NamingConvention)
     # no maximum length: we do not know the destination capabilities
     assert naming.max_length is None
@@ -133,7 +133,7 @@ def test_import_normalizers() -> None:
     os.environ["SCHEMA__JSON_NORMALIZER"] = (
         '{"module": "tests.common.normalizers.custom_normalizers"}'
     )
-    config, naming, json_normalizer = import_normalizers(explicit_normalizers())
+    config, naming, json_normalizer = import_normalizers(configured_normalizers())
     assert config["names"] == "direct"
     assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"}
     assert isinstance(naming, direct.NamingConvention)
@@ -142,7 +142,7 @@ def test_import_normalizers_with_defaults() -> None:
-    explicit = explicit_normalizers()
+    explicit = configured_normalizers()
     default_: TNormalizersConfig = {
         "names": "dlt.destinations.impl.weaviate.naming",
         "json": {"module": "tests.common.normalizers.custom_normalizers"},
@@ -170,7 +170,7 @@ def test_config_sections(sections: str) -> None:
     os.environ[f"{sections}SCHEMA__JSON_NORMALIZER"] = (
         '{"module": "tests.common.normalizers.custom_normalizers"}'
     )
-    config, _, _ = import_normalizers(explicit_normalizers(schema_name="test_schema"))
+    config, _, _ = import_normalizers(configured_normalizers(schema_name="test_schema"))
     assert config["names"] == "direct"
     assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"}
@@ -181,11 +181,11 @@ def test_import_normalizers_with_caps() -> None:
     destination_caps.naming_convention = "direct"
     destination_caps.max_identifier_length = 127
     with Container().injectable_context(destination_caps):
-        _, naming, _ = import_normalizers(explicit_normalizers())
+        _, naming, _ = import_normalizers(configured_normalizers())
         assert isinstance(naming, direct.NamingConvention)
         assert naming.max_length == 127
 
-        _, naming, _ = import_normalizers(explicit_normalizers(snake_case))
+        _, naming, _ = import_normalizers(configured_normalizers(snake_case))
         assert isinstance(naming, snake_case.NamingConvention)
         assert naming.max_length == 127
@@ -196,23 +196,23 @@ def test_import_normalizers_with_caps() -> None:
     }
     destination_caps.max_table_nesting = 0
     with Container().injectable_context(destination_caps):
-        config, _, relational = import_normalizers(explicit_normalizers())
+        config, _, relational = import_normalizers(configured_normalizers())
         assert config["json"]["config"]["max_nesting"] == 0
         assert relational is RelationalNormalizer
         # wrong normalizer
-        config, _, relational = import_normalizers(explicit_normalizers(), default_)
+        config, _, relational = import_normalizers(configured_normalizers(), default_)
         assert "config" not in config["json"]
 
 
 def test_import_invalid_naming_module() -> None:
     with pytest.raises(UnknownNamingModule) as py_ex:
-        import_normalizers(explicit_normalizers("unknown"))
+        import_normalizers(configured_normalizers("unknown"))
     assert py_ex.value.naming_module == "unknown"
     with pytest.raises(UnknownNamingModule) as py_ex:
-        import_normalizers(explicit_normalizers("dlt.common.tests"))
+        import_normalizers(configured_normalizers("dlt.common.tests"))
     assert py_ex.value.naming_module == "dlt.common.tests"
     with pytest.raises(InvalidNamingType) as py_ex2:
-        import_normalizers(explicit_normalizers("dlt.pipeline.helpers"))
+        import_normalizers(configured_normalizers("dlt.pipeline.helpers"))
     assert py_ex2.value.naming_module == "dlt.pipeline"
     assert py_ex2.value.naming_class == "helpers"
diff --git a/tests/common/schema/test_normalize_identifiers.py b/tests/common/schema/test_normalize_identifiers.py
index f84d857e26..a1cb181525 100644
--- a/tests/common/schema/test_normalize_identifiers.py
+++ b/tests/common/schema/test_normalize_identifiers.py
@@ -271,12 +271,7 @@ def test_normalize_table_identifiers_table_reference() -> None:
 def test_update_normalizers() -> None:
-    schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema")
-    schema = Schema.from_dict(schema_dict)  # type: ignore[arg-type]
-    # drop seen data
-    del schema.tables["issues"]["x-normalizer"]
-    del schema.tables["issues__labels"]["x-normalizer"]
-    del schema.tables["issues__assignees"]["x-normalizer"]
+    schema = make_issues_schema_for_normalizers_update()
     # save default hints in original form
     default_hints = schema._settings["default_hints"]
@@ -307,8 +302,8 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non
     from dlt.common.destination import DestinationCapabilitiesContext
     from dlt.common.configuration.container import Container
 
-    eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9")
-    orig_schema = Schema.from_dict(eth_V9)
+    eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11")
+    orig_schema = Schema.from_dict(eth_V11)
     # save schema
     schema_storage_no_import.save_schema(orig_schema)
@@ -317,7 +312,7 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non
     ) as caps:
         assert caps.naming_convention is sql_upper
         # creating a schema from dict keeps original normalizers
-        schema = Schema.from_dict(eth_V9)
+        schema = Schema.from_dict(eth_V11)
         assert_schema_identifiers_case(schema, str.lower)
         assert schema._normalizers_config["names"].endswith("snake_case")
@@ -350,7 +345,7 @@ def test_normalize_default_hints(schema_storage_no_import: SchemaStorage) -> Non
     )
     norm_schema = Schema.from_dict(
-        deepcopy(eth_V9), remove_processing_hints=True, bump_version=False
+        deepcopy(eth_V11), remove_processing_hints=True, bump_version=False
     )
     norm_schema.update_normalizers()
     assert_schema_identifiers_case(norm_schema, str.upper)
@@ -452,3 +447,50 @@ def assert_new_schema_values_custom_normalizers(schema: Schema) -> None:
     assert schema.naming.break_path("A__B__!C") == ["A", "B", "!C"]
     row = list(schema.normalize_data_item({"bool": True}, "load_id", "a_table"))
     assert row[0] == (("a_table", None), {"bool": True})
+
+
+def test_update_schema_normalizer_props() -> None:
+    schema = make_issues_schema_for_normalizers_update()
+    schema_2 = make_issues_schema_for_normalizers_update()
+    # remove issues table
+    del schema_2._schema_tables["issues"]
+    schema_2.update_schema(schema)
+
+    os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper"
+    # apply normalizers
+    schema_2.update_normalizers()
+
+    # preserve schema_2 str
+    schema_2_str = schema_2.to_pretty_json()
+
+    # make sure that normalizer props in original schema are preserved
+    schema._normalizers_config["allow_identifier_change_on_table_with_data"] = True
+    schema._normalizers_config["use_break_path_on_normalize"] = True
+
+    # set some fake naming convention. during schema update it should not be used
+    os.environ["SCHEMA__NAMING"] = "tests.common.cases.normalizers.sql_upper_X"
+    schema.update_schema(schema_2)
+    assert isinstance(schema.naming, sql_upper.NamingConvention)
+    assert_schema_identifiers_case(schema, str.upper)
+    # make sure norm setting still in schema
+    assert schema._normalizers_config["allow_identifier_change_on_table_with_data"] is True
+    assert schema._normalizers_config["use_break_path_on_normalize"] is True
+    # schema 2 not modified during the update
+    assert schema_2_str == schema_2.to_pretty_json()
+
+    # make sure that explicit settings are passed
+    schema_2._normalizers_config["allow_identifier_change_on_table_with_data"] = False
+    schema_2._normalizers_config["use_break_path_on_normalize"] = False
+    schema.update_schema(schema_2)
+    assert schema._normalizers_config["allow_identifier_change_on_table_with_data"] is False
+    assert schema._normalizers_config["use_break_path_on_normalize"] is False
+
+
+def make_issues_schema_for_normalizers_update() -> Schema:
+    schema_dict: TStoredSchema = load_json_case("schemas/github/issues.schema")
+    schema = Schema.from_dict(schema_dict)  # type: ignore[arg-type]
+    # drop seen data
+    del schema.tables["issues"]["x-normalizer"]
+    del schema.tables["issues__labels"]["x-normalizer"]
+    del schema.tables["issues__assignees"]["x-normalizer"]
+    return schema
diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py
index 7124ca5c80..5cdd42e448 100644
--- a/tests/common/schema/test_schema.py
+++ b/tests/common/schema/test_schema.py
@@ -570,8 +570,8 @@ def test_update_preferred_types(schema: Schema) -> None:
 
 
 def test_default_table_resource() -> None:
     """Parent tables without `resource` set default to table name"""
-    eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5")
-    tables = Schema.from_dict(eth_v5).tables
+    eth_v11 = load_yml_case("schemas/eth/ethereum_schema_v11")
+    tables = Schema.from_dict(eth_v11).tables
     assert tables["blocks"]["resource"] == "blocks"
     assert all([t.get("resource") is None for t in tables.values() if t.get("parent")])
@@ -737,7 +737,7 @@ def assert_new_schema_props_custom_normalizers(schema: Schema) -> None:
 def assert_is_new_schema(schema: Schema) -> None:
     assert schema.stored_version is None
     assert schema.stored_version_hash is None
-    assert schema.ENGINE_VERSION == 10
+    assert schema.ENGINE_VERSION == 11
     assert schema._stored_previous_hashes == []
     assert schema.is_modified
     assert schema.is_new
@@ -845,9 +845,9 @@ def test_group_tables_by_resource(schema: Schema) -> None:
 
 
 def test_remove_processing_hints() -> None:
-    eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9")
+    eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11")
     # here tables contain processing hints
-    schema = Schema.from_dict(eth_V9)
+    schema = Schema.from_dict(eth_V11)
     assert "x-normalizer" in schema.tables["blocks"]
 
     # clone with hints removal, note that clone does not bump version
@@ -867,16 +867,10 @@ def test_remove_processing_hints() -> None:
     assert "x-normalizer" not in to_json
 
     # load without hints
-    no_hints = schema.from_dict(eth_V9, remove_processing_hints=True, bump_version=False)
+    no_hints = schema.from_dict(eth_V11, remove_processing_hints=True, bump_version=False)
     assert no_hints.stored_version_hash == cloned.stored_version_hash
 
     # now load without hints but with version bump
     cloned._bump_version()
-    no_hints = schema.from_dict(eth_V9, remove_processing_hints=True)
+    no_hints = schema.from_dict(eth_V11, remove_processing_hints=True)
     assert no_hints.stored_version_hash == cloned.stored_version_hash
-
-
-# def test_get_new_table_columns() -> None:
-#     pytest.fail(reason="must implement!")
-#     pass
-#     get_new_table_columns()
diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py
index 39f1ad3211..1577b51115 100644
--- a/tests/common/schema/test_versioning.py
+++ b/tests/common/schema/test_versioning.py
@@ -86,10 +86,10 @@ def test_infer_column_bumps_version() -> None:
 
 
 def test_preserve_version_on_load() -> None:
-    eth_v10: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v10")
-    version = eth_v10["version"]
-    version_hash = eth_v10["version_hash"]
-    schema = Schema.from_dict(eth_v10)  # type: ignore[arg-type]
+    eth_v11: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v11")
+    version = eth_v11["version"]
+    version_hash = eth_v11["version_hash"]
+    schema = Schema.from_dict(eth_v11)  # type: ignore[arg-type]
     # version should not be bumped
     assert version_hash == schema._stored_version_hash
     assert version_hash == schema.version_hash
@@ -98,8 +98,8 @@ def test_preserve_version_on_load() -> None:
 
 @pytest.mark.parametrize("remove_defaults", [True, False])
 def test_version_preserve_on_reload(remove_defaults: bool) -> None:
-    eth_v8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8")
-    schema = Schema.from_dict(eth_v8)  # type: ignore[arg-type]
+    eth_v11: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v11")
+    schema = Schema.from_dict(eth_v11)  # type: ignore[arg-type]
     to_save_dict = schema.to_dict(remove_defaults=remove_defaults)
     assert schema.stored_version == to_save_dict["version"]
diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py
index 0dcf2930de..2818ea9622 100644
--- a/tests/common/storages/test_schema_storage.py
+++ b/tests/common/storages/test_schema_storage.py
@@ -3,7 +3,7 @@
 import yaml
 
 from dlt.common import json
-from dlt.common.schema.normalizers import explicit_normalizers
+from dlt.common.schema.normalizers import configured_normalizers
 from dlt.common.schema.schema import Schema
 from dlt.common.storages.exceptions import (
     InStorageSchemaModified,
@@ -304,7 +304,7 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No
 
 
 def test_save_store_schema(storage: SchemaStorage) -> None:
-    d_n = explicit_normalizers()
+    d_n = configured_normalizers()
     d_n["names"] = "tests.common.normalizers.custom_normalizers"
     schema = Schema("column_event", normalizers=d_n)
     assert schema.is_new
@@ -357,16 +357,16 @@ def test_save_initial_import_schema(ie_storage: LiveSchemaStorage) -> None:
     ie_storage.load_schema("ethereum")
 
     # save initial import schema where processing hints are removed
-    eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9")
-    schema = Schema.from_dict(eth_V9)
+    eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11")
+    schema = Schema.from_dict(eth_V11)
     ie_storage.save_import_schema_if_not_exists(schema)
     # should be available now
     eth = ie_storage.load_schema("ethereum")
     assert "x-normalizer" not in eth.tables["blocks"]
 
     # won't overwrite initial schema
-    del eth_V9["tables"]["blocks__uncles"]
-    schema = Schema.from_dict(eth_V9)
+    del eth_V11["tables"]["blocks__uncles"]
+    schema = Schema.from_dict(eth_V11)
     ie_storage.save_import_schema_if_not_exists(schema)
     # should be available now
     eth = ie_storage.load_schema("ethereum")
diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py
index a1334ba1da..5366d8b06f 100644
---
a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -218,9 +218,9 @@ def assert_package_info( def prepare_eth_import_folder(storage: SchemaStorage) -> Schema: - eth_V9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") # remove processing hints before installing as import schema # ethereum schema is a "dirty" schema with processing hints - eth = Schema.from_dict(eth_V9, remove_processing_hints=True) + eth = Schema.from_dict(eth_V11, remove_processing_hints=True) storage._export_schema(eth, storage.config.import_schema_path) return eth diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index e3098a1a77..9eeded1229 100644 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -26,6 +26,7 @@ get_exception_trace, get_exception_trace_chain, update_dict_nested, + removeprefix, ) @@ -440,3 +441,11 @@ def _function_test(a, *, b=None): except Exception as exc: assert str(exc) == "wrong type" assert is_typeerror_due_to_wrong_call(exc, function_typeerror_exc) is False + + +def test_removeprefix() -> None: + assert removeprefix("a_data", "a_") == "data" + assert removeprefix("a_data", "a_data") == "" + assert removeprefix("a_data", "a_data_1") == "a_data" + assert removeprefix("", "a_data_1") == "" + assert removeprefix("a_data", "") == "a_data" diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index f3ebb02b46..6899d8d5fe 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -111,7 +111,7 @@ def test_doc() -> TTestRecord: def test_validate_schema_cases() -> None: with open( - "tests/common/cases/schemas/eth/ethereum_schema_v10.yml", mode="r", encoding="utf-8" + "tests/common/cases/schemas/eth/ethereum_schema_v11.yml", mode="r", encoding="utf-8" ) as f: schema_dict: TStoredSchema = yaml.safe_load(f) diff --git a/tests/common/utils.py b/tests/common/utils.py index 9b5e6bccce..a0760ffe86 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -19,11 +19,11 @@ def IMPORTED_VERSION_HASH_ETH_V10() -> str: # for import schema tests, change when upgrading the schema version - eth_V10 = load_yml_case("schemas/eth/ethereum_schema_v10") - assert eth_V10["version_hash"] == "veEmgbCPXCIiqyfabeQWwz6UIQ2liETv7LLMpyktCos=" + eth_V11 = load_yml_case("schemas/eth/ethereum_schema_v11") + assert eth_V11["version_hash"] == "XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI=" # remove processing hints before installing as import schema # ethereum schema is a "dirty" schema with processing hints - eth = Schema.from_dict(eth_V10, remove_processing_hints=True) + eth = Schema.from_dict(eth_V11, remove_processing_hints=True) return eth.stored_version_hash diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index d224088f8b..e20260bfe7 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -1,6 +1,6 @@ version: 18 -version_hash: veEmgbCPXCIiqyfabeQWwz6UIQ2liETv7LLMpyktCos= -engine_version: 10 +version_hash: XfkJ8E1tZzG/Sb3lfEZrEVshTMKdB7JpOP2HA7eS6EI= +engine_version: 11 name: ethereum tables: _dlt_loads: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 5dc4304a63..a14b4a9602 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -112,9 +112,9 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == 
s.name # the schema in the associated file has this hash - eth_v9 = load_yml_case("schemas/eth/ethereum_schema_v9") + eth_v11 = load_yml_case("schemas/eth/ethereum_schema_v11") # source removes processing hints so we do - reference_schema = Schema.from_dict(eth_v9, remove_processing_hints=True) + reference_schema = Schema.from_dict(eth_v11, remove_processing_hints=True) assert schema.stored_version_hash == reference_schema.stored_version_hash diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 30df12ae17..725872b621 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -219,8 +219,74 @@ def some_data(created_at=dlt.sources.incremental("created_at")): assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] +def test_pandas_index_as_dedup_key() -> None: + from dlt.common.libs.pandas import pandas_to_arrow, pandas as pd + + some_data, p = _make_dedup_pipeline("pandas") + + # no index + no_index_r = some_data.with_name(new_name="no_index") + p.run(no_index_r) + p.run(no_index_r) + data_ = p._dataset().no_index.arrow() + assert data_.schema.names == ["created_at", "id"] + assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] + + # unnamed index: explicitly converted + unnamed_index_r = some_data.with_name(new_name="unnamed_index").add_map( + lambda df: pandas_to_arrow(df, preserve_index=True) + ) + # use it (as in arrow table) to deduplicate + unnamed_index_r.incremental.primary_key = "__index_level_0__" + p.run(unnamed_index_r) + p.run(unnamed_index_r) + data_ = p._dataset().unnamed_index.arrow() + assert data_.schema.names == ["created_at", "id", "index_level_0"] + # indexes 2 and 3 are removed from second batch because they were in the previous batch + # and the created_at overlapped so they got deduplicated + assert data_["index_level_0"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] + + def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: + df_.index = pd.RangeIndex(start=0, stop=len(df_), step=1, name="order_id") + return df_ + + # named index explicitly converted + named_index_r = some_data.with_name(new_name="named_index").add_map( + lambda df: pandas_to_arrow(_make_named_index(df), preserve_index=True) + ) + # use it (as in arrow table) to deduplicate + named_index_r.incremental.primary_key = "order_id" + p.run(named_index_r) + p.run(named_index_r) + data_ = p._dataset().named_index.arrow() + assert data_.schema.names == ["created_at", "id", "order_id"] + assert data_["order_id"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] + + # named index explicitly converted + named_index_impl_r = some_data.with_name(new_name="named_index_impl").add_map( + lambda df: _make_named_index(df) + ) + p.run(named_index_impl_r) + p.run(named_index_impl_r) + data_ = p._dataset().named_index_impl.arrow() + assert data_.schema.names == ["created_at", "id"] + assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] + + @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_unique_rows_by_hash_are_deduplicated(item_type: TestDataItemFormat) -> None: + some_data, p = _make_dedup_pipeline(item_type) + p.run(some_data()) + p.run(some_data()) + + with p.sql_client() as c: + with c.execute_query("SELECT created_at, id FROM some_data ORDER BY created_at, id") as cur: + rows = cur.fetchall() + print(rows) + assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] + + +def _make_dedup_pipeline(item_type: TestDataItemFormat): data1 = [ 
{"created_at": 1, "id": "a"}, {"created_at": 2, "id": "b"}, @@ -235,7 +301,6 @@ def test_unique_rows_by_hash_are_deduplicated(item_type: TestDataItemFormat) -> {"created_at": 3, "id": "f"}, {"created_at": 4, "id": "g"}, ] - source_items1 = data_to_item_format(item_type, data1) source_items2 = data_to_item_format(item_type, data2) @@ -250,14 +315,7 @@ def some_data(created_at=dlt.sources.incremental("created_at")): pipeline_name=uniq_id(), destination=dlt.destinations.duckdb(credentials=duckdb.connect(":memory:")), ) - p.run(some_data()) - p.run(some_data()) - - with p.sql_client() as c: - with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: - rows = cur.fetchall() - - assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] + return some_data, p def test_nested_cursor_path() -> None: diff --git a/tests/libs/pyarrow/test_pyarrow_normalizer.py b/tests/libs/pyarrow/test_pyarrow_normalizer.py index 32ee5fdafc..c81d8cd974 100644 --- a/tests/libs/pyarrow/test_pyarrow_normalizer.py +++ b/tests/libs/pyarrow/test_pyarrow_normalizer.py @@ -5,12 +5,12 @@ from dlt.common.libs.pyarrow import normalize_py_arrow_item, NameNormalizationCollision from dlt.common.schema.utils import new_column, TColumnSchema -from dlt.common.schema.normalizers import explicit_normalizers, import_normalizers +from dlt.common.schema.normalizers import configured_normalizers, import_normalizers from dlt.common.destination import DestinationCapabilitiesContext def _normalize(table: pa.Table, columns: List[TColumnSchema]) -> pa.Table: - _, naming, _ = import_normalizers(explicit_normalizers()) + _, naming, _ = import_normalizers(configured_normalizers()) caps = DestinationCapabilitiesContext() columns_schema = {c["name"]: c for c in columns} return normalize_py_arrow_item(table, columns_schema, naming, caps) diff --git a/tests/load/clickhouse/test_clickhouse_configuration.py b/tests/load/clickhouse/test_clickhouse_configuration.py index ad33062f11..eabc3094bd 100644 --- a/tests/load/clickhouse/test_clickhouse_configuration.py +++ b/tests/load/clickhouse/test_clickhouse_configuration.py @@ -56,7 +56,8 @@ def test_clickhouse_configuration() -> None: def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: """Test experimental settings are set correctly for the session.""" - conn = client.sql_client.open_connection() + # with client.sql_client.open_connection() as conn: + conn = client.sql_client.native_connection cursor1 = conn.cursor() cursor2 = conn.cursor() @@ -69,3 +70,26 @@ def test_clickhouse_connection_settings(client: ClickHouseClient) -> None: assert ("allow_experimental_lightweight_delete", "1") in res assert ("enable_http_compression", "1") in res assert ("date_time_input_format", "best_effort") in res + + +def test_client_has_dataset(client: ClickHouseClient) -> None: + # with client.sql_client as sql_client: + assert client.sql_client.has_dataset() + separator = client.config.dataset_table_separator + + def _assert_has_dataset() -> None: + assert not client.sql_client.has_dataset() + client.sql_client.create_dataset() + assert client.sql_client.has_dataset() + client.sql_client.drop_dataset() + assert not client.sql_client.has_dataset() + + try: + # change separator + client.config.dataset_table_separator = "_" + _assert_has_dataset() + + client.config.dataset_table_separator = "" + _assert_has_dataset() + finally: + client.config.dataset_table_separator = separator diff --git a/tests/load/conftest.py b/tests/load/conftest.py index 
76a7248e5b..c52fea607d 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -9,7 +9,7 @@ drop_pipeline, empty_schema, ) -from tests.utils import preserve_environ, patch_home_dir +from tests.utils import preserve_environ, patch_home_dir, autouse_test_storage @pytest.fixture(scope="function", params=DEFAULT_BUCKETS) diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index a9479a0bb9..49475ce43f 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -19,7 +19,7 @@ from dlt.pipeline.exceptions import PipelineStepFailed from tests.pipeline.utils import assert_table -from tests.utils import patch_home_dir, autouse_test_storage, TEST_STORAGE_ROOT +from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/filesystem/test_aws_credentials.py b/tests/load/filesystem/test_aws_credentials.py index b782e76b7e..1113b9b35d 100644 --- a/tests/load/filesystem/test_aws_credentials.py +++ b/tests/load/filesystem/test_aws_credentials.py @@ -9,7 +9,6 @@ from tests.common.configuration.utils import environment from tests.load.utils import ALL_FILESYSTEM_DRIVERS -from tests.utils import autouse_test_storage # mark all tests as essential, do not remove pytestmark = pytest.mark.essential diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index d0a29d03d0..afcd9105a8 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -28,7 +28,6 @@ from tests.common.configuration.utils import environment from tests.common.storages.utils import TEST_SAMPLE_FILES, assert_sample_files from tests.load.utils import ALL_FILESYSTEM_DRIVERS, AWS_BUCKET, WITH_GDRIVE_BUCKETS -from tests.utils import autouse_test_storage from tests.load.filesystem.utils import self_signed_cert diff --git a/tests/load/pipeline/conftest.py b/tests/load/pipeline/conftest.py index a2ba65494b..80c418ed22 100644 --- a/tests/load/pipeline/conftest.py +++ b/tests/load/pipeline/conftest.py @@ -1,2 +1,2 @@ -from tests.utils import autouse_test_storage, duckdb_pipeline_location +from tests.utils import duckdb_pipeline_location from tests.pipeline.utils import drop_dataset_from_env diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 2925bfac6f..8b6fc751d9 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -80,7 +80,7 @@ def test_merge_on_keys_in_schema( skip_if_not_supported(merge_strategy, p.destination) - with open("tests/common/cases/schemas/eth/ethereum_schema_v9.yml", "r", encoding="utf-8") as f: + with open("tests/common/cases/schemas/eth/ethereum_schema_v11.yml", "r", encoding="utf-8") as f: schema = dlt.Schema.from_dict(yaml.safe_load(f)) # make block uncles unseen to trigger filtering loader in loader for nested tables diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py index 2a5b9ed296..962c501619 100644 --- a/tests/load/pipeline/test_scd2.py +++ b/tests/load/pipeline/test_scd2.py @@ -11,7 +11,7 @@ from dlt.common.pipeline import LoadInfo from dlt.common.data_types.typing import TDataType from dlt.common.schema.typing import DEFAULT_VALIDITY_COLUMN_NAMES -from dlt.common.normalizers.json.relational import DataItemNormalizer +from dlt.common.normalizers.json.helpers import 
get_row_hash from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention from dlt.common.time import ensure_pendulum_datetime, reduce_pendulum_datetime_precision from dlt.extract.resource import DltResource @@ -30,7 +30,6 @@ from tests.utils import TPythonTableFormat -get_row_hash = DataItemNormalizer.get_row_hash FROM, TO = DEFAULT_VALIDITY_COLUMN_NAMES diff --git a/tests/load/qdrant/utils.py b/tests/load/qdrant/utils.py index e96e06be87..8a3b37dd48 100644 --- a/tests/load/qdrant/utils.py +++ b/tests/load/qdrant/utils.py @@ -61,6 +61,5 @@ def has_collections(client): if has_collections(client): client.drop_storage() - p._wipe_working_folder() # deactivate context Container()[PipelineContext].deactivate() diff --git a/tests/load/redshift/test_redshift_client.py b/tests/load/redshift/test_redshift_client.py index b60c6a8956..ef0acb33a4 100644 --- a/tests/load/redshift/test_redshift_client.py +++ b/tests/load/redshift/test_redshift_client.py @@ -21,7 +21,7 @@ from dlt.destinations.impl.redshift.redshift import RedshiftClient, psycopg2 from tests.common.utils import COMMON_TEST_CASES_PATH -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage, skipifpypy +from tests.utils import TEST_STORAGE_ROOT, skipifpypy from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage # mark all tests as essential, do not remove diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 9f64722a1e..6f699436b3 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -36,7 +36,7 @@ from dlt.common.time import ensure_pendulum_datetime from tests.cases import table_update_and_row, assert_all_data_types_row -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage +from tests.utils import TEST_STORAGE_ROOT from tests.common.utils import load_json_case from tests.load.utils import ( TABLE_UPDATE, diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index f5a8d51baf..1a9c8a383b 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -10,6 +10,7 @@ from typing import List from functools import reduce +from dlt.common.storages.file_storage import FileStorage from tests.load.utils import ( destinations_configs, DestinationTestConfiguration, @@ -18,7 +19,7 @@ MEMORY_BUCKET, ) from dlt.destinations import filesystem -from tests.utils import TEST_STORAGE_ROOT +from tests.utils import TEST_STORAGE_ROOT, clean_test_storage from dlt.common.destination.reference import TDestinationReferenceArg from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException from tests.load.utils import drop_pipeline_data @@ -48,8 +49,14 @@ def _expected_chunk_count(p: Pipeline) -> List[int]: return [_chunk_size(p), _total_records(p) - _chunk_size(p)] +# this also disables autouse_test_storage on function level which destroys some tests here @pytest.fixture(scope="session") -def populated_pipeline(request) -> Any: +def autouse_test_storage() -> FileStorage: + return clean_test_storage() + + +@pytest.fixture(scope="session") +def populated_pipeline(request, autouse_test_storage) -> Any: """fixture that returns a pipeline object populated with the example data""" destination_config = cast(DestinationTestConfiguration, request.param) diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 05c10a900f..ee48222da9 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ 
-22,7 +22,7 @@ from dlt.destinations.typing import TNativeConn from dlt.common.time import ensure_pendulum_datetime, to_py_datetime -from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage +from tests.utils import TEST_STORAGE_ROOT from tests.load.utils import ( yield_client_with_storage, prepare_table, diff --git a/tests/load/weaviate/utils.py b/tests/load/weaviate/utils.py index b391c2fa38..b98b55fcfa 100644 --- a/tests/load/weaviate/utils.py +++ b/tests/load/weaviate/utils.py @@ -95,6 +95,5 @@ def schema_has_classes(client): if schema_has_classes(client): client.drop_storage() - p._wipe_working_folder() # deactivate context Container()[PipelineContext].deactivate() diff --git a/tests/pipeline/cases/github_pipeline/github_rev.py b/tests/pipeline/cases/github_pipeline/github_rev.py new file mode 100644 index 0000000000..4ebe3048f4 --- /dev/null +++ b/tests/pipeline/cases/github_pipeline/github_rev.py @@ -0,0 +1,26 @@ +import dlt + + +@dlt.source +def github(): + @dlt.resource( + table_name="issues__2", + primary_key="id", + ) + def load_issues(): + # return data with path separators + yield [ + { + "id": 100, + "issue__id": 10, + } + ] + + return load_issues + + +if __name__ == "__main__": + p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3") + github_source = github() + info = p.run(github_source) + print(info) diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index a3d8b489c9..fbd4d412b3 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -484,3 +484,59 @@ def test_scd2_pipeline_update(test_storage: FileStorage) -> None: assert len(issues_retired) == 1 assert issues_retired[0][0] == 6272 # print(pipeline.default_schema.to_pretty_yaml()) + + +def test_normalize_path_separator_legacy_behavior(test_storage: FileStorage) -> None: + """Pre 1.4.1 normalized identifiers with path separators into single underscore, + this behavior must be preserved if the schema is updated. 
+ """ + shutil.copytree("tests/pipeline/cases/github_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True) + + # execute in test storage + with set_working_dir(TEST_STORAGE_ROOT): + # store dlt data in test storage (like patch_home_dir) + with custom_environ({DLT_DATA_DIR: dlt.current.run().data_dir}): + # save database outside of pipeline dir + with custom_environ( + {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} + ): + venv_dir = tempfile.mkdtemp() + # create virtual env with dlt 0.3.0, a version before the current schema upgrade + with Venv.create(venv_dir, ["dlt[duckdb]==0.3.0"]) as venv: + venv._install_deps(venv.context, ["duckdb" + "==" + pkg_version("duckdb")]) + try: + print( + venv.run_script("../tests/pipeline/cases/github_pipeline/github_rev.py") + ) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + + venv = Venv.restore_current() + # load same data again + try: + print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_rev.py")) + except CalledProcessError as cpe: + print(f"script stdout: {cpe.stdout}") + print(f"script stderr: {cpe.stderr}") + raise + pipeline = dlt.attach(GITHUB_PIPELINE_NAME) + print(pipeline.default_schema.to_pretty_yaml()) + # migration set the backward compat flag + assert ( + pipeline.default_schema._normalizers_config["use_break_path_on_normalize"] + is False + ) + # make sure that schema didn't change + assert pipeline.default_schema.data_table_names() == ["issues_2"] + table_ = pipeline.default_schema.tables["issues_2"] + assert set(table_["columns"].keys()) == { + "id", + "issue_id", + "_dlt_id", + "_dlt_load_id", + } + # datasets must be the same + data_ = pipeline._dataset().issues_2.select("issue_id", "id").fetchall() + print(data_) diff --git a/tests/normalize/test_max_nesting.py b/tests/pipeline/test_max_nesting.py similarity index 100% rename from tests/normalize/test_max_nesting.py rename to tests/pipeline/test_max_nesting.py diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 3832bad81a..e58db64e5e 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1711,6 +1711,111 @@ def nested_resource(): assert pipeline.last_trace.last_normalize_info.row_counts["flattened_dict__values"] == 4 + +def test_column_name_with_break_path() -> None: + """Tests how normalization behaves for names with a break path, i.e. __ + all the names must be idempotent + """ + pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking") + info = pipeline.run( + [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path" + ) + assert_load_info(info) + # table name was preserved + table = pipeline.default_schema.get_table("custom__path") + assert pipeline.default_schema.data_table_names() == ["custom__path"] + # column name was preserved + assert table["columns"]["example_custom_field__c"]["data_type"] == "text" + assert set(table["columns"]) == {"example_custom_field__c", "reg_c", "_dlt_id", "_dlt_load_id"} + + # get data + assert_data_table_counts(pipeline, {"custom__path": 1}) + # get data via dataset with dbapi + data_ = pipeline._dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() + assert data_ == [("custom", "c")] + + +def test_column_name_with_break_path_legacy() -> None: + """Tests how normalization behaves for names with a break path, i.e. __ + in legacy mode table and column names were normalized as a single identifier + """ +
os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False" + pipeline = dlt.pipeline(destination="duckdb", pipeline_name="breaking") + info = pipeline.run( + [{"example_custom_field__c": "custom", "reg_c": "c"}], table_name="custom__path" + ) + assert_load_info(info) + # table name was contracted + table = pipeline.default_schema.get_table("custom_path") + assert pipeline.default_schema.data_table_names() == ["custom_path"] + # column name was contracted + assert table["columns"]["example_custom_field_c"]["data_type"] == "text" + assert set(table["columns"]) == {"example_custom_field_c", "reg_c", "_dlt_id", "_dlt_load_id"} + + # get data + assert_data_table_counts(pipeline, {"custom_path": 1}) + # get data via dataset with dbapi + data_ = pipeline._dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() + assert data_ == [("custom", "c")] + + +def test_column_hint_with_break_path() -> None: + """From v 1.4.1 upwards the name normalizer is idempotent on break paths""" + now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10")) + + @dlt.resource( + name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}] + ) + def flattened_dict(): + for delta in range(4): + yield { + "delta": delta, + "value": {"timestamp": now.timestamp() + delta}, + } + + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.run(flattened_dict()) + assert_load_info(info) + + assert pipeline.default_schema.data_table_names() == ["flattened__dict"] + table = pipeline.default_schema.get_table("flattened__dict") + assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} + assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" + + # make sure data is there + data_ = pipeline._dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() + assert data_ == [(0, now)] + + +def test_column_hint_with_break_path_legacy() -> None: + """From v 1.4.1 upwards the name normalizer is idempotent on break paths""" + + os.environ["SCHEMA__USE_BREAK_PATH_ON_NORMALIZE"] = "False" + now = cast(pendulum.DateTime, pendulum.parse("2024-11-29T10:10")) + + @dlt.resource( + name="flattened__dict", columns=[{"name": "value__timestamp", "data_type": "timestamp"}] + ) + def flattened_dict(): + for delta in range(4): + yield { + "delta": delta, + "value": {"timestamp": now.timestamp() + delta}, + } + + pipeline = dlt.pipeline(destination="duckdb") + info = pipeline.run(flattened_dict()) + assert_load_info(info) + # table name contracted + assert pipeline.default_schema.data_table_names() == ["flattened_dict"] + table = pipeline.default_schema.get_table("flattened_dict") + # hint applied + assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} + assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" + # make sure data is there + data_ = pipeline._dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() + assert data_ == [(0, now)] + + def test_empty_rows_are_included() -> None: """Empty rows where all values are `None` or empty dicts create rows in the dataset with `NULL` in all columns From b4d807fc059591720f1ea14e73340e9a98041225 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 2 Dec 2024 16:26:01 +0100 Subject: [PATCH 4/4] bumps to version 1.4.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8afb332422..7377b03fde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@
[tool.poetry] name = "dlt" -version = "1.4.1a1" +version = "1.4.1" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. "] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Anton Burnashev ", "David Scharf " ]