From 0e251028e54f51137ee4ecb39d5d1c08b94e8a10 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 28 Sep 2023 16:39:30 +0200 Subject: [PATCH 01/36] first iceberg prototype --- dlt/destinations/athena/athena.py | 41 ++++++++++++++----- dlt/destinations/athena/configuration.py | 1 + dlt/destinations/bigquery/bigquery.py | 4 +- dlt/destinations/job_client_impl.py | 33 +++++++++++---- dlt/destinations/mssql/mssql.py | 4 +- dlt/destinations/postgres/postgres.py | 4 +- dlt/destinations/snowflake/snowflake.py | 7 ++-- dlt/destinations/sql_jobs.py | 24 +++++++---- tests/load/test_iceberg.py | 52 ++++++++++++++++++++++++ 9 files changed, 133 insertions(+), 37 deletions(-) create mode 100644 tests/load/test_iceberg.py diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index ed8364aa3a..69175faabb 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -16,21 +16,21 @@ from dlt.common.utils import without_none from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, Schema -from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition from dlt.common.schema.utils import table_schema_has_type from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import LoadJob -from dlt.common.destination.reference import TLoadJobState +from dlt.common.destination.reference import LoadJob, FollowupJob +from dlt.common.destination.reference import TLoadJobState, NewLoadJob from dlt.common.storages import FileStorage from dlt.common.data_writers.escape import escape_bigquery_identifier - +from dlt.destinations.sql_jobs import SqlStagingCopyJob from dlt.destinations.typing import DBApi, DBTransaction from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, LoadJobTerminalException from dlt.destinations.athena import capabilities from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error from dlt.destinations.typing import DBApiCursor -from dlt.destinations.job_client_impl import SqlJobClientBase, StorageSchemaInfo +from dlt.destinations.job_client_impl import SqlJobClientWithStaging from dlt.destinations.athena.configuration import AthenaClientConfiguration from dlt.destinations.type_mapping import TypeMapper from dlt.destinations import path_utils @@ -121,7 +121,7 @@ def __init__(self) -> None: DLTAthenaFormatter._INSTANCE = self -class DoNothingJob(LoadJob): +class DoNothingJob(LoadJob, FollowupJob): """The most lazy class of dlt""" def __init__(self, file_path: str) -> None: @@ -135,6 +135,7 @@ def exception(self) -> str: # this part of code should be never reached raise NotImplementedError() + class AthenaSQLClient(SqlClientBase[Connection]): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -274,9 +275,9 @@ def has_dataset(self) -> bool: query = f"""SHOW DATABASES LIKE {self.fully_qualified_dataset_name()};""" rows = self.execute_sql(query) return len(rows) > 0 + - -class AthenaClient(SqlJobClientBase): +class AthenaClient(SqlJobClientWithStaging): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -307,12 +308,22 @@ def _get_column_def_sql(self, c: TColumnSchema) -> str: def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]: + 
create_only_iceberg_tables = self.config.iceberg_bucket_url is not None and not self.in_staging_mode + bucket = self.config.staging_config.bucket_url - dataset = self.sql_client.dataset_name + if create_only_iceberg_tables: + bucket = self.config.iceberg_bucket_url + + print(table_name) + print(bucket) + + # TODO: we need to strip the staging layout from the table name, find a better way! + dataset = self.sql_client.dataset_name.replace("_staging", "") sql: List[str] = [] # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries - is_iceberg = self.schema.tables[table_name].get("write_disposition", None) == "skip" + # or if we are in iceberg mode, we create iceberg tables for all tables + is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or create_only_iceberg_tables columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) # this will fail if the table prefix is not properly defined @@ -348,6 +359,16 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> job = DoNothingJob(file_path) return job + def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: + """update destination tables from staging tables""" + return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": replace}) + + def get_stage_dispositions(self) -> List[TWriteDisposition]: + # in iceberg mode, we always use staging tables + if self.config.iceberg_bucket_url is not None: + return ["append", "replace", "merge"] + return [] + @staticmethod def is_dbapi_exception(ex: Exception) -> bool: return isinstance(ex, Error) diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/athena/configuration.py index f6e6fa3b51..3c175cba9b 100644 --- a/dlt/destinations/athena/configuration.py +++ b/dlt/destinations/athena/configuration.py @@ -9,6 +9,7 @@ class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_name: Final[str] = "athena" # type: ignore[misc] query_result_bucket: str = None + iceberg_bucket_url: Optional[str] = None credentials: AwsCredentials = None athena_work_group: Optional[str] = None aws_data_catalog: Optional[str] = "awsdatacatalog" diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/bigquery/bigquery.py index 473fee2113..387e450184 100644 --- a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/bigquery/bigquery.py @@ -19,7 +19,7 @@ from dlt.destinations.bigquery import capabilities from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration from dlt.destinations.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS -from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob +from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob, SqlJobParams from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper @@ -138,7 +138,7 @@ def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: st class BigqueryStagingCopyJob(SqlStagingCopyJob): @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: sql: List[str] = [] for table in table_chain: with 
sql_client.with_staging_dataset(staging=True): diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index c082eefb93..90cdf1ffcd 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -148,26 +148,34 @@ def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition] def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: return SqlMergeJob.from_table_chain(table_chain, self.sql_client) - def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: + def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: """update destination tables from staging tables""" - return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client) + if not replace: + return None + return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True}) def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: """optimized replace strategy, defaults to _create_staging_copy_job for the basic client for some destinations there are much faster destination updates at the cost of dropping tables possible""" - return self._create_staging_copy_job(table_chain) + return self._create_staging_copy_job(table_chain, True) def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs(table_chain) write_disposition = table_chain[0]["write_disposition"] - if write_disposition == "merge": - jobs.append(self._create_merge_job(table_chain)) + if write_disposition == "append": + if job := self._create_staging_copy_job(table_chain, False): + jobs.append(job) + elif write_disposition == "merge": + if job := self._create_merge_job(table_chain): + jobs.append(job) elif write_disposition == "replace" and self.config.replace_strategy == "insert-from-staging": - jobs.append(self._create_staging_copy_job(table_chain)) + if job := self._create_staging_copy_job(table_chain, True): + jobs.append(job) elif write_disposition == "replace" and self.config.replace_strategy == "staging-optimized": - jobs.append(self._create_optimized_replace_job(table_chain)) + if job := self._create_optimized_replace_job(table_chain): + jobs.append(job) return jobs def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: @@ -431,10 +439,17 @@ def _commit_schema_update(self, schema: Schema, schema_str: str) -> None: class SqlJobClientWithStaging(SqlJobClientBase, WithStagingDataset): + + in_staging_mode: bool = False + @contextlib.contextmanager def with_staging_dataset(self)-> Iterator["SqlJobClientBase"]: - with self.sql_client.with_staging_dataset(True): - yield self + try: + with self.sql_client.with_staging_dataset(True): + self.in_staging_mode = True + yield self + finally: + self.in_staging_mode = False def get_stage_dispositions(self) -> List[TWriteDisposition]: """Returns a list of dispositions that require staging tables to be populated""" diff --git a/dlt/destinations/mssql/mssql.py b/dlt/destinations/mssql/mssql.py index 5ed3b706b8..67b51f885c 100644 --- a/dlt/destinations/mssql/mssql.py +++ b/dlt/destinations/mssql/mssql.py @@ -8,7 +8,7 @@ from dlt.common.schema.typing import TTableSchema, TColumnType from dlt.common.utils import uniq_id -from dlt.destinations.sql_jobs import 
SqlStagingCopyJob, SqlMergeJob +from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlMergeJob, SqlJobParams from dlt.destinations.insert_job_client import InsertValuesJobClient @@ -83,7 +83,7 @@ def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[i class MsSqlStagingCopyJob(SqlStagingCopyJob): @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): diff --git a/dlt/destinations/postgres/postgres.py b/dlt/destinations/postgres/postgres.py index ead5ab6639..b6c716754f 100644 --- a/dlt/destinations/postgres/postgres.py +++ b/dlt/destinations/postgres/postgres.py @@ -7,7 +7,7 @@ from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.schema.typing import TTableSchema, TColumnType -from dlt.destinations.sql_jobs import SqlStagingCopyJob +from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams from dlt.destinations.insert_job_client import InsertValuesJobClient @@ -79,7 +79,7 @@ def from_db_type(self, db_type: str, precision: Optional[int] = None, scale: Opt class PostgresStagingCopyJob(SqlStagingCopyJob): @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): diff --git a/dlt/destinations/snowflake/snowflake.py b/dlt/destinations/snowflake/snowflake.py index b9046cde75..69432bc696 100644 --- a/dlt/destinations/snowflake/snowflake.py +++ b/dlt/destinations/snowflake/snowflake.py @@ -17,7 +17,7 @@ from dlt.destinations.snowflake import capabilities from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient -from dlt.destinations.sql_jobs import SqlStagingCopyJob +from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase @@ -157,13 +157,12 @@ def exception(self) -> str: class SnowflakeStagingCopyJob(SqlStagingCopyJob): @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) - # drop destination table sql.append(f"DROP TABLE IF EXISTS {table_name};") # recreate destination table with data cloned from staging table sql.append(f"CREATE TABLE {table_name} CLONE {staging_table_name};") @@ -206,7 +205,7 @@ def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema]) -> List[str return ["ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c) for c in new_columns)] def _create_optimized_replace_job(self, 
table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client) + return SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True}) def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool, separate_alters: bool = False) -> List[str]: sql = super()._get_table_update_sql(table_name, new_columns, generate_alter) diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index c1137ee9ad..784c0e3a05 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, List, Sequence, Tuple, cast +from typing import Any, Callable, List, Sequence, Tuple, cast, TypedDict, Optional import yaml from dlt.common.runtime.logger import pretty_format_exception @@ -11,24 +11,30 @@ from dlt.destinations.job_impl import NewLoadJobImpl from dlt.destinations.sql_client import SqlClientBase +class SqlJobParams(TypedDict): + replace: Optional[bool] + +DEFAULTS: SqlJobParams = { + "replace": False +} class SqlBaseJob(NewLoadJobImpl): """Sql base job for jobs that rely on the whole tablechain""" failed_text: str = "" @classmethod - def from_table_chain(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> NewLoadJobImpl: + def from_table_chain(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> NewLoadJobImpl: """Generates a list of sql statements, that will be executed by the sql client when the job is executed in the loader. The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). """ - + params = cast(SqlJobParams, {**DEFAULTS, **(params or {})}) # type: ignore top_table = table_chain[0] file_info = ParsedLoadJobFileName(top_table["name"], uniq_id()[:10], 0, "sql") try: # Remove line breaks from multiline statements and write one SQL statement per line in output file # to support clients that need to execute one statement at a time (i.e. 
snowflake) - sql = [' '.join(stmt.splitlines()) for stmt in cls.generate_sql(table_chain, sql_client)] + sql = [' '.join(stmt.splitlines()) for stmt in cls.generate_sql(table_chain, sql_client, params)] job = cls(file_info.job_id(), "running") job._save_text_file("\n".join(sql)) except Exception: @@ -39,7 +45,7 @@ def from_table_chain(cls, table_chain: Sequence[TTableSchema], sql_client: SqlCl return job @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: pass @@ -48,14 +54,16 @@ class SqlStagingCopyJob(SqlBaseJob): failed_text: str = "Tried to generate a staging copy sql job for the following tables:" @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) columns = ", ".join(map(sql_client.capabilities.escape_identifier, get_columns_names_with_prop(table, "name"))) - sql.append(sql_client._truncate_table_sql(table_name)) + if params["replace"]: + sql.append(sql_client._truncate_table_sql(table_name)) + print(f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name};") sql.append(f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name};") return sql @@ -64,7 +72,7 @@ class SqlMergeJob(SqlBaseJob): failed_text: str = "Tried to generate a merge sql job for the following tables:" @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: """Generates a list of sql statements that merge the data in staging dataset with the data in destination dataset. The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). 
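As a rough illustration of the staging-copy pattern added above, a minimal standalone sketch follows: the optional SqlJobParams "replace" flag decides whether the destination table is emptied before rows are inserted from the matching staging table. The table names, column list and the literal TRUNCATE statement are assumptions for the example only; the real SqlStagingCopyJob resolves names through the destination's sql_client and uses its _truncate_table_sql helper.

from typing import List, Optional, TypedDict

class SqlJobParams(TypedDict, total=False):
    replace: Optional[bool]

def staging_copy_sql(table_name: str, staging_table_name: str, columns: List[str],
                     params: Optional[SqlJobParams] = None) -> List[str]:
    # Merge caller params over the defaults, as from_table_chain does above.
    merged: SqlJobParams = {"replace": False, **(params or {})}
    column_list = ", ".join(columns)
    sql: List[str] = []
    if merged["replace"]:
        # "replace" empties the destination table first (truncate-and-insert).
        sql.append(f"TRUNCATE TABLE {table_name};")
    # In both cases all rows are copied over from the staging table.
    sql.append(f"INSERT INTO {table_name}({column_list}) SELECT {column_list} FROM {staging_table_name};")
    return sql

# Hypothetical usage: with replace=False the copy behaves like a plain append.
print(staging_copy_sql("my_dataset.items", "my_dataset_staging.items", ["id", "name"], {"replace": True}))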
diff --git a/tests/load/test_iceberg.py b/tests/load/test_iceberg.py new file mode 100644 index 0000000000..35f4996ecd --- /dev/null +++ b/tests/load/test_iceberg.py @@ -0,0 +1,52 @@ +""" +Temporary test file for iceberg +""" + +import pytest +import os +import datetime # noqa: I251 +from typing import Iterator, Any + +import dlt +from dlt.common import pendulum +from dlt.common.utils import uniq_id +from tests.load.pipeline.utils import load_table_counts +from tests.cases import table_update_and_row, assert_all_data_types_row +from tests.pipeline.utils import assert_load_info + +from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration + +def test_iceberg() -> None: + + os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = "s3://dlt-ci-test-bucket" + os.environ['DESTINATION__ATHENA__ICEBERG_BUCKET_URL'] = "s3://dlt-ci-test-bucket/iceberg" + + pipeline = dlt.pipeline(pipeline_name="aaathena", destination="athena", staging="filesystem", full_refresh=True) + + @dlt.resource(name="items", write_disposition="append") + def items(): + yield { + "id": 1, + "name": "item", + "sub_items": [{ + "id": 101, + "name": "sub item 101" + },{ + "id": 101, + "name": "sub item 102" + }] + } + + print(pipeline.run(items)) + + # see if we have athena tables with items + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values() ]) + assert table_counts["items"] == 1 + assert table_counts["items__sub_items"] == 2 + assert table_counts["_dlt_loads"] == 1 + + pipeline.run(items) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values() ]) + assert table_counts["items"] == 2 + assert table_counts["items__sub_items"] == 4 + assert table_counts["_dlt_loads"] == 2 \ No newline at end of file From 26f9e41b2ea36d567ca77962ed40d0720d355733 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 28 Sep 2023 17:20:35 +0200 Subject: [PATCH 02/36] fix linting and clearing of staging tables --- dlt/common/destination/reference.py | 4 ++++ dlt/destinations/athena/athena.py | 12 +++++++----- dlt/load/load.py | 4 +++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 0bf4088b27..a26ede267d 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -248,6 +248,10 @@ def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition] # in the base job, all replace strategies are treated the same, see filesystem for example return ["replace"] + def get_truncate_destination_table_dispositions_for_staging(self) -> List[TWriteDisposition]: + # some clients need to additionally be able to get the staging destination to truncate tables + return [] + def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 69175faabb..30c3b7e907 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -275,7 +275,7 @@ def has_dataset(self) -> bool: query = f"""SHOW DATABASES LIKE {self.fully_qualified_dataset_name()};""" rows = self.execute_sql(query) return len(rows) > 0 - + class AthenaClient(SqlJobClientWithStaging): @@ -314,15 +314,12 @@ def _get_table_update_sql(self, table_name: str, new_columns: 
Sequence[TColumnSc if create_only_iceberg_tables: bucket = self.config.iceberg_bucket_url - print(table_name) - print(bucket) - # TODO: we need to strip the staging layout from the table name, find a better way! dataset = self.sql_client.dataset_name.replace("_staging", "") sql: List[str] = [] # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries - # or if we are in iceberg mode, we create iceberg tables for all tables + # or if we are in iceberg mode, we create iceberg tables for all tables is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or create_only_iceberg_tables columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) @@ -369,6 +366,11 @@ def get_stage_dispositions(self) -> List[TWriteDisposition]: return ["append", "replace", "merge"] return [] + def get_truncate_destination_table_dispositions_for_staging(self) -> List[TWriteDisposition]: + if self.config.iceberg_bucket_url is not None: + return ["append", "replace", "merge"] + return ["replace"] + @staticmethod def is_dbapi_exception(ex: Exception) -> bool: return isinstance(ex, Error) diff --git a/dlt/load/load.py b/dlt/load/load.py index 2cae753978..09fe7078ee 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -313,7 +313,9 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: # initialize staging storage if needed if self.staging_destination: with self.get_staging_destination_client(schema) as staging_client: - truncate_tables = self.get_table_chain_tables_for_write_disposition(load_id, schema, staging_client.get_truncate_destination_table_dispositions()) + truncate_dispositions = staging_client.get_truncate_destination_table_dispositions() + truncate_dispositions.extend(job_client.get_truncate_destination_table_dispositions_for_staging()) + truncate_tables = self.get_table_chain_tables_for_write_disposition(load_id, schema, truncate_dispositions) staging_client.initialize_storage(truncate_tables) # update the staging dataset if client supports this if isinstance(job_client, WithStagingDataset): From e199bd1d1ea795d78ceae5c76ddd742769945203 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 28 Sep 2023 21:04:20 +0200 Subject: [PATCH 03/36] disable tests --- tests/load/test_iceberg.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/load/test_iceberg.py b/tests/load/test_iceberg.py index 35f4996ecd..f9cb891fbc 100644 --- a/tests/load/test_iceberg.py +++ b/tests/load/test_iceberg.py @@ -16,6 +16,11 @@ from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration +from tests.utils import skip_if_not_active + +skip_if_not_active("athena") + + def test_iceberg() -> None: os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = "s3://dlt-ci-test-bucket" From b768a6325151ef5142b3b4649803cb7ef3701b63 Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 29 Sep 2023 09:53:41 +0200 Subject: [PATCH 04/36] enable iceberg tests for athena --- tests/load/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/load/utils.py b/tests/load/utils.py index 6790a816fb..370f9789cb 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -48,6 +48,7 @@ class DestinationTestConfiguration: staging: Optional[str] = None file_format: Optional[str] = None bucket_url: Optional[str] = None + iceberg_bucket_url: Optional[str] = None stage_name: Optional[str] = None staging_iam_role: Optional[str] = None extra_info: Optional[str] = None @@ -71,6 +72,7 @@ def setup(self) -> None: 
os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = self.bucket_url or "" os.environ['DESTINATION__STAGE_NAME'] = self.stage_name or "" os.environ['DESTINATION__STAGING_IAM_ROLE'] = self.staging_iam_role or "" + os.environ['DESTINATION__ATHENA__ICEBERG_BUCKET_URL'] = self.iceberg_bucket_url or "" """For the filesystem destinations we disable compression to make analyzing the result easier""" if self.destination == "filesystem": @@ -116,6 +118,7 @@ def destinations_configs( if default_staging_configs or all_staging_configs: destination_configs += [ DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, supports_merge=False), + DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=False, extra_info="iceberg"), DestinationTestConfiguration(destination="redshift", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, staging_iam_role="arn:aws:iam::267388281016:role/redshift_s3_read", extra_info="s3-role"), DestinationTestConfiguration(destination="bigquery", staging="filesystem", file_format="parquet", bucket_url=GCS_BUCKET, extra_info="gcs-authorization"), DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="jsonl", bucket_url=GCS_BUCKET, stage_name="PUBLIC.dlt_gcs_stage", extra_info="gcs-integration"), From 29a6d06fb0daa09341b86c5a69eae3a8b34cab26 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 4 Oct 2023 10:21:02 +0200 Subject: [PATCH 05/36] fix iceberg detection --- dlt/destinations/athena/athena.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 30c3b7e907..91be6df2ba 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -295,6 +295,7 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: self.sql_client: AthenaSQLClient = sql_client # type: ignore self.config: AthenaClientConfiguration = config self.type_mapper = AthenaTypeMapper(self.capabilities) + self.iceberg_mode = not (not self.config.iceberg_bucket_url) def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # never truncate tables in athena @@ -308,10 +309,10 @@ def _get_column_def_sql(self, c: TColumnSchema) -> str: def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]: - create_only_iceberg_tables = self.config.iceberg_bucket_url is not None and not self.in_staging_mode + create_data_iceberg_tables = self.iceberg_mode and not self.in_staging_mode bucket = self.config.staging_config.bucket_url - if create_only_iceberg_tables: + if create_data_iceberg_tables: bucket = self.config.iceberg_bucket_url # TODO: we need to strip the staging layout from the table name, find a better way! 
@@ -320,7 +321,7 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries # or if we are in iceberg mode, we create iceberg tables for all tables - is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or create_only_iceberg_tables + is_iceberg = create_data_iceberg_tables or (self.schema.tables[table_name].get("write_disposition", None) == "skip") columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) # this will fail if the table prefix is not properly defined @@ -362,12 +363,12 @@ def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: def get_stage_dispositions(self) -> List[TWriteDisposition]: # in iceberg mode, we always use staging tables - if self.config.iceberg_bucket_url is not None: + if self.iceberg_mode: return ["append", "replace", "merge"] return [] def get_truncate_destination_table_dispositions_for_staging(self) -> List[TWriteDisposition]: - if self.config.iceberg_bucket_url is not None: + if self.iceberg_mode: return ["append", "replace", "merge"] return ["replace"] From 439c72f2a64de3a254c9fb0d2388ad997ba3ee06 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 4 Oct 2023 13:02:18 +0200 Subject: [PATCH 06/36] move athena tests to default sql configs --- tests/load/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/load/utils.py b/tests/load/utils.py index f524511ef8..a0cab6ff73 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -110,16 +110,14 @@ def destinations_configs( destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though destination_configs += [DestinationTestConfiguration(destination="athena", supports_merge=False, bucket_url=AWS_BUCKET)] + destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=False, extra_info="iceberg")] if default_vector_configs: # for now only weaviate destination_configs += [DestinationTestConfiguration(destination="weaviate")] - if default_staging_configs or all_staging_configs: destination_configs += [ - DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, supports_merge=False), - DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=False, extra_info="iceberg"), DestinationTestConfiguration(destination="redshift", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, staging_iam_role="arn:aws:iam::267388281016:role/redshift_s3_read", extra_info="s3-role"), DestinationTestConfiguration(destination="bigquery", staging="filesystem", file_format="parquet", bucket_url=GCS_BUCKET, extra_info="gcs-authorization"), DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="jsonl", bucket_url=GCS_BUCKET, stage_name="PUBLIC.dlt_gcs_stage", extra_info="gcs-integration"), @@ -147,6 +145,8 @@ def destinations_configs( for bucket in ALL_BUCKETS: destination_configs += [DestinationTestConfiguration(destination="filesystem", bucket_url=bucket, 
extra_info=bucket)] + # destination_configs = [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=False, extra_info="iceberg")] + # filter out non active destinations destination_configs = [conf for conf in destination_configs if conf.destination in ACTIVE_DESTINATIONS] @@ -156,6 +156,7 @@ def destinations_configs( if exclude: destination_configs = [conf for conf in destination_configs if conf.destination not in exclude] + return destination_configs From b964388425c4eef719279be7a3f90eebf8a61d8f Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 4 Oct 2023 22:42:20 +0200 Subject: [PATCH 07/36] finally fix regular athena tests... --- dlt/destinations/athena/athena.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 91be6df2ba..5394a78aa5 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -359,7 +359,9 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: """update destination tables from staging tables""" - return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": replace}) + if self.iceberg_mode: + return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": replace}) + return None def get_stage_dispositions(self) -> List[TWriteDisposition]: # in iceberg mode, we always use staging tables From 2b5f0048e21f942756c5df82488d728beeb0a6e2 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 5 Oct 2023 00:00:16 +0200 Subject: [PATCH 08/36] some more work --- dlt/common/destination/reference.py | 2 +- dlt/destinations/athena/athena.py | 20 ++++++++++++++++---- dlt/destinations/athena/configuration.py | 1 + dlt/destinations/job_client_impl.py | 14 ++++---------- dlt/load/load.py | 2 +- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index a26ede267d..f4fbe4df76 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -248,7 +248,7 @@ def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition] # in the base job, all replace strategies are treated the same, see filesystem for example return ["replace"] - def get_truncate_destination_table_dispositions_for_staging(self) -> List[TWriteDisposition]: + def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]: # some clients need to additionally be able to get the staging destination to truncate tables return [] diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 5394a78aa5..9e670d5ca6 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -357,22 +357,34 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> job = DoNothingJob(file_path) return job + def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + """Creates a list of followup jobs for merge write disposition and staging replace strategies""" + jobs = super().create_table_chain_completed_followup_jobs(table_chain) + # when in iceberg mode, we need to add some more jobs + if self.iceberg_mode: + write_disposition = 
table_chain[0]["write_disposition"] + if write_disposition == "append": + jobs.append(self._create_staging_copy_job(table_chain, False)) + elif write_disposition == "replace" and self.config.replace_strategy == "truncate-and-insert": + jobs.append(self._create_staging_copy_job(table_chain, True)) + return jobs + def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: """update destination tables from staging tables""" if self.iceberg_mode: return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": replace}) - return None + return super()._create_staging_copy_job(table_chain, replace=replace) def get_stage_dispositions(self) -> List[TWriteDisposition]: # in iceberg mode, we always use staging tables if self.iceberg_mode: return ["append", "replace", "merge"] - return [] + return super().get_stage_dispositions() - def get_truncate_destination_table_dispositions_for_staging(self) -> List[TWriteDisposition]: + def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]: if self.iceberg_mode: return ["append", "replace", "merge"] - return ["replace"] + return [] @staticmethod def is_dbapi_exception(ex: Exception) -> bool: diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/athena/configuration.py index 3c175cba9b..b2df8fb5e9 100644 --- a/dlt/destinations/athena/configuration.py +++ b/dlt/destinations/athena/configuration.py @@ -13,6 +13,7 @@ class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration): credentials: AwsCredentials = None athena_work_group: Optional[str] = None aws_data_catalog: Optional[str] = "awsdatacatalog" + supports_truncate_command: bool = False __config_gen_annotations__: ClassVar[List[str]] = ["athena_work_group"] diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 90cdf1ffcd..aec3a6a310 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -150,8 +150,6 @@ def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: """update destination tables from staging tables""" - if not replace: - return None return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True}) def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: @@ -165,17 +163,13 @@ def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTabl jobs = super().create_table_chain_completed_followup_jobs(table_chain) write_disposition = table_chain[0]["write_disposition"] if write_disposition == "append": - if job := self._create_staging_copy_job(table_chain, False): - jobs.append(job) + pass elif write_disposition == "merge": - if job := self._create_merge_job(table_chain): - jobs.append(job) + jobs.append(self._create_merge_job(table_chain)) elif write_disposition == "replace" and self.config.replace_strategy == "insert-from-staging": - if job := self._create_staging_copy_job(table_chain, True): - jobs.append(job) + jobs.append(self._create_staging_copy_job(table_chain, True)) elif write_disposition == "replace" and self.config.replace_strategy == "staging-optimized": - if job := self._create_optimized_replace_job(table_chain): - jobs.append(job) + jobs.append(self._create_optimized_replace_job(table_chain)) return jobs def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> 
LoadJob: diff --git a/dlt/load/load.py b/dlt/load/load.py index 09fe7078ee..488a0ce4f2 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -314,7 +314,7 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: if self.staging_destination: with self.get_staging_destination_client(schema) as staging_client: truncate_dispositions = staging_client.get_truncate_destination_table_dispositions() - truncate_dispositions.extend(job_client.get_truncate_destination_table_dispositions_for_staging()) + truncate_dispositions.extend(job_client.get_truncate_staging_destination_table_dispositions()) truncate_tables = self.get_table_chain_tables_for_write_disposition(load_id, schema, truncate_dispositions) staging_client.initialize_storage(truncate_tables) # update the staging dataset if client supports this From 7e82de72ad8b0454bbc5680908bb37f97085c0f8 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 5 Oct 2023 11:53:16 +0200 Subject: [PATCH 09/36] fix replace disposition --- dlt/destinations/athena/__init__.py | 1 + dlt/destinations/athena/athena.py | 33 ++++++++++++------- .../load/pipeline/test_replace_disposition.py | 4 ++- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/dlt/destinations/athena/__init__.py b/dlt/destinations/athena/__init__.py index 531744f6e6..d19a0ffdb7 100644 --- a/dlt/destinations/athena/__init__.py +++ b/dlt/destinations/athena/__init__.py @@ -36,6 +36,7 @@ def capabilities() -> DestinationCapabilitiesContext: caps.alter_add_multi_column = True caps.schema_supports_numeric_precision = False caps.timestamp_precision = 3 + caps.supports_truncate_command = False return caps diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 9e670d5ca6..c76ba6a790 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -121,7 +121,7 @@ def __init__(self) -> None: DLTAthenaFormatter._INSTANCE = self -class DoNothingJob(LoadJob, FollowupJob): +class DoNothingJob(LoadJob): """The most lazy class of dlt""" def __init__(self, file_path: str) -> None: @@ -135,6 +135,10 @@ def exception(self) -> str: # this part of code should be never reached raise NotImplementedError() +class DoNothingFollowupJob(DoNothingJob, FollowupJob): + """The second most lazy class of dlt""" + pass + class AthenaSQLClient(SqlClientBase[Connection]): @@ -298,8 +302,10 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: self.iceberg_mode = not (not self.config.iceberg_bucket_url) def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: - # never truncate tables in athena - super().initialize_storage([]) + # only truncate tables in iceberg mode + if not self.iceberg_mode or self.in_staging_mode: + truncate_tables = [] + super().initialize_storage(truncate_tables) def _from_db_type(self, hive_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: return self.type_mapper.from_db_type(hive_t, precision, scale) @@ -354,19 +360,24 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> ) job = super().start_file_load(table, file_path, load_id) if not job: - job = DoNothingJob(file_path) + job = DoNothingFollowupJob(file_path) if self.iceberg_mode else DoNothingJob(file_path) return job def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = 
super().create_table_chain_completed_followup_jobs(table_chain) - # when in iceberg mode, we need to add some more jobs - if self.iceberg_mode: - write_disposition = table_chain[0]["write_disposition"] - if write_disposition == "append": - jobs.append(self._create_staging_copy_job(table_chain, False)) - elif write_disposition == "replace" and self.config.replace_strategy == "truncate-and-insert": - jobs.append(self._create_staging_copy_job(table_chain, True)) + + # no jobs if we are merging: TODO: add proper iceberg merge job + write_disposition = table_chain[0]["write_disposition"] + if write_disposition == "merge": + return [] + + # add some additional jobs + write_disposition = table_chain[0]["write_disposition"] + if write_disposition == "append": + jobs.append(self._create_staging_copy_job(table_chain, False)) + elif write_disposition == "replace" and self.config.replace_strategy == "truncate-and-insert": + jobs.append(self._create_staging_copy_job(table_chain, False)) return jobs def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index 9ee3a5b947..d39556ab2f 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -25,7 +25,7 @@ def test_replace_disposition(destination_config: DestinationTestConfiguration, r # TODO: start storing _dlt_loads with right json content increase_loads = lambda x: x if destination_config.destination == "filesystem" else x + 1 - increase_state_loads = lambda info: len([job for job in info.load_packages[0].jobs["completed_jobs"] if job.job_file_info.table_name == "_dlt_pipeline_state" and job.job_file_info.file_format != "reference"]) + increase_state_loads = lambda info: len([job for job in info.load_packages[0].jobs["completed_jobs"] if job.job_file_info.table_name == "_dlt_pipeline_state" and job.job_file_info.file_format not in ["sql", "reference"]]) # filesystem does not have versions and child tables def norm_table_counts(counts: Dict[str, int], *child_tables: str) -> Dict[str, int]: @@ -72,6 +72,7 @@ def append_items(): "name": f"item {index}", } + # first run with offset 0 info = pipeline.run([load_items, append_items], loader_file_format=destination_config.file_format) assert_load_info(info) @@ -98,6 +99,7 @@ def append_items(): "_dlt_loads": dlt_loads, "_dlt_version": dlt_versions } + # check trace assert pipeline.last_trace.last_normalize_info.row_counts == { "append_items": 12, From 202466fe4a569f826b4c491562066785dda56373 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 5 Oct 2023 15:22:56 +0200 Subject: [PATCH 10/36] fix datatype support --- dlt/destinations/athena/athena.py | 11 ++++++++--- dlt/destinations/sql_jobs.py | 1 - 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index c76ba6a790..b406e1556b 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -69,13 +69,18 @@ class AthenaTypeMapper(TypeMapper): "int": "bigint", } + def __init__(self, capabilities: DestinationCapabilitiesContext, iceberg_mode: bool): + super().__init__(capabilities) + self.iceberg_mode = iceberg_mode + def to_db_integer_type(self, precision: Optional[int]) -> str: if precision is None: return "bigint" + # iceberg does not support smallint and tinyint if precision <= 8: - return "tinyint" + return "int" if self.iceberg_mode else 
"tinyint" elif precision <= 16: - return "smallint" + return "int" if self.iceberg_mode else "smallint" elif precision <= 32: return "int" return "bigint" @@ -298,8 +303,8 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: super().__init__(schema, config, sql_client) self.sql_client: AthenaSQLClient = sql_client # type: ignore self.config: AthenaClientConfiguration = config - self.type_mapper = AthenaTypeMapper(self.capabilities) self.iceberg_mode = not (not self.config.iceberg_bucket_url) + self.type_mapper = AthenaTypeMapper(self.capabilities, self.iceberg_mode) def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # only truncate tables in iceberg mode diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 784c0e3a05..b601cb4813 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -63,7 +63,6 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient columns = ", ".join(map(sql_client.capabilities.escape_identifier, get_columns_names_with_prop(table, "name"))) if params["replace"]: sql.append(sql_client._truncate_table_sql(table_name)) - print(f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name};") sql.append(f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name};") return sql From bd9744c855ed309145b7997fff7a0221f827a882 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 5 Oct 2023 15:48:36 +0200 Subject: [PATCH 11/36] fix append for merge in iceberg --- dlt/destinations/athena/athena.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index b406e1556b..730bcc6b9e 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -372,10 +372,10 @@ def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTabl """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs(table_chain) - # no jobs if we are merging: TODO: add proper iceberg merge job + # append job if there is a merge TODO: add proper iceberg merge job write_disposition = table_chain[0]["write_disposition"] if write_disposition == "merge": - return [] + jobs.append(self._create_staging_copy_job(table_chain, False)) # add some additional jobs write_disposition = table_chain[0]["write_disposition"] From f627a0fa153a3ef42449bab70f912e3f34a815b2 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 5 Oct 2023 17:05:40 +0200 Subject: [PATCH 12/36] fix merge jobs for iceberg --- dlt/destinations/athena/athena.py | 5 ----- dlt/destinations/sql_jobs.py | 21 ++++++++++++++------- tests/load/utils.py | 2 -- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 730bcc6b9e..ed43f5b502 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -372,11 +372,6 @@ def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTabl """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs(table_chain) - # append job if there is a merge TODO: add proper iceberg merge job - write_disposition = table_chain[0]["write_disposition"] - if write_disposition == "merge": - jobs.append(self._create_staging_copy_job(table_chain, False)) 
- # add some additional jobs write_disposition = table_chain[0]["write_disposition"] if write_disposition == "append": diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index b601cb4813..794db1b00a 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -143,7 +143,7 @@ def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str: Returns: sql statement that inserts data from selects into temp table """ - return f"CREATE TEMP TABLE {temp_table_name} AS {select_sql};" + return f"CREATE TABLE {temp_table_name} AS {select_sql};" @classmethod def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: @@ -161,7 +161,8 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien unique_column: str = None root_key_column: str = None - insert_temp_table_sql: str = None + insert_temp_table_name: str = None + delete_temp_table_name: str = None if len(table_chain) == 1: @@ -183,10 +184,10 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien # get first unique column unique_column = sql_client.capabilities.escape_identifier(unique_columns[0]) # create temp table with unique identifier - create_delete_temp_table_sql, delete_temp_table_sql = cls.gen_delete_temp_table_sql(unique_column, key_table_clauses) + create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql(unique_column, key_table_clauses) sql.extend(create_delete_temp_table_sql) # delete top table - sql.append(f"DELETE FROM {root_table_name} WHERE {unique_column} IN (SELECT * FROM {delete_temp_table_sql});") + sql.append(f"DELETE FROM {root_table_name} WHERE {unique_column} IN (SELECT * FROM {delete_temp_table_name});") # delete other tables for table in table_chain[1:]: table_name = sql_client.make_qualified_table_name(table["name"]) @@ -199,10 +200,10 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien f"There is no root foreign key (ie _dlt_root_id) in child table {table['name']} so it is not possible to refer to top level table {root_table['name']} unique column {unique_column}" ) root_key_column = sql_client.capabilities.escape_identifier(root_key_columns[0]) - sql.append(f"DELETE FROM {table_name} WHERE {root_key_column} IN (SELECT * FROM {delete_temp_table_sql});") + sql.append(f"DELETE FROM {table_name} WHERE {root_key_column} IN (SELECT * FROM {delete_temp_table_name});") # create temp table used to deduplicate, only when we have primary keys if primary_keys: - create_insert_temp_table_sql, insert_temp_table_sql = cls.gen_insert_temp_table_sql(staging_root_table_name, primary_keys, unique_column) + create_insert_temp_table_sql, insert_temp_table_name = cls.gen_insert_temp_table_sql(staging_root_table_name, primary_keys, unique_column) sql.extend(create_insert_temp_table_sql) # insert from staging to dataset, truncate staging table @@ -222,11 +223,17 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien """ else: uniq_column = unique_column if table.get("parent") is None else root_key_column - insert_sql += f" WHERE {uniq_column} IN (SELECT * FROM {insert_temp_table_sql});" + insert_sql += f" WHERE {uniq_column} IN (SELECT * FROM {insert_temp_table_name});" if insert_sql.strip()[-1] != ";": insert_sql += ";" sql.append(insert_sql) # -- DELETE FROM {staging_table_name} WHERE 1=1; + # clean up + if insert_temp_table_name: + sql.append(f"DROP TABLE {insert_temp_table_name};") + if 
delete_temp_table_name: + sql.append(f"DROP TABLE {delete_temp_table_name};") + return sql diff --git a/tests/load/utils.py b/tests/load/utils.py index a0cab6ff73..471c8cc59f 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -145,8 +145,6 @@ def destinations_configs( for bucket in ALL_BUCKETS: destination_configs += [DestinationTestConfiguration(destination="filesystem", bucket_url=bucket, extra_info=bucket)] - # destination_configs = [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=False, extra_info="iceberg")] - # filter out non active destinations destination_configs = [conf for conf in destination_configs if conf.destination in ACTIVE_DESTINATIONS] From 9a94d4a0f3fcac02e2f049cf78eb7d169738b876 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 5 Oct 2023 17:56:35 +0200 Subject: [PATCH 13/36] clean up followup jobs code --- dlt/destinations/athena/athena.py | 30 +++++++++++-------------- dlt/destinations/bigquery/bigquery.py | 10 +++++---- dlt/destinations/job_client_impl.py | 29 +++++++++++------------- dlt/destinations/mssql/mssql.py | 10 +++++---- dlt/destinations/postgres/postgres.py | 6 +++-- dlt/destinations/redshift/redshift.py | 4 ++-- dlt/destinations/snowflake/snowflake.py | 6 +++-- 7 files changed, 48 insertions(+), 47 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index ed43f5b502..9553fc937a 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -323,11 +323,15 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc create_data_iceberg_tables = self.iceberg_mode and not self.in_staging_mode bucket = self.config.staging_config.bucket_url + dataset = self.sql_client.dataset_name + if create_data_iceberg_tables: bucket = self.config.iceberg_bucket_url - # TODO: we need to strip the staging layout from the table name, find a better way! 
- dataset = self.sql_client.dataset_name.replace("_staging", "") + # strip the staging portion from the dataset name if we are in iceberg mode + if self.iceberg_mode and self.in_staging_mode and dataset.endswith("_staging") : + dataset = dataset[:-len("_staging")] + sql: List[str] = [] # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries @@ -368,23 +372,15 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> job = DoNothingFollowupJob(file_path) if self.iceberg_mode else DoNothingJob(file_path) return job - def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: - """Creates a list of followup jobs for merge write disposition and staging replace strategies""" - jobs = super().create_table_chain_completed_followup_jobs(table_chain) - - # add some additional jobs - write_disposition = table_chain[0]["write_disposition"] - if write_disposition == "append": - jobs.append(self._create_staging_copy_job(table_chain, False)) - elif write_disposition == "replace" and self.config.replace_strategy == "truncate-and-insert": - jobs.append(self._create_staging_copy_job(table_chain, False)) - return jobs + def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + if self.iceberg_mode: + return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": False})] + return super()._create_append_followup_jobs(table_chain) - def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: - """update destination tables from staging tables""" + def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: if self.iceberg_mode: - return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": replace}) - return super()._create_staging_copy_job(table_chain, replace=replace) + return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})] + return super()._create_replace_followup_jobs(table_chain) def get_stage_dispositions(self) -> List[TWriteDisposition]: # in iceberg mode, we always use staging tables diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/bigquery/bigquery.py index 387e450184..a5aa0cc703 100644 --- a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/bigquery/bigquery.py @@ -167,11 +167,13 @@ def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None: self.sql_client: BigQuerySqlClient = sql_client # type: ignore self.type_mapper = BigQueryTypeMapper(self.capabilities) - def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return BigQueryMergeJob.from_table_chain(table_chain, self.sql_client) + def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)] - def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return BigqueryStagingCopyJob.from_table_chain(table_chain, self.sql_client) + def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + if self.config.replace_strategy == "staging-optimized": + return [BigqueryStagingCopyJob.from_table_chain(table_chain, self.sql_client)] + return super()._create_replace_followup_jobs(table_chain) def restore_file_load(self, file_path: str) -> LoadJob: """Returns a completed 
SqlLoadJob or restored BigQueryLoadJob diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index aec3a6a310..91f7d15bee 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -145,31 +145,28 @@ def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition] return ["replace"] return [] - def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return SqlMergeJob.from_table_chain(table_chain, self.sql_client) + def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + return [] - def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob: - """update destination tables from staging tables""" - return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True}) + def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + return [SqlMergeJob.from_table_chain(table_chain, self.sql_client)] - def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - """optimized replace strategy, defaults to _create_staging_copy_job for the basic client - for some destinations there are much faster destination updates at the cost of - dropping tables possible""" - return self._create_staging_copy_job(table_chain, True) + def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + jobs: List[NewLoadJob] = [] + if self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]: + jobs.append(SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})) + return jobs def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs(table_chain) write_disposition = table_chain[0]["write_disposition"] if write_disposition == "append": - pass + jobs.extend(self._create_append_followup_jobs(table_chain)) elif write_disposition == "merge": - jobs.append(self._create_merge_job(table_chain)) - elif write_disposition == "replace" and self.config.replace_strategy == "insert-from-staging": - jobs.append(self._create_staging_copy_job(table_chain, True)) - elif write_disposition == "replace" and self.config.replace_strategy == "staging-optimized": - jobs.append(self._create_optimized_replace_job(table_chain)) + jobs.extend(self._create_merge_followup_jobs(table_chain)) + elif write_disposition == "replace": + jobs.extend(self._create_replace_followup_jobs(table_chain)) return jobs def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: diff --git a/dlt/destinations/mssql/mssql.py b/dlt/destinations/mssql/mssql.py index 67b51f885c..c06ddeadbd 100644 --- a/dlt/destinations/mssql/mssql.py +++ b/dlt/destinations/mssql/mssql.py @@ -133,8 +133,8 @@ def __init__(self, schema: Schema, config: MsSqlClientConfiguration) -> None: self.active_hints = HINT_TO_MSSQL_ATTR if self.config.create_indexes else {} self.type_mapper = MsSqlTypeMapper(self.capabilities) - def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return MsSqlMergeJob.from_table_chain(table_chain, self.sql_client) + def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + return 
[MsSqlMergeJob.from_table_chain(table_chain, self.sql_client)] def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema]) -> List[str]: # Override because mssql requires multiple columns in a single ADD COLUMN clause @@ -152,8 +152,10 @@ def _get_column_def_sql(self, c: TColumnSchema) -> str: column_name = self.capabilities.escape_identifier(c["name"]) return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c['nullable'])}" - def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return MsSqlStagingCopyJob.from_table_chain(table_chain, self.sql_client) + def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + if self.config.replace_strategy == "staging-optimized": + return [MsSqlStagingCopyJob.from_table_chain(table_chain, self.sql_client)] + return super()._create_replace_followup_jobs(table_chain) def _from_db_type(self, pq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) diff --git a/dlt/destinations/postgres/postgres.py b/dlt/destinations/postgres/postgres.py index b6c716754f..72837b42b3 100644 --- a/dlt/destinations/postgres/postgres.py +++ b/dlt/destinations/postgres/postgres.py @@ -114,8 +114,10 @@ def _get_column_def_sql(self, c: TColumnSchema) -> str: column_name = self.capabilities.escape_identifier(c["name"]) return f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" - def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return PostgresStagingCopyJob.from_table_chain(table_chain, self.sql_client) + def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + if self.config.replace_strategy == "staging-optimized": + return [PostgresStagingCopyJob.from_table_chain(table_chain, self.sql_client)] + return super()._create_replace_followup_jobs(table_chain) def _from_db_type(self, pq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) diff --git a/dlt/destinations/redshift/redshift.py b/dlt/destinations/redshift/redshift.py index 944bd24581..f210d757c1 100644 --- a/dlt/destinations/redshift/redshift.py +++ b/dlt/destinations/redshift/redshift.py @@ -201,8 +201,8 @@ def __init__(self, schema: Schema, config: RedshiftClientConfiguration) -> None: self.config: RedshiftClientConfiguration = config self.type_mapper = RedshiftTypeMapper(self.capabilities) - def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return RedshiftMergeJob.from_table_chain(table_chain, self.sql_client) + def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema) -> str: hints_str = " ".join(HINT_TO_REDSHIFT_ATTR.get(h, "") for h in HINT_TO_REDSHIFT_ATTR.keys() if c.get(h, False) is True) diff --git a/dlt/destinations/snowflake/snowflake.py b/dlt/destinations/snowflake/snowflake.py index 69432bc696..58d81a602d 100644 --- a/dlt/destinations/snowflake/snowflake.py +++ b/dlt/destinations/snowflake/snowflake.py @@ -204,8 +204,10 @@ def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema]) -> List[str # Override because snowflake requires multiple columns in a single ADD COLUMN clause return ["ADD COLUMN\n" + 
",\n".join(self._get_column_def_sql(c) for c in new_columns)] - def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob: - return SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True}) + def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + if self.config.replace_strategy == "staging-optimized": + return [SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client)] + return super()._create_replace_followup_jobs(table_chain) def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool, separate_alters: bool = False) -> List[str]: sql = super()._get_table_update_sql(table_name, new_columns, generate_alter) From 8682350d76b114f542e445f3d69bc0b2c5153f10 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 5 Oct 2023 21:58:32 +0200 Subject: [PATCH 14/36] set iceberg tests to merge supported --- tests/load/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/load/utils.py b/tests/load/utils.py index 471c8cc59f..54e410e43a 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -110,7 +110,7 @@ def destinations_configs( destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though destination_configs += [DestinationTestConfiguration(destination="athena", supports_merge=False, bucket_url=AWS_BUCKET)] - destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=False, extra_info="iceberg")] + destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=True, extra_info="iceberg")] if default_vector_configs: # for now only weaviate From 1560768cca5ef7090593c805fb1e39cc848fdad8 Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 6 Oct 2023 09:56:31 +0200 Subject: [PATCH 15/36] fix sql merge syntax for iceberg --- dlt/destinations/sql_jobs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 794db1b00a..7e54aac0b6 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -100,17 +100,17 @@ def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: st A list of clauses may be returned for engines that do not support OR in subqueries. Like BigQuery """ - return [f"FROM {root_table_name} as d WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} as s WHERE {' OR '.join([c.format(d='d',s='s') for c in key_clauses])})"] + return [f"FROM {root_table_name} WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} as s WHERE {' OR '.join([c.format(d=root_table_name,s='s') for c in key_clauses])})"] @classmethod - def gen_delete_temp_table_sql(cls, unique_column: str, key_table_clauses: Sequence[str]) -> Tuple[List[str], str]: + def gen_delete_temp_table_sql(cls, unique_column: str, key_table_clauses: Sequence[str], root_table_name: str) -> Tuple[List[str], str]: """Generate sql that creates delete temp table and inserts `unique_column` from root table for all records to delete. May return several statements. 
Returns temp table name for cases where special names are required like SQLServer. """ sql: List[str] = [] temp_table_name = cls._new_temp_table_name("delete") - select_statement = f"SELECT d.{unique_column} {key_table_clauses[0]}" + select_statement = f"SELECT {root_table_name}.{unique_column} {key_table_clauses[0]}" sql.append(cls._to_temp_table(select_statement, temp_table_name)) for clause in key_table_clauses[1:]: sql.append(f"INSERT INTO {temp_table_name} SELECT {unique_column} {clause};") @@ -184,7 +184,7 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien # get first unique column unique_column = sql_client.capabilities.escape_identifier(unique_columns[0]) # create temp table with unique identifier - create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql(unique_column, key_table_clauses) + create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql(unique_column, key_table_clauses, root_table_name) sql.extend(create_delete_temp_table_sql) # delete top table sql.append(f"DELETE FROM {root_table_name} WHERE {unique_column} IN (SELECT * FROM {delete_temp_table_name});") From 3f4fb1efe7daa1abeb5391d18d6b59cc1e117d88 Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 6 Oct 2023 13:58:12 +0200 Subject: [PATCH 16/36] separate regular athena and iceberg tests --- .github/workflows/test_destination_athena.yml | 1 + .../test_destination_athena_iceberg.yml | 94 +++++++++++++++++++ tests/load/utils.py | 5 +- tests/utils.py | 5 + 4 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test_destination_athena_iceberg.yml diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 16c9caff53..704e66522b 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -21,6 +21,7 @@ env: RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB ACTIVE_DESTINATIONS: "[\"athena\"]" ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" + EXCLUDED_DESTINATION_CONFIGURATIONS: "[\"athena-parquet-staging-iceberg\"]" jobs: get_docs_changes: diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml new file mode 100644 index 0000000000..6892a96bf1 --- /dev/null +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -0,0 +1,94 @@ + +name: test athena iceberg + +on: + pull_request: + branches: + - master + - devel + workflow_dispatch: + +env: + DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4J46G55G4 + DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + DESTINATION__ATHENA__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4J46G55G4 + DESTINATION__ATHENA__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + DESTINATION__ATHENA__CREDENTIALS__REGION_NAME: eu-central-1 + DESTINATION__ATHENA__QUERY_RESULT_BUCKET: s3://dlt-athena-output + + RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 + RUNTIME__LOG_LEVEL: ERROR + RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB + ACTIVE_DESTINATIONS: "[\"athena\"]" + ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" + EXCLUDED_DESTINATION_CONFIGURATIONS: "[\"athena-no-staging\"]" + +jobs: + get_docs_changes: + uses: ./.github/workflows/get_docs_changes.yml + # Tests that require credentials do not run in forks + if: ${{ 
!github.event.pull_request.head.repo.fork }} + + run_loader: + name: test destination athena iceberg + needs: get_docs_changes + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + # os: ["ubuntu-latest", "macos-latest", "windows-latest"] + defaults: + run: + shell: bash + runs-on: ${{ matrix.os }} + + steps: + + - name: Check out + uses: actions/checkout@master + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.10.x" + + - name: Install Poetry + uses: snok/install-poetry@v1.3.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + # path: ${{ steps.pip-cache.outputs.dir }} + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-athena + + - name: Install dependencies + # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction -E athena + + - run: | + poetry run pytest tests/load + if: runner.os != 'Windows' + name: Run tests Linux/MAC + - run: | + poetry run pytest tests/load + if: runner.os == 'Windows' + name: Run tests Windows + shell: cmd + + matrix_job_required_check: + name: Redshift, PostgreSQL and DuckDB tests + needs: run_loader + runs-on: ubuntu-latest + if: always() + steps: + - name: Check matrix job results + if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') + run: | + echo "One or more matrix job tests failed or were cancelled. You may need to re-run them." && exit 1 diff --git a/tests/load/utils.py b/tests/load/utils.py index 54e410e43a..e2d58102dc 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -26,7 +26,7 @@ from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase -from tests.utils import ACTIVE_DESTINATIONS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS +from tests.utils import ACTIVE_DESTINATIONS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS from tests.cases import TABLE_UPDATE_COLUMNS_SCHEMA, TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES, assert_all_data_types_row # bucket urls @@ -154,6 +154,9 @@ def destinations_configs( if exclude: destination_configs = [conf for conf in destination_configs if conf.destination not in exclude] + # filter out destination configs as obtained from the env + destination_configs = [conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS] + return destination_configs diff --git a/tests/utils.py b/tests/utils.py index c64cf0ed9d..5e5e132703 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -25,6 +25,7 @@ TEST_STORAGE_ROOT = "_storage" + # destination constants IMPLEMENTED_DESTINATIONS = {"athena", "duckdb", "bigquery", "redshift", "postgres", "snowflake", "filesystem", "weaviate", "dummy", "motherduck", "mssql"} NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck"} @@ -32,6 +33,10 @@ # filter out active destinations for current tests ACTIVE_DESTINATIONS = set(dlt.config.get("ACTIVE_DESTINATIONS", list) or IMPLEMENTED_DESTINATIONS) + +# exclude destination configs (for now used for athena and athena iceberg separation) +EXCLUDED_DESTINATION_CONFIGURATIONS = set(dlt.config.get("EXCLUDED_DESTINATION_CONFIGURATIONS", list) or set()) + ACTIVE_SQL_DESTINATIONS = 
SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) From 92613ec908defe10a8bfb454afabc458add1a30d Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 9 Oct 2023 12:19:18 +0200 Subject: [PATCH 17/36] remove some iceberg specific code --- .github/workflows/test_destination_athena.yml | 1 - .../test_destination_athena_iceberg.yml | 94 ------------------- dlt/destinations/athena/athena.py | 34 +++---- dlt/destinations/athena/configuration.py | 1 - dlt/destinations/sql_jobs.py | 19 ++-- tests/load/athena_iceberg/__init__.py | 0 .../test_athena_iceberg.py} | 22 +++-- tests/load/utils.py | 8 +- tests/utils.py | 2 - 9 files changed, 33 insertions(+), 148 deletions(-) delete mode 100644 .github/workflows/test_destination_athena_iceberg.yml create mode 100644 tests/load/athena_iceberg/__init__.py rename tests/load/{test_iceberg.py => athena_iceberg/test_athena_iceberg.py} (75%) diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 704e66522b..16c9caff53 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -21,7 +21,6 @@ env: RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB ACTIVE_DESTINATIONS: "[\"athena\"]" ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" - EXCLUDED_DESTINATION_CONFIGURATIONS: "[\"athena-parquet-staging-iceberg\"]" jobs: get_docs_changes: diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml deleted file mode 100644 index 6892a96bf1..0000000000 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ /dev/null @@ -1,94 +0,0 @@ - -name: test athena iceberg - -on: - pull_request: - branches: - - master - - devel - workflow_dispatch: - -env: - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4J46G55G4 - DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__ATHENA__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4J46G55G4 - DESTINATION__ATHENA__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} - DESTINATION__ATHENA__CREDENTIALS__REGION_NAME: eu-central-1 - DESTINATION__ATHENA__QUERY_RESULT_BUCKET: s3://dlt-athena-output - - RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 - RUNTIME__LOG_LEVEL: ERROR - RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB - ACTIVE_DESTINATIONS: "[\"athena\"]" - ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" - EXCLUDED_DESTINATION_CONFIGURATIONS: "[\"athena-no-staging\"]" - -jobs: - get_docs_changes: - uses: ./.github/workflows/get_docs_changes.yml - # Tests that require credentials do not run in forks - if: ${{ !github.event.pull_request.head.repo.fork }} - - run_loader: - name: test destination athena iceberg - needs: get_docs_changes - if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' - strategy: - fail-fast: false - matrix: - os: ["ubuntu-latest"] - # os: ["ubuntu-latest", "macos-latest", "windows-latest"] - defaults: - run: - shell: bash - runs-on: ${{ matrix.os }} - - steps: - - - name: Check out - uses: actions/checkout@master - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: "3.10.x" - - - name: Install Poetry - uses: snok/install-poetry@v1.3.2 - with: - virtualenvs-create: true - virtualenvs-in-project: true - installer-parallel: true - - - 
name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v3 - with: - # path: ${{ steps.pip-cache.outputs.dir }} - path: .venv - key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-athena - - - name: Install dependencies - # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena - - - run: | - poetry run pytest tests/load - if: runner.os != 'Windows' - name: Run tests Linux/MAC - - run: | - poetry run pytest tests/load - if: runner.os == 'Windows' - name: Run tests Windows - shell: cmd - - matrix_job_required_check: - name: Redshift, PostgreSQL and DuckDB tests - needs: run_loader - runs-on: ubuntu-latest - if: always() - steps: - - name: Check matrix job results - if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') - run: | - echo "One or more matrix job tests failed or were cancelled. You may need to re-run them." && exit 1 diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 9553fc937a..792617badd 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -303,13 +303,11 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: super().__init__(schema, config, sql_client) self.sql_client: AthenaSQLClient = sql_client # type: ignore self.config: AthenaClientConfiguration = config - self.iceberg_mode = not (not self.config.iceberg_bucket_url) - self.type_mapper = AthenaTypeMapper(self.capabilities, self.iceberg_mode) + self.type_mapper = AthenaTypeMapper(self.capabilities, True) def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # only truncate tables in iceberg mode - if not self.iceberg_mode or self.in_staging_mode: - truncate_tables = [] + truncate_tables = [] super().initialize_storage(truncate_tables) def _from_db_type(self, hive_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: @@ -320,23 +318,14 @@ def _get_column_def_sql(self, c: TColumnSchema) -> str: def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]: - create_data_iceberg_tables = self.iceberg_mode and not self.in_staging_mode - bucket = self.config.staging_config.bucket_url dataset = self.sql_client.dataset_name - if create_data_iceberg_tables: - bucket = self.config.iceberg_bucket_url - - # strip the staging portion from the dataset name if we are in iceberg mode - if self.iceberg_mode and self.in_staging_mode and dataset.endswith("_staging") : - dataset = dataset[:-len("_staging")] - sql: List[str] = [] # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries # or if we are in iceberg mode, we create iceberg tables for all tables - is_iceberg = create_data_iceberg_tables or (self.schema.tables[table_name].get("write_disposition", None) == "skip") + is_iceberg = self.schema.tables[table_name].get("write_disposition", None) == "skip" columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) # this will fail if the table prefix is not properly defined @@ -369,28 +358,31 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> ) job = super().start_file_load(table, file_path, load_id) if not job: - job = DoNothingFollowupJob(file_path) if self.iceberg_mode else DoNothingJob(file_path) + job = DoNothingFollowupJob(file_path) if self._is_iceberg_table(table) else 
DoNothingJob(file_path) return job def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: - if self.iceberg_mode: + if self._is_iceberg_table(table_chain[0]): return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": False})] return super()._create_append_followup_jobs(table_chain) def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: - if self.iceberg_mode: + if self._is_iceberg_table(table_chain[0]): return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})] return super()._create_replace_followup_jobs(table_chain) + def _is_iceberg_table(self, table: TTableSchema) -> bool: + return False + def get_stage_dispositions(self) -> List[TWriteDisposition]: # in iceberg mode, we always use staging tables - if self.iceberg_mode: - return ["append", "replace", "merge"] + # if self.iceberg_mode: + # return ["append", "replace", "merge"] return super().get_stage_dispositions() def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]: - if self.iceberg_mode: - return ["append", "replace", "merge"] + # if self.iceberg_mode: + # return ["append", "replace", "merge"] return [] @staticmethod diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/athena/configuration.py index b2df8fb5e9..7eca85fe41 100644 --- a/dlt/destinations/athena/configuration.py +++ b/dlt/destinations/athena/configuration.py @@ -9,7 +9,6 @@ class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_name: Final[str] = "athena" # type: ignore[misc] query_result_bucket: str = None - iceberg_bucket_url: Optional[str] = None credentials: AwsCredentials = None athena_work_group: Optional[str] = None aws_data_catalog: Optional[str] = "awsdatacatalog" diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 7e54aac0b6..4e8393ed74 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -100,17 +100,17 @@ def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: st A list of clauses may be returned for engines that do not support OR in subqueries. Like BigQuery """ - return [f"FROM {root_table_name} WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} as s WHERE {' OR '.join([c.format(d=root_table_name,s='s') for c in key_clauses])})"] + return [f"FROM {root_table_name} as d WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} as s WHERE {' OR '.join([c.format(d='d',s='s') for c in key_clauses])})"] @classmethod - def gen_delete_temp_table_sql(cls, unique_column: str, key_table_clauses: Sequence[str], root_table_name: str) -> Tuple[List[str], str]: + def gen_delete_temp_table_sql(cls, unique_column: str, key_table_clauses: Sequence[str]) -> Tuple[List[str], str]: """Generate sql that creates delete temp table and inserts `unique_column` from root table for all records to delete. May return several statements. Returns temp table name for cases where special names are required like SQLServer. 
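        For illustration only (names are invented and the temp table suffix is generated at runtime by `_new_temp_table_name`), the default statement produced here is shaped approximately like:

            CREATE TEMP TABLE delete_<suffix> AS
                SELECT d._dlt_id
                FROM dataset.issues as d
                WHERE EXISTS (SELECT 1 FROM dataset_staging.issues as s WHERE d.id = s.id);

        i.e. the root table is aliased as `d` and the temp table is created as a session-scoped TEMP table.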
""" sql: List[str] = [] temp_table_name = cls._new_temp_table_name("delete") - select_statement = f"SELECT {root_table_name}.{unique_column} {key_table_clauses[0]}" + select_statement = f"SELECT d.{unique_column} {key_table_clauses[0]}" sql.append(cls._to_temp_table(select_statement, temp_table_name)) for clause in key_table_clauses[1:]: sql.append(f"INSERT INTO {temp_table_name} SELECT {unique_column} {clause};") @@ -143,7 +143,7 @@ def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str: Returns: sql statement that inserts data from selects into temp table """ - return f"CREATE TABLE {temp_table_name} AS {select_sql};" + return f"CREATE TEMP TABLE {temp_table_name} AS {select_sql};" @classmethod def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: @@ -162,7 +162,6 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien unique_column: str = None root_key_column: str = None insert_temp_table_name: str = None - delete_temp_table_name: str = None if len(table_chain) == 1: @@ -184,7 +183,7 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien # get first unique column unique_column = sql_client.capabilities.escape_identifier(unique_columns[0]) # create temp table with unique identifier - create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql(unique_column, key_table_clauses, root_table_name) + create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql(unique_column, key_table_clauses) sql.extend(create_delete_temp_table_sql) # delete top table sql.append(f"DELETE FROM {root_table_name} WHERE {unique_column} IN (SELECT * FROM {delete_temp_table_name});") @@ -230,10 +229,4 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien sql.append(insert_sql) # -- DELETE FROM {staging_table_name} WHERE 1=1; - # clean up - if insert_temp_table_name: - sql.append(f"DROP TABLE {insert_temp_table_name};") - if delete_temp_table_name: - sql.append(f"DROP TABLE {delete_temp_table_name};") - - return sql + return sql \ No newline at end of file diff --git a/tests/load/athena_iceberg/__init__.py b/tests/load/athena_iceberg/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/load/test_iceberg.py b/tests/load/athena_iceberg/test_athena_iceberg.py similarity index 75% rename from tests/load/test_iceberg.py rename to tests/load/athena_iceberg/test_athena_iceberg.py index f9cb891fbc..e1650549cc 100644 --- a/tests/load/test_iceberg.py +++ b/tests/load/athena_iceberg/test_athena_iceberg.py @@ -1,6 +1,3 @@ -""" -Temporary test file for iceberg -""" import pytest import os @@ -22,14 +19,11 @@ def test_iceberg() -> None: - os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = "s3://dlt-ci-test-bucket" - os.environ['DESTINATION__ATHENA__ICEBERG_BUCKET_URL'] = "s3://dlt-ci-test-bucket/iceberg" - pipeline = dlt.pipeline(pipeline_name="aaathena", destination="athena", staging="filesystem", full_refresh=True) + pipeline = dlt.pipeline(pipeline_name="aaathena-iceberg", destination="athena", staging="filesystem", full_refresh=True) - @dlt.resource(name="items", write_disposition="append") - def items(): + def items() -> Iterator[Any]: yield { "id": 1, "name": "item", @@ -42,7 +36,17 @@ def items(): }] } - print(pipeline.run(items)) + @dlt.resource(name="items_normal", write_disposition="append") + def items_normal(): + yield from items() + + @dlt.resource(name="items_iceberg", 
write_disposition="append") + def items_iceberg(): + yield from items() + + print(pipeline.run([items_normal, items_iceberg])) + + return # see if we have athena tables with items table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values() ]) diff --git a/tests/load/utils.py b/tests/load/utils.py index e2d58102dc..a615b696e3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -26,7 +26,7 @@ from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase -from tests.utils import ACTIVE_DESTINATIONS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS +from tests.utils import ACTIVE_DESTINATIONS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS from tests.cases import TABLE_UPDATE_COLUMNS_SCHEMA, TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES, assert_all_data_types_row # bucket urls @@ -49,7 +49,6 @@ class DestinationTestConfiguration: staging: Optional[str] = None file_format: Optional[TLoaderFileFormat] = None bucket_url: Optional[str] = None - iceberg_bucket_url: Optional[str] = None stage_name: Optional[str] = None staging_iam_role: Optional[str] = None extra_info: Optional[str] = None @@ -73,7 +72,6 @@ def setup(self) -> None: os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = self.bucket_url or "" os.environ['DESTINATION__STAGE_NAME'] = self.stage_name or "" os.environ['DESTINATION__STAGING_IAM_ROLE'] = self.staging_iam_role or "" - os.environ['DESTINATION__ATHENA__ICEBERG_BUCKET_URL'] = self.iceberg_bucket_url or "" """For the filesystem destinations we disable compression to make analyzing the result easier""" if self.destination == "filesystem": @@ -110,7 +108,6 @@ def destinations_configs( destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though destination_configs += [DestinationTestConfiguration(destination="athena", supports_merge=False, bucket_url=AWS_BUCKET)] - destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, iceberg_bucket_url=AWS_BUCKET + "/iceberg", supports_merge=True, extra_info="iceberg")] if default_vector_configs: # for now only weaviate @@ -154,9 +151,6 @@ def destinations_configs( if exclude: destination_configs = [conf for conf in destination_configs if conf.destination not in exclude] - # filter out destination configs as obtained from the env - destination_configs = [conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS] - return destination_configs diff --git a/tests/utils.py b/tests/utils.py index 5e5e132703..2d675f514a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -34,8 +34,6 @@ # filter out active destinations for current tests ACTIVE_DESTINATIONS = set(dlt.config.get("ACTIVE_DESTINATIONS", list) or IMPLEMENTED_DESTINATIONS) -# exclude destination configs (for now used for athena and athena iceberg separation) -EXCLUDED_DESTINATION_CONFIGURATIONS = set(dlt.config.get("EXCLUDED_DESTINATION_CONFIGURATIONS", list) or set()) ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) From 0924bc53b2831954633c1ee86fc82238988f1c69 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 9 Oct 2023 18:01:59 +0200 Subject: 
[PATCH 18/36] new iceberg approach --- dlt/common/destination/reference.py | 15 +-- dlt/common/schema/exceptions.py | 5 + dlt/common/schema/typing.py | 2 + dlt/common/schema/utils.py | 46 ++++++-- dlt/common/storages/load_storage.py | 5 +- dlt/destinations/athena/athena.py | 25 ++--- dlt/destinations/bigquery/bigquery.py | 5 +- dlt/destinations/exceptions.py | 6 - dlt/destinations/filesystem/filesystem.py | 29 +++-- dlt/destinations/job_client_impl.py | 20 ++-- dlt/extract/decorators.py | 7 +- dlt/extract/schema.py | 5 +- dlt/load/load.py | 103 +++++++----------- .../athena_iceberg/test_athena_iceberg.py | 39 +++++-- tests/load/utils.py | 4 +- tests/utils.py | 1 + 16 files changed, 178 insertions(+), 139 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index f4fbe4df76..5fea462159 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -10,6 +10,7 @@ from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.typing import TWriteDisposition from dlt.common.schema.exceptions import InvalidDatasetName +from dlt.common.schema.utils import get_load_table from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config @@ -244,13 +245,8 @@ def restore_file_load(self, file_path: str) -> LoadJob: """Finds and restores already started loading job identified by `file_path` if destination supports it.""" pass - def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition]: - # in the base job, all replace strategies are treated the same, see filesystem for example - return ["replace"] - - def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]: - # some clients need to additionally be able to get the staging destination to truncate tables - return [] + def table_needs_truncating(self, table: TTableSchema) -> bool: + return table["write_disposition"] == "replace" def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" @@ -313,9 +309,8 @@ class WithStagingDataset(ABC): """Adds capability to use staging dataset and request it from the loader""" @abstractmethod - def get_stage_dispositions(self) -> List[TWriteDisposition]: - """Returns a list of write dispositions that require staging dataset""" - return [] + def table_needs_staging(self, table: TTableSchema) -> bool: + return False @abstractmethod def with_staging_dataset(self)-> ContextManager["JobClientBase"]: diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 2245a77b61..5f638a111d 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -69,3 +69,8 @@ def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engi self.from_engine = from_engine self.to_engine = to_engine super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}") + +class UnknownTableException(SchemaException): + def __init__(self, table_name: str) -> None: + self.table_name = table_name + super().__init__(f"Trying to access unknown table {table_name}.") \ No newline at end of file diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index ae24691e2d..2cc057560c 100644 --- 
a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -24,6 +24,7 @@ TColumnHint = Literal["not_null", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique", "root_key", "merge_key"] """Known hints of a column used to declare hint regexes.""" TWriteDisposition = Literal["skip", "append", "replace", "merge"] +TTableFormat = Literal["iceberg"] TTypeDetections = Literal["timestamp", "iso_timestamp", "large_integer", "hexbytes_to_text", "wei_to_double"] TTypeDetectionFunc = Callable[[Type[Any], Any], Optional[TDataType]] TColumnNames = Union[str, Sequence[str]] @@ -86,6 +87,7 @@ class TTableSchema(TypedDict, total=False): filters: Optional[TRowFilters] columns: TTableSchemaColumns resource: Optional[str] + table_format: Optional[TTableFormat] class TPartialTableSchema(TTableSchema): diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index d3c0b31cc0..93f0913550 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -15,10 +15,10 @@ from dlt.common.validation import TCustomValidator, validate_dict, validate_dict_ignoring_xkeys from dlt.common.schema import detections from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, - TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, + TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, TTableFormat, TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition) from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, - TablePropertiesConflictException, InvalidSchemaName) + TablePropertiesConflictException, InvalidSchemaName, UnknownTableException) from dlt.common.normalizers.utils import import_normalizers from dlt.common.schema.typing import TAnySchemaColumns @@ -493,18 +493,29 @@ def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTabl return aggregated_update -def get_write_disposition(tables: TSchemaTables, table_name: str) -> TWriteDisposition: - """Returns write disposition of a table if present. If not, looks up into parent table""" +def get_inherited_table_hint(tables: TSchemaTables, table_name: str, table_hint_name: str, allow_none: bool = False) -> Any: table = tables[table_name] - w_d = table.get("write_disposition") - if w_d: - return w_d + hint = table.get(table_hint_name) + if hint: + return hint parent = table.get("parent") if parent: - return get_write_disposition(tables, parent) + return get_inherited_table_hint(tables, parent, table_hint_name, allow_none) + + if allow_none: + return None + + raise ValueError(f"No table hint '{table_hint_name} found in the chain of tables for '{table_name}'.") + + +def get_write_disposition(tables: TSchemaTables, table_name: str) -> TWriteDisposition: + """Returns table hint of a table if present. 
If not, looks up into parent table""" + return get_inherited_table_hint(tables, table_name, "write_disposition", allow_none=False) + - raise ValueError(f"No write disposition found in the chain of tables for '{table_name}'.") +def get_table_format(tables: TSchemaTables, table_name: str) -> TTableFormat: + return get_inherited_table_hint(tables, table_name, "table_format", allow_none=True) def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool: @@ -525,6 +536,18 @@ def get_top_level_table(tables: TSchemaTables, table_name: str) -> TTableSchema: return get_top_level_table(tables, parent) return table +def get_load_table(tables: TSchemaTables, table_name: str) -> TTableSchema: + try: + # make a copy of the schema so modifications do not affect the original document + table = copy(tables[table_name]) + # add write disposition if not specified - in child tables + if "write_disposition" not in table: + table["write_disposition"] = get_write_disposition(tables, table_name) + if "table_format" not in table: + table["table_format"] = get_table_format(tables, table_name) + return table + except KeyError: + raise UnknownTableException(table_name) def get_child_tables(tables: TSchemaTables, table_name: str) -> List[TTableSchema]: """Get child tables for table name and return a list of tables ordered by ancestry so the child tables are always after their parents""" @@ -637,7 +660,8 @@ def new_table( write_disposition: TWriteDisposition = None, columns: Sequence[TColumnSchema] = None, validate_schema: bool = False, - resource: str = None + resource: str = None, + table_format: TTableFormat = None ) -> TTableSchema: table: TTableSchema = { @@ -652,6 +676,8 @@ def new_table( # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name + if table_format: + table["table_format"] = table_format if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index 95170ac46c..8e8a0ac5a8 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -237,8 +237,11 @@ def list_failed_jobs(self, load_id: str) -> Sequence[str]: return self.storage.list_folder_files(self._get_job_folder_path(load_id, LoadStorage.FAILED_JOBS_FOLDER)) def list_jobs_for_table(self, load_id: str, table_name: str) -> Sequence[LoadJobInfo]: + return [job for job in self.list_all_jobs(load_id) if job.job_file_info.table_name == table_name] + + def list_all_jobs(self, load_id: str) -> Sequence[LoadJobInfo]: info = self.get_load_package_info(load_id) - return [job for job in flatten_list_or_items(iter(info.jobs.values())) if job.job_file_info.table_name == table_name] # type: ignore + return [job for job in flatten_list_or_items(iter(info.jobs.values()))] # type: ignore def list_completed_failed_jobs(self, load_id: str) -> Sequence[str]: return self.storage.list_folder_files(self._get_job_folder_completed_path(load_id, LoadStorage.FAILED_JOBS_FOLDER)) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 792617badd..c9d6f3abb7 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -17,7 +17,7 @@ from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition -from dlt.common.schema.utils import table_schema_has_type 
+from dlt.common.schema.utils import table_schema_has_type, get_table_format from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import LoadJob, FollowupJob from dlt.common.destination.reference import TLoadJobState, NewLoadJob @@ -325,12 +325,13 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries # or if we are in iceberg mode, we create iceberg tables for all tables - is_iceberg = self.schema.tables[table_name].get("write_disposition", None) == "skip" + is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or (self._is_iceberg_table(self.schema.tables[table_name]) and not self.in_staging_mode) columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) # this will fail if the table prefix is not properly defined table_prefix = self.table_prefix_layout.format(table_name=table_name) location = f"{bucket}/{dataset}/{table_prefix}" + # use qualified table names qualified_table_name = self.sql_client.make_qualified_ddl_table_name(table_name) if is_iceberg and not generate_alter: @@ -372,18 +373,14 @@ def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> return super()._create_replace_followup_jobs(table_chain) def _is_iceberg_table(self, table: TTableSchema) -> bool: - return False - - def get_stage_dispositions(self) -> List[TWriteDisposition]: - # in iceberg mode, we always use staging tables - # if self.iceberg_mode: - # return ["append", "replace", "merge"] - return super().get_stage_dispositions() - - def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]: - # if self.iceberg_mode: - # return ["append", "replace", "merge"] - return [] + table_format = get_table_format(self.schema.tables, table["name"]) + return table_format == "iceberg" + + def table_needs_staging(self, table: TTableSchema) -> bool: + # all iceberg tables need staging + if self._is_iceberg_table(table): + return True + return super().table_needs_staging(table) @staticmethod def is_dbapi_exception(ex: Exception) -> bool: diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/bigquery/bigquery.py index a5aa0cc703..eceb2ed57a 100644 --- a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/bigquery/bigquery.py @@ -12,9 +12,10 @@ from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.exceptions import UnknownTableException from dlt.destinations.job_client_impl import SqlJobClientWithStaging -from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException, LoadJobUnknownTableException +from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException from dlt.destinations.bigquery import capabilities from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration @@ -220,7 +221,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> reason = BigQuerySqlClient._get_reason_from_errors(gace) if reason == "notFound": # google.api_core.exceptions.NotFound: 404 - table not found - raise LoadJobUnknownTableException(table["name"], 
file_path) + raise UnknownTableException(table["name"]) elif reason == "duplicate": # google.api_core.exceptions.Conflict: 409 PUT - already exists return self.restore_file_load(file_path) diff --git a/dlt/destinations/exceptions.py b/dlt/destinations/exceptions.py index f0fe32f950..5c20f081f1 100644 --- a/dlt/destinations/exceptions.py +++ b/dlt/destinations/exceptions.py @@ -63,12 +63,6 @@ def __init__(self, file_path: str, message: str) -> None: super().__init__(f"Job with id/file name {file_path} encountered unrecoverable problem: {message}") -class LoadJobUnknownTableException(DestinationTerminalException): - def __init__(self, table_name: str, file_name: str) -> None: - self.table_name = table_name - super().__init__(f"Client does not know table {table_name} for load file {file_name}") - - class LoadJobInvalidStateTransitionException(DestinationTerminalException): def __init__(self, from_state: TLoadJobState, to_state: TLoadJobState) -> None: self.from_state = from_state diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/filesystem/filesystem.py index 3691c6417b..6ad5954496 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/filesystem/filesystem.py @@ -1,14 +1,15 @@ import posixpath import os from types import TracebackType -from typing import ClassVar, List, Type, Iterable, Set +from typing import ClassVar, List, Type, Iterable, Set, Iterator from fsspec import AbstractFileSystem +from contextlib import contextmanager from dlt.common import logger from dlt.common.schema import Schema, TSchemaTables, TTableSchema from dlt.common.storages import FileStorage, LoadStorage, filesystem_from_config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import NewLoadJob, TLoadJobState, LoadJob, JobClientBase, FollowupJob +from dlt.common.destination.reference import NewLoadJob, TLoadJobState, LoadJob, JobClientBase, FollowupJob, WithStagingDataset from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.filesystem import capabilities @@ -68,7 +69,7 @@ def create_followup_jobs(self, next_state: str) -> List[NewLoadJob]: return jobs -class FilesystemClient(JobClientBase): +class FilesystemClient(JobClientBase, WithStagingDataset): """filesystem client storing jobs in memory""" capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -82,16 +83,22 @@ def __init__(self, schema: Schema, config: FilesystemDestinationClientConfigurat # verify files layout. 
we need {table_name} and only allow {schema_name} before it, otherwise tables # cannot be replaced and we cannot initialize folders consistently self.table_prefix_layout = path_utils.get_table_prefix_layout(config.layout) - - @property - def dataset_path(self) -> str: - ds_path = posixpath.join(self.fs_path, self.config.normalize_dataset_name(self.schema)) - return ds_path + self.dataset_path = posixpath.join(self.fs_path, self.config.normalize_dataset_name(self.schema)) def drop_storage(self) -> None: if self.is_storage_initialized(): self.fs_client.rm(self.dataset_path, recursive=True) + @contextmanager + def with_staging_dataset(self) -> Iterator["FilesystemClient"]: + current_dataset_path = self.dataset_path + try: + self.dataset_path = posixpath.join(self.fs_path, self.config.normalize_dataset_name(self.schema)) + "_staging" + yield self + finally: + # restore previous dataset name + self.dataset_path = current_dataset_path + def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # clean up existing files for tables selected for truncating if truncate_tables and self.fs_client.isdir(self.dataset_path): @@ -169,3 +176,9 @@ def __enter__(self) -> "FilesystemClient": def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: pass + + def table_needs_staging(self, table: TTableSchema) -> bool: + # not so nice, how to do it better, collect this info from the main destination as before? + if table["table_format"] == "iceberg": + return True + return super().table_needs_staging(table) diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 91f7d15bee..ec3afced94 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -140,10 +140,8 @@ def maybe_ddl_transaction(self) -> Iterator[None]: else: yield - def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition]: - if self.config.replace_strategy == "truncate-and-insert": - return ["replace"] - return [] + def table_needs_truncating(self, table: TTableSchema) -> bool: + return table["write_disposition"] == "replace" and self.config.replace_strategy == "truncate-and-insert" def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [] @@ -442,10 +440,10 @@ def with_staging_dataset(self)-> Iterator["SqlJobClientBase"]: finally: self.in_staging_mode = False - def get_stage_dispositions(self) -> List[TWriteDisposition]: - """Returns a list of dispositions that require staging tables to be populated""" - dispositions: List[TWriteDisposition] = ["merge"] - # if we have anything but the truncate-and-insert replace strategy, we need staging tables - if self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]: - dispositions.append("replace") - return dispositions + def table_needs_staging(self, table: TTableSchema) -> bool: + if table["write_disposition"] == "merge": + return True + elif table["write_disposition"] == "replace" and (self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]): + return True + return False + diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 84dfcb83f9..e122ad10cd 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -14,7 +14,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TColumnNames, 
TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat from dlt.extract.utils import ensure_table_schema_columns_hint from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage @@ -200,6 +200,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> DltResource: @@ -215,6 +216,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: @@ -230,6 +232,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None ) -> DltResource: @@ -245,6 +248,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, data_from: TUnboundDltResource = None, @@ -313,6 +317,7 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa columns=columns, primary_key=primary_key, merge_key=merge_key, + table_format=table_format ) return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, data_from), incremental=incremental) diff --git a/dlt/extract/schema.py b/dlt/extract/schema.py index 80e9f6f32f..524bfabc0b 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/schema.py @@ -3,7 +3,7 @@ from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns +from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -211,6 +211,7 @@ def new_table_template( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + table_format: TTableHintTemplate[TTableFormat] = None ) -> TTableSchemaTemplate: if not table_name: raise TableNameMissing() @@ -224,7 +225,7 @@ def new_table_template( validator = None # create a table schema template where hints can be functions taking TDataItem new_template: TTableSchemaTemplate = new_table( - table_name, parent_table_name, write_disposition=write_disposition, columns=columns # type: ignore + table_name, parent_table_name, 
write_disposition=write_disposition, columns=columns, table_format=table_format # type: ignore ) if primary_key: new_template["primary_key"] = primary_key diff --git a/dlt/load/load.py b/dlt/load/load.py index 488a0ce4f2..ddce9bf8e9 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -2,7 +2,7 @@ from copy import copy from functools import reduce import datetime # noqa: 251 -from typing import Dict, List, Optional, Tuple, Set, Iterator +from typing import Dict, List, Optional, Tuple, Set, Iterator, Iterable, Callable from multiprocessing.pool import ThreadPool import os @@ -10,20 +10,19 @@ from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.pipeline import LoadInfo, SupportsPipeline -from dlt.common.schema.utils import get_child_tables, get_top_level_table, get_write_disposition +from dlt.common.schema.utils import get_child_tables, get_top_level_table, get_load_table from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState from dlt.common.typing import StrAny from dlt.common.runners import TRunMetrics, Runnable, workermethod from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.runtime.logger import pretty_format_exception from dlt.common.exceptions import TerminalValueError, DestinationTerminalException, DestinationTransientException -from dlt.common.schema import Schema +from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TTableSchema, TWriteDisposition from dlt.common.storages import LoadStorage from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, DestinationReference, LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration from dlt.destinations.job_impl import EmptyLoadJob -from dlt.destinations.exceptions import LoadJobUnknownTableException from dlt.load.configuration import LoaderConfiguration from dlt.load.exceptions import LoadClientJobFailed, LoadClientJobRetry, LoadClientUnsupportedWriteDisposition, LoadClientUnsupportedFileFormats @@ -69,19 +68,6 @@ def create_storage(self, is_storage_owner: bool) -> LoadStorage: ) return load_storage - @staticmethod - def get_load_table(schema: Schema, file_name: str) -> TTableSchema: - table_name = LoadStorage.parse_job_file_name(file_name).table_name - try: - # make a copy of the schema so modifications do not affect the original document - table = copy(schema.get_table(table_name)) - # add write disposition if not specified - in child tables - if "write_disposition" not in table: - table["write_disposition"] = get_write_disposition(schema.tables, table_name) - return table - except KeyError: - raise LoadJobUnknownTableException(table_name, file_name) - def get_destination_client(self, schema: Schema) -> JobClientBase: return self.destination.client(schema, self.initial_client_config) @@ -94,7 +80,7 @@ def is_staging_destination_job(self, file_path: str) -> bool: @contextlib.contextmanager def maybe_with_staging_dataset(self, job_client: JobClientBase, table: TTableSchema) -> Iterator[None]: """Executes job client methods in context of staging dataset if `table` has `write_disposition` that requires it""" - if isinstance(job_client, WithStagingDataset) and table["write_disposition"] in job_client.get_stage_dispositions(): + if isinstance(job_client, WithStagingDataset) and job_client.table_needs_staging(table): with job_client.with_staging_dataset(): yield else: @@ 
-112,7 +98,7 @@ def w_spool_job(self: "Load", file_path: str, load_id: str, schema: Schema) -> O if job_info.file_format not in self.load_storage.supported_file_formats: raise LoadClientUnsupportedFileFormats(job_info.file_format, self.capabilities.supported_loader_file_formats, file_path) logger.info(f"Will load file {file_path} with table name {job_info.table_name}") - table = self.get_load_table(schema, file_path) + table = get_load_table(schema.tables, job_info.table_name) if table["write_disposition"] not in ["append", "replace", "merge"]: raise LoadClientUnsupportedWriteDisposition(job_info.table_name, table["write_disposition"], file_path) with self.maybe_with_staging_dataset(job_client, table): @@ -173,13 +159,8 @@ def retrieve_jobs(self, client: JobClientBase, load_id: str, staging_client: Job return len(jobs), jobs - def get_new_jobs_info(self, load_id: str, schema: Schema, dispositions: List[TWriteDisposition] = None) -> List[ParsedLoadJobFileName]: - jobs_info: List[ParsedLoadJobFileName] = [] - new_job_files = self.load_storage.list_new_jobs(load_id) - for job_file in new_job_files: - if dispositions is None or self.get_load_table(schema, job_file)["write_disposition"] in dispositions: - jobs_info.append(LoadStorage.parse_job_file_name(job_file)) - return jobs_info + def get_new_jobs_info(self, load_id: str) -> List[ParsedLoadJobFileName]: + return [LoadStorage.parse_job_file_name(job_file) for job_file in self.load_storage.list_new_jobs(load_id)] def get_completed_table_chain(self, load_id: str, schema: Schema, top_merged_table: TTableSchema, being_completed_job_id: str = None) -> List[TTableSchema]: """Gets a table chain starting from the `top_merged_table` containing only tables with completed/failed jobs. None is returned if there's any job that is not completed @@ -210,7 +191,7 @@ def create_followup_jobs(self, load_id: str, state: TLoadJobState, starting_job: starting_job_file_name = starting_job.file_name() if state == "completed" and not self.is_staging_destination_job(starting_job_file_name): client = self.destination.client(schema, self.initial_client_config) - top_job_table = get_top_level_table(schema.tables, self.get_load_table(schema, starting_job_file_name)["name"]) + top_job_table = get_top_level_table(schema.tables, starting_job.job_file_info().table_name) # if all tables of chain completed, create follow up jobs if table_chain := self.get_completed_table_chain(load_id, schema, top_job_table, starting_job.job_file_info().job_id()): if follow_up_jobs := client.create_table_chain_completed_followup_jobs(table_chain): @@ -278,56 +259,56 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) self.load_storage.complete_load_package(load_id, aborted) logger.info(f"All jobs completed, archiving package {load_id} with aborted set to {aborted}") - def get_table_chain_tables_for_write_disposition(self, load_id: str, schema: Schema, dispositions: List[TWriteDisposition]) -> Set[str]: + @staticmethod + def _get_table_chain_tables_with_filter(schema: Schema, filter: Callable, tables_with_jobs: Iterable[str]) -> Set[str]: """Get all jobs for tables with given write disposition and resolve the table chain""" result: Set[str] = set() - table_jobs = self.get_new_jobs_info(load_id, schema, dispositions) - for job in table_jobs: - top_job_table = get_top_level_table(schema.tables, self.get_load_table(schema, job.job_id())["name"]) - table_chain = get_child_tables(schema.tables, top_job_table["name"]) - for table in table_chain: - existing_jobs = 
self.load_storage.list_jobs_for_table(load_id, table["name"]) - # only add tables for tables that have jobs unless the disposition is replace - if not existing_jobs and top_job_table["write_disposition"] != "replace": - continue + for table_name in tables_with_jobs: + top_job_table = get_top_level_table(schema.tables, table_name) + if not filter(top_job_table): + continue + for table in get_child_tables(schema.tables, top_job_table["name"]): result.add(table["name"]) return result + @staticmethod + def _init_client_and_update_schema(job_client: JobClientBase, expected_update: TSchemaTables, update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False) -> TSchemaTables: + staging_text = "for staging dataset" if staging_info else "" + logger.info(f"Client for {job_client.config.destination_name} will start initialize storage {staging_text}") + job_client.initialize_storage() + logger.info(f"Client for {job_client.config.destination_name} will update schema to package schema {staging_text}") + applied_update = job_client.update_stored_schema(only_tables=update_tables, expected_update=expected_update) + logger.info(f"Client for {job_client.config.destination_name} will truncate tables {staging_text}") + job_client.initialize_storage(truncate_tables=truncate_tables) + return applied_update + def load_single_package(self, load_id: str, schema: Schema) -> None: # initialize analytical storage ie. create dataset required by passed schema - job_client: JobClientBase with self.get_destination_client(schema) as job_client: - expected_update = self.load_storage.begin_schema_update(load_id) - if expected_update is not None: - # update the default dataset - logger.info(f"Client for {job_client.config.destination_name} will start initialize storage") - job_client.initialize_storage() - logger.info(f"Client for {job_client.config.destination_name} will update schema to package schema") - all_jobs = self.get_new_jobs_info(load_id, schema) - all_tables = set(job.table_name for job in all_jobs) + + if (expected_update := self.load_storage.begin_schema_update(load_id)) is not None: + + tables_with_jobs = set(job.table_name for job in self.get_new_jobs_info(load_id)) dlt_tables = set(t["name"] for t in schema.dlt_tables()) + + # update the default dataset + truncate_tables = self._get_table_chain_tables_with_filter(schema, job_client.table_needs_truncating, tables_with_jobs) + applied_update = self._init_client_and_update_schema(job_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) + # only update tables that are present in the load package - applied_update = job_client.update_stored_schema(only_tables=all_tables | dlt_tables, expected_update=expected_update) - truncate_tables = self.get_table_chain_tables_for_write_disposition(load_id, schema, job_client.get_truncate_destination_table_dispositions()) - job_client.initialize_storage(truncate_tables=truncate_tables) - # initialize staging storage if needed if self.staging_destination: with self.get_staging_destination_client(schema) as staging_client: - truncate_dispositions = staging_client.get_truncate_destination_table_dispositions() - truncate_dispositions.extend(job_client.get_truncate_staging_destination_table_dispositions()) - truncate_tables = self.get_table_chain_tables_for_write_disposition(load_id, schema, truncate_dispositions) - staging_client.initialize_storage(truncate_tables) + truncate_tables = self._get_table_chain_tables_with_filter(schema, job_client.table_needs_truncating, 
tables_with_jobs) + self._init_client_and_update_schema(staging_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) + # update the staging dataset if client supports this if isinstance(job_client, WithStagingDataset): - if staging_tables := self.get_table_chain_tables_for_write_disposition(load_id, schema, job_client.get_stage_dispositions()): + if staging_tables := self._get_table_chain_tables_with_filter(schema, job_client.table_needs_staging, tables_with_jobs): with job_client.with_staging_dataset(): - logger.info(f"Client for {job_client.config.destination_name} will start initialize STAGING storage") - job_client.initialize_storage() - logger.info(f"Client for {job_client.config.destination_name} will UPDATE STAGING SCHEMA to package schema") - job_client.update_stored_schema(only_tables=staging_tables | {schema.version_table_name}, expected_update=expected_update) - logger.info(f"Client for {job_client.config.destination_name} will TRUNCATE STAGING TABLES: {staging_tables}") - job_client.initialize_storage(truncate_tables=staging_tables) + self._init_client_and_update_schema(job_client, expected_update, staging_tables | {schema.version_table_name}, staging_tables, staging_info=True) + self.load_storage.commit_schema_update(load_id, applied_update) + # initialize staging destination and spool or retrieve unfinished jobs if self.staging_destination: with self.get_staging_destination_client(schema) as staging_client: diff --git a/tests/load/athena_iceberg/test_athena_iceberg.py b/tests/load/athena_iceberg/test_athena_iceberg.py index e1650549cc..72772b0e2d 100644 --- a/tests/load/athena_iceberg/test_athena_iceberg.py +++ b/tests/load/athena_iceberg/test_athena_iceberg.py @@ -14,14 +14,20 @@ from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration from tests.utils import skip_if_not_active +from dlt.destinations.exceptions import DatabaseTerminalException + skip_if_not_active("athena") def test_iceberg() -> None: + """ + We write two tables, one with the iceberg flag, one without. We expect the iceberg table and its subtables to accept update commands + and the other table to reject them. 
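# A minimal sketch of how the `table_format` hint introduced in this patch can
# be declared on a resource; the resource body, pipeline and bucket below are
# placeholders and not part of the test that follows.
import dlt

@dlt.resource(name="events", write_disposition="append", table_format="iceberg")
def events():
    # the nested list becomes a child table that inherits the iceberg format
    yield {"id": 1, "sub_items": [{"name": "child"}]}

# pipeline = dlt.pipeline(destination="athena", staging="filesystem")
# pipeline.run(events)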
+ """ os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = "s3://dlt-ci-test-bucket" - pipeline = dlt.pipeline(pipeline_name="aaathena-iceberg", destination="athena", staging="filesystem", full_refresh=True) + pipeline = dlt.pipeline(pipeline_name="aaaaathena-iceberg", destination="athena", staging="filesystem", full_refresh=True) def items() -> Iterator[Any]: yield { @@ -40,22 +46,33 @@ def items() -> Iterator[Any]: def items_normal(): yield from items() - @dlt.resource(name="items_iceberg", write_disposition="append") + @dlt.resource(name="items_iceberg", write_disposition="append", table_format="iceberg") def items_iceberg(): yield from items() print(pipeline.run([items_normal, items_iceberg])) - return - # see if we have athena tables with items table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values() ]) - assert table_counts["items"] == 1 - assert table_counts["items__sub_items"] == 2 + assert table_counts["items_normal"] == 1 + assert table_counts["items_normal__sub_items"] == 2 assert table_counts["_dlt_loads"] == 1 - pipeline.run(items) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values() ]) - assert table_counts["items"] == 2 - assert table_counts["items__sub_items"] == 4 - assert table_counts["_dlt_loads"] == 2 \ No newline at end of file + assert table_counts["items_iceberg"] == 1 + assert table_counts["items_iceberg__sub_items"] == 2 + + with pipeline.sql_client() as client: + client.execute_sql("SELECT * FROM items_normal") + + # modifying regular athena table will fail + with pytest.raises(DatabaseTerminalException) as dbex: + client.execute_sql("UPDATE items_normal SET name='new name'") + assert "Modifying Hive table rows is only supported for transactional tables" in str(dbex) + with pytest.raises(DatabaseTerminalException) as dbex: + client.execute_sql("UPDATE items_normal__sub_items SET name='super new name'") + assert "Modifying Hive table rows is only supported for transactional tables" in str(dbex) + + # modifying iceberg table will succeed + client.execute_sql("UPDATE items_iceberg SET name='new name'") + client.execute_sql("UPDATE items_iceberg__sub_items SET name='super new name'") + diff --git a/tests/load/utils.py b/tests/load/utils.py index a615b696e3..9fd4f033b7 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -17,7 +17,7 @@ from dlt.common.data_writers import DataWriter from dlt.common.schema import TColumnSchema, TTableSchemaColumns, Schema from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration -from dlt.common.schema.utils import new_table +from dlt.common.schema.utils import new_table, get_load_table from dlt.common.storages.load_storage import ParsedLoadJobFileName, LoadStorage from dlt.common.typing import StrAny from dlt.common.utils import uniq_id @@ -170,7 +170,7 @@ def load_table(name: str) -> Dict[str, TTableSchemaColumns]: def expect_load_file(client: JobClientBase, file_storage: FileStorage, query: str, table_name: str, status = "completed") -> LoadJob: file_name = ParsedLoadJobFileName(table_name, uniq_id(), 0, client.capabilities.preferred_loader_file_format).job_id() file_storage.save(file_name, query.encode("utf-8")) - table = Load.get_load_table(client.schema, file_name) + table = get_load_table(client.schema.tables, table_name) job = client.start_file_load(table, file_storage.make_full_path(file_name), uniq_id()) while job.state() == "running": sleep(0.5) diff --git a/tests/utils.py 
b/tests/utils.py index 2d675f514a..7321049c9d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -34,6 +34,7 @@ # filter out active destinations for current tests ACTIVE_DESTINATIONS = set(dlt.config.get("ACTIVE_DESTINATIONS", list) or IMPLEMENTED_DESTINATIONS) +# ACTIVE_DESTINATIONS = {"duckdb"} ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) From 122d035d147114562639a4651f9ad38f80eca33d Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 11 Oct 2023 16:55:29 +0200 Subject: [PATCH 19/36] PR changes --- .github/workflows/test_destination_athena.yml | 1 + .../test_destination_athena_iceberg.yml | 94 +++++++++++++++++++ dlt/common/destination/reference.py | 17 +++- dlt/common/schema/utils.py | 16 +--- dlt/destinations/athena/athena.py | 10 +- dlt/destinations/athena/configuration.py | 1 + dlt/load/load.py | 4 +- tests/load/test_dummy_client.py | 11 +-- tests/load/utils.py | 9 +- tests/utils.py | 4 + 10 files changed, 135 insertions(+), 32 deletions(-) create mode 100644 .github/workflows/test_destination_athena_iceberg.yml diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 16c9caff53..704e66522b 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -21,6 +21,7 @@ env: RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB ACTIVE_DESTINATIONS: "[\"athena\"]" ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" + EXCLUDED_DESTINATION_CONFIGURATIONS: "[\"athena-parquet-staging-iceberg\"]" jobs: get_docs_changes: diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml new file mode 100644 index 0000000000..6892a96bf1 --- /dev/null +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -0,0 +1,94 @@ + +name: test athena iceberg + +on: + pull_request: + branches: + - master + - devel + workflow_dispatch: + +env: + DESTINATION__FILESYSTEM__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4J46G55G4 + DESTINATION__FILESYSTEM__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + DESTINATION__ATHENA__CREDENTIALS__AWS_ACCESS_KEY_ID: AKIAT4QMVMC4J46G55G4 + DESTINATION__ATHENA__CREDENTIALS__AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + DESTINATION__ATHENA__CREDENTIALS__REGION_NAME: eu-central-1 + DESTINATION__ATHENA__QUERY_RESULT_BUCKET: s3://dlt-athena-output + + RUNTIME__SENTRY_DSN: https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752 + RUNTIME__LOG_LEVEL: ERROR + RUNTIME__DLTHUB_TELEMETRY_SEGMENT_WRITE_KEY: TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB + ACTIVE_DESTINATIONS: "[\"athena\"]" + ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" + EXCLUDED_DESTINATION_CONFIGURATIONS: "[\"athena-no-staging\"]" + +jobs: + get_docs_changes: + uses: ./.github/workflows/get_docs_changes.yml + # Tests that require credentials do not run in forks + if: ${{ !github.event.pull_request.head.repo.fork }} + + run_loader: + name: test destination athena iceberg + needs: get_docs_changes + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' + strategy: + fail-fast: false + matrix: + os: ["ubuntu-latest"] + # os: ["ubuntu-latest", "macos-latest", "windows-latest"] + defaults: + run: + shell: bash + runs-on: ${{ matrix.os }} + + steps: + + - name: Check out + uses: actions/checkout@master + + - name: Setup Python + uses: actions/setup-python@v4 + with: + 
python-version: "3.10.x" + + - name: Install Poetry + uses: snok/install-poetry@v1.3.2 + with: + virtualenvs-create: true + virtualenvs-in-project: true + installer-parallel: true + + - name: Load cached venv + id: cached-poetry-dependencies + uses: actions/cache@v3 + with: + # path: ${{ steps.pip-cache.outputs.dir }} + path: .venv + key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-athena + + - name: Install dependencies + # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' + run: poetry install --no-interaction -E athena + + - run: | + poetry run pytest tests/load + if: runner.os != 'Windows' + name: Run tests Linux/MAC + - run: | + poetry run pytest tests/load + if: runner.os == 'Windows' + name: Run tests Windows + shell: cmd + + matrix_job_required_check: + name: Redshift, PostgreSQL and DuckDB tests + needs: run_loader + runs-on: ubuntu-latest + if: always() + steps: + - name: Check matrix job results + if: contains(needs.*.result, 'failure') || contains(needs.*.result, 'cancelled') + run: | + echo "One or more matrix job tests failed or were cancelled. You may need to re-run them." && exit 1 diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 5fea462159..c2dc0ebcc6 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -4,18 +4,20 @@ from typing import ClassVar, Final, Optional, NamedTuple, Literal, Sequence, Iterable, Type, Protocol, Union, TYPE_CHECKING, cast, List, ContextManager, Dict, Any from contextlib import contextmanager import datetime # noqa: 251 +from copy import deepcopy from dlt.common import logger from dlt.common.exceptions import IdentifierTooLongException, InvalidDestinationReference, UnknownDestinationModule from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.typing import TWriteDisposition from dlt.common.schema.exceptions import InvalidDatasetName -from dlt.common.schema.utils import get_load_table +from dlt.common.schema.utils import get_write_disposition, get_table_format from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.schema.utils import is_complete_column +from dlt.common.schema.exceptions import UnknownTableException from dlt.common.storages import FileStorage from dlt.common.storages.load_storage import ParsedLoadJobFileName from dlt.common.utils import get_module_name @@ -287,6 +289,19 @@ def _verify_schema(self) -> None: if not is_complete_column(column): logger.warning(f"A column {column_name} in table {table_name} in schema {self.schema.name} is incomplete. It was not bound to the data during normalizations stage and its data type is unknown. Did you add this column manually in code ie. 
as a merge key?") + def get_load_table(self, table_name: str, prepare_for_staging: bool = False) -> TTableSchema: + try: + # make a copy of the schema so modifications do not affect the original document + table = deepcopy(self.schema.tables[table_name]) + # add write disposition if not specified - in child tables + if "write_disposition" not in table: + table["write_disposition"] = get_write_disposition(self.schema.tables, table_name) + if "table_format" not in table: + table["table_format"] = get_table_format(self.schema.tables, table_name) + return table + except KeyError: + raise UnknownTableException(table_name) + class WithStateSync(ABC): diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 93f0913550..b89925d9b0 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -511,11 +511,11 @@ def get_inherited_table_hint(tables: TSchemaTables, table_name: str, table_hint_ def get_write_disposition(tables: TSchemaTables, table_name: str) -> TWriteDisposition: """Returns table hint of a table if present. If not, looks up into parent table""" - return get_inherited_table_hint(tables, table_name, "write_disposition", allow_none=False) + return cast(TWriteDisposition, get_inherited_table_hint(tables, table_name, "write_disposition", allow_none=False)) def get_table_format(tables: TSchemaTables, table_name: str) -> TTableFormat: - return get_inherited_table_hint(tables, table_name, "table_format", allow_none=True) + return cast(TTableFormat, get_inherited_table_hint(tables, table_name, "table_format", allow_none=True)) def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool: @@ -536,18 +536,6 @@ def get_top_level_table(tables: TSchemaTables, table_name: str) -> TTableSchema: return get_top_level_table(tables, parent) return table -def get_load_table(tables: TSchemaTables, table_name: str) -> TTableSchema: - try: - # make a copy of the schema so modifications do not affect the original document - table = copy(tables[table_name]) - # add write disposition if not specified - in child tables - if "write_disposition" not in table: - table["write_disposition"] = get_write_disposition(tables, table_name) - if "table_format" not in table: - table["table_format"] = get_table_format(tables, table_name) - return table - except KeyError: - raise UnknownTableException(table_name) def get_child_tables(tables: TSchemaTables, table_name: str) -> List[TTableSchema]: """Get child tables for table name and return a list of tables ordered by ancestry so the child tables are always after their parents""" diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index c9d6f3abb7..514d868047 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -15,7 +15,7 @@ from dlt.common import logger from dlt.common.utils import without_none from dlt.common.data_types import TDataType -from dlt.common.schema import TColumnSchema, Schema +from dlt.common.schema import TColumnSchema, Schema, TSchemaTables, TTableSchema from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition from dlt.common.schema.utils import table_schema_has_type, get_table_format from dlt.common.destination import DestinationCapabilitiesContext @@ -325,7 +325,7 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries # or if we are in iceberg mode, we create iceberg tables for all tables 
- is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or (self._is_iceberg_table(self.schema.tables[table_name]) and not self.in_staging_mode) + is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or self._is_iceberg_table(self.schema.tables[table_name]) columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) # this will fail if the table prefix is not properly defined @@ -381,6 +381,12 @@ def table_needs_staging(self, table: TTableSchema) -> bool: if self._is_iceberg_table(table): return True return super().table_needs_staging(table) + + def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: + table = super().get_load_table(table_name, staging) + if staging and table.get("table_format", None) == "iceberg": + table.pop("table_format") + return table @staticmethod def is_dbapi_exception(ex: Exception) -> bool: diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/athena/configuration.py index 7eca85fe41..a7f05e520d 100644 --- a/dlt/destinations/athena/configuration.py +++ b/dlt/destinations/athena/configuration.py @@ -13,6 +13,7 @@ class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration): athena_work_group: Optional[str] = None aws_data_catalog: Optional[str] = "awsdatacatalog" supports_truncate_command: bool = False + force_iceberg: Optional[bool] = True __config_gen_annotations__: ClassVar[List[str]] = ["athena_work_group"] diff --git a/dlt/load/load.py b/dlt/load/load.py index ddce9bf8e9..e1873761a6 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -10,7 +10,7 @@ from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.pipeline import LoadInfo, SupportsPipeline -from dlt.common.schema.utils import get_child_tables, get_top_level_table, get_load_table +from dlt.common.schema.utils import get_child_tables, get_top_level_table from dlt.common.storages.load_storage import LoadPackageInfo, ParsedLoadJobFileName, TJobState from dlt.common.typing import StrAny from dlt.common.runners import TRunMetrics, Runnable, workermethod @@ -98,7 +98,7 @@ def w_spool_job(self: "Load", file_path: str, load_id: str, schema: Schema) -> O if job_info.file_format not in self.load_storage.supported_file_formats: raise LoadClientUnsupportedFileFormats(job_info.file_format, self.capabilities.supported_loader_file_formats, file_path) logger.info(f"Will load file {file_path} with table name {job_info.table_name}") - table = get_load_table(schema.tables, job_info.table_name) + table = job_client.get_load_table(job_info.table_name) if table["write_disposition"] not in ["append", "replace", "merge"]: raise LoadClientUnsupportedWriteDisposition(job_info.table_name, table["write_disposition"], file_path) with self.maybe_with_staging_dataset(job_client, table): diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index dcea7bd94d..aaa89ebfb1 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -100,16 +100,7 @@ def test_get_new_jobs_info() -> None: ) # no write disposition specified - get all new jobs - assert len(load.get_new_jobs_info(load_id, schema)) == 2 - # empty list - none - assert len(load.get_new_jobs_info(load_id, schema, [])) == 0 - # two appends - assert len(load.get_new_jobs_info(load_id, schema, ["append"])) == 2 - assert len(load.get_new_jobs_info(load_id, schema, ["replace"])) == 0 - assert 
len(load.get_new_jobs_info(load_id, schema, ["replace", "append"])) == 2 - - load.load_storage.start_job(load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl") - assert len(load.get_new_jobs_info(load_id, schema, ["replace", "append"])) == 1 + assert len(load.get_new_jobs_info(load_id)) == 2 def test_get_completed_table_chain_single_job_per_table() -> None: diff --git a/tests/load/utils.py b/tests/load/utils.py index 9fd4f033b7..60f389d064 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -17,7 +17,7 @@ from dlt.common.data_writers import DataWriter from dlt.common.schema import TColumnSchema, TTableSchemaColumns, Schema from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration -from dlt.common.schema.utils import new_table, get_load_table +from dlt.common.schema.utils import new_table from dlt.common.storages.load_storage import ParsedLoadJobFileName, LoadStorage from dlt.common.typing import StrAny from dlt.common.utils import uniq_id @@ -26,7 +26,7 @@ from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase -from tests.utils import ACTIVE_DESTINATIONS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS +from tests.utils import ACTIVE_DESTINATIONS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS from tests.cases import TABLE_UPDATE_COLUMNS_SCHEMA, TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES, assert_all_data_types_row # bucket urls @@ -53,6 +53,7 @@ class DestinationTestConfiguration: staging_iam_role: Optional[str] = None extra_info: Optional[str] = None supports_merge: bool = True # TODO: take it from client base class + force_iceberg: bool = True @property def name(self) -> str: @@ -72,6 +73,7 @@ def setup(self) -> None: os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = self.bucket_url or "" os.environ['DESTINATION__STAGE_NAME'] = self.stage_name or "" os.environ['DESTINATION__STAGING_IAM_ROLE'] = self.staging_iam_role or "" + os.environ['DESTINATION__ATHENA__FORCE_ICEBERG'] = str(self.force_iceberg) or "" """For the filesystem destinations we disable compression to make analyzing the result easier""" if self.destination == "filesystem": @@ -108,6 +110,7 @@ def destinations_configs( destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though destination_configs += [DestinationTestConfiguration(destination="athena", supports_merge=False, bucket_url=AWS_BUCKET)] + destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, extra_info="iceberg")] if default_vector_configs: # for now only weaviate @@ -170,7 +173,7 @@ def load_table(name: str) -> Dict[str, TTableSchemaColumns]: def expect_load_file(client: JobClientBase, file_storage: FileStorage, query: str, table_name: str, status = "completed") -> LoadJob: file_name = ParsedLoadJobFileName(table_name, uniq_id(), 0, client.capabilities.preferred_loader_file_format).job_id() file_storage.save(file_name, query.encode("utf-8")) - table = get_load_table(client.schema.tables, table_name) + table = client.get_load_table(table_name) job = client.start_file_load(table, file_storage.make_full_path(file_name), uniq_id()) while job.state() == "running": sleep(0.5) diff --git a/tests/utils.py 
b/tests/utils.py index 7321049c9d..2a0238cdfe 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -31,6 +31,10 @@ NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck"} SQL_DESTINATIONS = IMPLEMENTED_DESTINATIONS - NON_SQL_DESTINATIONS +# exclude destination configs (for now used for athena and athena iceberg separation) +EXCLUDED_DESTINATION_CONFIGURATIONS = set(dlt.config.get("EXCLUDED_DESTINATION_CONFIGURATIONS", list) or set()) + + # filter out active destinations for current tests ACTIVE_DESTINATIONS = set(dlt.config.get("ACTIVE_DESTINATIONS", list) or IMPLEMENTED_DESTINATIONS) From 7750318f99b9587955a2a9ad3db3f3ee03c24894 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 11 Oct 2023 17:05:50 +0200 Subject: [PATCH 20/36] small changes --- dlt/destinations/athena/configuration.py | 2 +- dlt/destinations/filesystem/configuration.py | 1 + dlt/destinations/filesystem/filesystem.py | 2 +- tests/load/utils.py | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/athena/configuration.py index a7f05e520d..5dd1341c34 100644 --- a/dlt/destinations/athena/configuration.py +++ b/dlt/destinations/athena/configuration.py @@ -13,7 +13,7 @@ class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration): athena_work_group: Optional[str] = None aws_data_catalog: Optional[str] = "awsdatacatalog" supports_truncate_command: bool = False - force_iceberg: Optional[bool] = True + force_iceberg: Optional[bool] = False __config_gen_annotations__: ClassVar[List[str]] = ["athena_work_group"] diff --git a/dlt/destinations/filesystem/configuration.py b/dlt/destinations/filesystem/configuration.py index 174dfafb1a..8939060231 100644 --- a/dlt/destinations/filesystem/configuration.py +++ b/dlt/destinations/filesystem/configuration.py @@ -10,6 +10,7 @@ @configspec class FilesystemDestinationClientConfiguration(FilesystemConfiguration, DestinationClientStagingConfiguration): # type: ignore[misc] destination_name: Final[str] = "filesystem" # type: ignore + force_iceberg: Optional[bool] = False @resolve_type('credentials') def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/filesystem/filesystem.py index 6ad5954496..f3d5a04f2d 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/filesystem/filesystem.py @@ -179,6 +179,6 @@ def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb def table_needs_staging(self, table: TTableSchema) -> bool: # not so nice, how to do it better, collect this info from the main destination as before? 
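# Self-contained sketch of the check changed just below: a table goes through
# the staging dataset when it carries the iceberg format hint or when the
# destination is forced into iceberg mode (e.g. DESTINATION__FORCE_ICEBERG=true).
# The table is modelled as a plain dict here.
from typing import Any, Dict

def needs_staging_dataset(table: Dict[str, Any], force_iceberg: bool) -> bool:
    return table.get("table_format") == "iceberg" or force_iceberg

assert needs_staging_dataset({"name": "items", "table_format": "iceberg"}, force_iceberg=False)
assert needs_staging_dataset({"name": "items"}, force_iceberg=True)
assert not needs_staging_dataset({"name": "items"}, force_iceberg=False)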
- if table["table_format"] == "iceberg": + if table["table_format"] == "iceberg" or self.config.force_iceberg: return True return super().table_needs_staging(table) diff --git a/tests/load/utils.py b/tests/load/utils.py index 60f389d064..13ce5dcc0a 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -73,7 +73,7 @@ def setup(self) -> None: os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = self.bucket_url or "" os.environ['DESTINATION__STAGE_NAME'] = self.stage_name or "" os.environ['DESTINATION__STAGING_IAM_ROLE'] = self.staging_iam_role or "" - os.environ['DESTINATION__ATHENA__FORCE_ICEBERG'] = str(self.force_iceberg) or "" + os.environ['DESTINATION__FORCE_ICEBERG'] = str(self.force_iceberg) or "" """For the filesystem destinations we disable compression to make analyzing the result easier""" if self.destination == "filesystem": From 0deecdaf0c824a745754e9c3d44b8afd93742bc7 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 11 Oct 2023 17:17:31 +0200 Subject: [PATCH 21/36] small changes --- dlt/common/destination/reference.py | 2 +- dlt/destinations/filesystem/filesystem.py | 4 ++-- dlt/destinations/job_client_impl.py | 2 +- dlt/load/load.py | 16 ++++++++-------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index c2dc0ebcc6..b3d54571b0 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -324,7 +324,7 @@ class WithStagingDataset(ABC): """Adds capability to use staging dataset and request it from the loader""" @abstractmethod - def table_needs_staging(self, table: TTableSchema) -> bool: + def table_needs_staging_dataset(self, table: TTableSchema) -> bool: return False @abstractmethod diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/filesystem/filesystem.py index f3d5a04f2d..629f39f220 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/filesystem/filesystem.py @@ -177,8 +177,8 @@ def __enter__(self) -> "FilesystemClient": def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: pass - def table_needs_staging(self, table: TTableSchema) -> bool: + def table_needs_staging_dataset(self, table: TTableSchema) -> bool: # not so nice, how to do it better, collect this info from the main destination as before? 
if table["table_format"] == "iceberg" or self.config.force_iceberg: return True - return super().table_needs_staging(table) + return super().table_needs_staging_dataset(table) diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index ec3afced94..fa657db39e 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -440,7 +440,7 @@ def with_staging_dataset(self)-> Iterator["SqlJobClientBase"]: finally: self.in_staging_mode = False - def table_needs_staging(self, table: TTableSchema) -> bool: + def table_needs_staging_dataset(self, table: TTableSchema) -> bool: if table["write_disposition"] == "merge": return True elif table["write_disposition"] == "replace" and (self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]): diff --git a/dlt/load/load.py b/dlt/load/load.py index e1873761a6..4a29db3282 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -80,7 +80,7 @@ def is_staging_destination_job(self, file_path: str) -> bool: @contextlib.contextmanager def maybe_with_staging_dataset(self, job_client: JobClientBase, table: TTableSchema) -> Iterator[None]: """Executes job client methods in context of staging dataset if `table` has `write_disposition` that requires it""" - if isinstance(job_client, WithStagingDataset) and job_client.table_needs_staging(table): + if isinstance(job_client, WithStagingDataset) and job_client.table_needs_staging_dataset(table): with job_client.with_staging_dataset(): yield else: @@ -295,18 +295,18 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: truncate_tables = self._get_table_chain_tables_with_filter(schema, job_client.table_needs_truncating, tables_with_jobs) applied_update = self._init_client_and_update_schema(job_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) - # only update tables that are present in the load package - if self.staging_destination: - with self.get_staging_destination_client(schema) as staging_client: - truncate_tables = self._get_table_chain_tables_with_filter(schema, job_client.table_needs_truncating, tables_with_jobs) - self._init_client_and_update_schema(staging_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) - # update the staging dataset if client supports this if isinstance(job_client, WithStagingDataset): - if staging_tables := self._get_table_chain_tables_with_filter(schema, job_client.table_needs_staging, tables_with_jobs): + if staging_tables := self._get_table_chain_tables_with_filter(schema, job_client.table_needs_staging_dataset, tables_with_jobs): with job_client.with_staging_dataset(): self._init_client_and_update_schema(job_client, expected_update, staging_tables | {schema.version_table_name}, staging_tables, staging_info=True) + # only update tables that are present in the load package + if self.staging_destination: + with self.get_staging_destination_client(schema) as staging_client: + truncate_tables = self._get_table_chain_tables_with_filter(schema, staging_client.table_needs_truncating, tables_with_jobs) + self._init_client_and_update_schema(staging_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) + self.load_storage.commit_schema_update(load_id, applied_update) # initialize staging destination and spool or retrieve unfinished jobs From 702fd4b610937e248feffbbf937aaac2d165676e Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 11 Oct 2023 21:02:56 +0200 Subject: [PATCH 22/36] fix two tests --- tests/load/test_job_client.py | 6 +++--- 1 
file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 030f9171d9..918eac67a8 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -428,14 +428,14 @@ def test_load_with_all_types(client: SqlJobClientBase, write_disposition: TWrite client.schema.bump_version() client.update_stored_schema() - if write_disposition in client.get_stage_dispositions(): # type: ignore[attr-defined] + if client.table_needs_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] with client.with_staging_dataset(): # type: ignore[attr-defined] # create staging for merge dataset client.initialize_storage() client.update_stored_schema() with client.sql_client.with_staging_dataset( - write_disposition in client.get_stage_dispositions() # type: ignore[attr-defined] + client.table_needs_staging_dataset(client.schema.tables[table_name]) # type: ignore[attr-defined] ): canonical_name = client.sql_client.make_qualified_table_name(table_name) # write row @@ -493,7 +493,7 @@ def test_write_dispositions(client: SqlJobClientBase, write_disposition: TWriteD with io.BytesIO() as f: write_dataset(client, f, [table_row], TABLE_UPDATE_COLUMNS_SCHEMA) query = f.getvalue().decode() - if write_disposition in client.get_stage_dispositions(): # type: ignore[attr-defined] + if client.table_needs_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] # load to staging dataset on merge with client.with_staging_dataset(): # type: ignore[attr-defined] expect_load_file(client, file_storage, query, t) From d70985d3f16f14f42d43ba54a14393e2213a3e28 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 11 Oct 2023 21:21:12 +0200 Subject: [PATCH 23/36] add missing athena fixes --- dlt/destinations/athena/athena.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 514d868047..1888354a34 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -325,7 +325,7 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries # or if we are in iceberg mode, we create iceberg tables for all tables - is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or self._is_iceberg_table(self.schema.tables[table_name]) + is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or (self._is_iceberg_table(self.schema.tables[table_name]) and not self.in_staging_mode) columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) # this will fail if the table prefix is not properly defined @@ -374,18 +374,18 @@ def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> def _is_iceberg_table(self, table: TTableSchema) -> bool: table_format = get_table_format(self.schema.tables, table["name"]) - return table_format == "iceberg" + return table_format == "iceberg" or self.config.force_iceberg - def table_needs_staging(self, table: TTableSchema) -> bool: + def table_needs_staging_dataset(self, table: TTableSchema) -> bool: # all iceberg tables need staging if self._is_iceberg_table(table): return True - return super().table_needs_staging(table) + return super().table_needs_staging_dataset(table) def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: table = 
super().get_load_table(table_name, staging) - if staging and table.get("table_format", None) == "iceberg": - table.pop("table_format") + # if staging and table.get("table_format", None) == "iceberg": + # table.pop("table_format") return table @staticmethod From 06dbaebe5926b4be3770fd2d58b2d9fd4ad4caa0 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 12 Oct 2023 11:03:54 +0200 Subject: [PATCH 24/36] small changes --- dlt/common/schema/utils.py | 2 +- dlt/destinations/filesystem/filesystem.py | 2 +- tests/load/utils.py | 5 ++++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index b89925d9b0..3c0f663fcf 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -494,7 +494,7 @@ def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTabl def get_inherited_table_hint(tables: TSchemaTables, table_name: str, table_hint_name: str, allow_none: bool = False) -> Any: - table = tables[table_name] + table = tables.get(table_name, {}) hint = table.get(table_hint_name) if hint: return hint diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/filesystem/filesystem.py index 715713d8e9..1902a926e3 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/filesystem/filesystem.py @@ -179,6 +179,6 @@ def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb def table_needs_staging_dataset(self, table: TTableSchema) -> bool: # not so nice, how to do it better, collect this info from the main destination as before? - if table["table_format"] == "iceberg" or self.config.force_iceberg: + if table.get("table_format") == "iceberg" or self.config.force_iceberg: return True return super().table_needs_staging_dataset(table) diff --git a/tests/load/utils.py b/tests/load/utils.py index 13ce5dcc0a..57c97c830c 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -110,7 +110,7 @@ def destinations_configs( destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though destination_configs += [DestinationTestConfiguration(destination="athena", supports_merge=False, bucket_url=AWS_BUCKET)] - destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, extra_info="iceberg")] + # destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, extra_info="iceberg")] if default_vector_configs: # for now only weaviate @@ -154,6 +154,9 @@ def destinations_configs( if exclude: destination_configs = [conf for conf in destination_configs if conf.destination not in exclude] + # filter out excluded configs + destination_configs = [conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS] + return destination_configs From baa5e44091ed35aa36445f4fe3b99db3c6051041 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 12 Oct 2023 17:00:27 +0200 Subject: [PATCH 25/36] fixes --- dlt/common/schema/utils.py | 4 ++-- dlt/common/storages/load_storage.py | 2 +- dlt/destinations/athena/athena.py | 4 ++-- dlt/destinations/filesystem/filesystem.py | 2 +- dlt/extract/decorators.py | 1 + dlt/load/load.py | 15 ++++++++------- 
tests/load/utils.py | 5 ++--- 7 files changed, 17 insertions(+), 16 deletions(-) diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 3c0f663fcf..6d001c8b90 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -15,7 +15,7 @@ from dlt.common.validation import TCustomValidator, validate_dict, validate_dict_ignoring_xkeys from dlt.common.schema import detections from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, - TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, TTableFormat, + TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, TTableFormat, TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition) from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, TablePropertiesConflictException, InvalidSchemaName, UnknownTableException) @@ -502,7 +502,7 @@ def get_inherited_table_hint(tables: TSchemaTables, table_name: str, table_hint_ parent = table.get("parent") if parent: return get_inherited_table_hint(tables, parent, table_hint_name, allow_none) - + if allow_none: return None diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index 8e8a0ac5a8..2f52365787 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -238,7 +238,7 @@ def list_failed_jobs(self, load_id: str) -> Sequence[str]: def list_jobs_for_table(self, load_id: str, table_name: str) -> Sequence[LoadJobInfo]: return [job for job in self.list_all_jobs(load_id) if job.job_file_info.table_name == table_name] - + def list_all_jobs(self, load_id: str) -> Sequence[LoadJobInfo]: info = self.get_load_package_info(load_id) return [job for job in flatten_list_or_items(iter(info.jobs.values()))] # type: ignore diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 1888354a34..e5fd71e066 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -375,13 +375,13 @@ def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> def _is_iceberg_table(self, table: TTableSchema) -> bool: table_format = get_table_format(self.schema.tables, table["name"]) return table_format == "iceberg" or self.config.force_iceberg - + def table_needs_staging_dataset(self, table: TTableSchema) -> bool: # all iceberg tables need staging if self._is_iceberg_table(table): return True return super().table_needs_staging_dataset(table) - + def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: table = super().get_load_table(table_name, staging) # if staging and table.get("table_format", None) == "iceberg": diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/filesystem/filesystem.py index 1902a926e3..5adf6c618d 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/filesystem/filesystem.py @@ -179,6 +179,6 @@ def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb def table_needs_staging_dataset(self, table: TTableSchema) -> bool: # not so nice, how to do it better, collect this info from the main destination as before? 
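# Standalone sketch of the hint inheritance used by get_table_format /
# get_write_disposition above: when a child table does not set a hint, the
# lookup walks up the `parent` chain. Tables are plain dicts in this sketch.
from typing import Any, Dict, Optional

def inherited_hint(tables: Dict[str, Dict[str, Any]], table_name: str, hint_name: str) -> Optional[Any]:
    table = tables.get(table_name, {})
    if table.get(hint_name) is not None:
        return table[hint_name]
    parent = table.get("parent")
    return inherited_hint(tables, parent, hint_name) if parent else None

schema_tables = {
    "items": {"name": "items", "write_disposition": "append", "table_format": "iceberg"},
    "items__sub_items": {"name": "items__sub_items", "parent": "items"},
}
assert inherited_hint(schema_tables, "items__sub_items", "table_format") == "iceberg"
assert inherited_hint(schema_tables, "items__sub_items", "write_disposition") == "append"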
- if table.get("table_format") == "iceberg" or self.config.force_iceberg: + if table.get("table_format") == "iceberg" or (self.config.force_iceberg is True): return True return super().table_needs_staging_dataset(table) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 2b0fd2d9b4..b036bdc4a6 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -233,6 +233,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True diff --git a/dlt/load/load.py b/dlt/load/load.py index 4a29db3282..73746b3684 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -260,27 +260,27 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) logger.info(f"All jobs completed, archiving package {load_id} with aborted set to {aborted}") @staticmethod - def _get_table_chain_tables_with_filter(schema: Schema, filter: Callable, tables_with_jobs: Iterable[str]) -> Set[str]: + def _get_table_chain_tables_with_filter(schema: Schema, f: Callable[[TTableSchema], bool], tables_with_jobs: Iterable[str]) -> Set[str]: """Get all jobs for tables with given write disposition and resolve the table chain""" result: Set[str] = set() for table_name in tables_with_jobs: top_job_table = get_top_level_table(schema.tables, table_name) - if not filter(top_job_table): + if not f(top_job_table): continue for table in get_child_tables(schema.tables, top_job_table["name"]): result.add(table["name"]) return result @staticmethod - def _init_client_and_update_schema(job_client: JobClientBase, expected_update: TSchemaTables, update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False) -> TSchemaTables: - staging_text = "for staging dataset" if staging_info else "" + def _init_client_and_update_schema(job_client: JobClientBase, expected_update: TSchemaTables, update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False) -> TSchemaTables: + staging_text = "for staging dataset" if staging_info else "" logger.info(f"Client for {job_client.config.destination_name} will start initialize storage {staging_text}") job_client.initialize_storage() logger.info(f"Client for {job_client.config.destination_name} will update schema to package schema {staging_text}") applied_update = job_client.update_stored_schema(only_tables=update_tables, expected_update=expected_update) logger.info(f"Client for {job_client.config.destination_name} will truncate tables {staging_text}") job_client.initialize_storage(truncate_tables=truncate_tables) - return applied_update + return applied_update def load_single_package(self, load_id: str, schema: Schema) -> None: # initialize analytical storage ie. 
create dataset required by passed schema @@ -302,9 +302,10 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: self._init_client_and_update_schema(job_client, expected_update, staging_tables | {schema.version_table_name}, staging_tables, staging_info=True) # only update tables that are present in the load package - if self.staging_destination: + if self.staging_destination and isinstance(job_client, WithStagingDataset): with self.get_staging_destination_client(schema) as staging_client: - truncate_tables = self._get_table_chain_tables_with_filter(schema, staging_client.table_needs_truncating, tables_with_jobs) + # truncate all the tables in staging that are requested by the job client (TODO: make this better...) + truncate_tables = self._get_table_chain_tables_with_filter(schema, job_client.table_needs_staging_dataset, tables_with_jobs) self._init_client_and_update_schema(staging_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) self.load_storage.commit_schema_update(load_id, applied_update) diff --git a/tests/load/utils.py b/tests/load/utils.py index 57c97c830c..813bd69ec3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -53,7 +53,7 @@ class DestinationTestConfiguration: staging_iam_role: Optional[str] = None extra_info: Optional[str] = None supports_merge: bool = True # TODO: take it from client base class - force_iceberg: bool = True + force_iceberg: bool = False @property def name(self) -> str: @@ -110,7 +110,7 @@ def destinations_configs( destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though destination_configs += [DestinationTestConfiguration(destination="athena", supports_merge=False, bucket_url=AWS_BUCKET)] - # destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, extra_info="iceberg")] + destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, extra_info="iceberg")] if default_vector_configs: # for now only weaviate @@ -157,7 +157,6 @@ def destinations_configs( # filter out excluded configs destination_configs = [conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS] - return destination_configs From ad8dc9b3bbc54b075aba7d3f61a9228411d65a45 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 12 Oct 2023 18:15:11 +0200 Subject: [PATCH 26/36] update --- dlt/common/destination/reference.py | 13 ++++- dlt/destinations/athena/athena.py | 19 +++++-- dlt/destinations/bigquery/bigquery.py | 4 +- dlt/destinations/filesystem/filesystem.py | 7 +-- dlt/destinations/job_client_impl.py | 4 +- dlt/destinations/redshift/redshift.py | 4 +- dlt/destinations/snowflake/snowflake.py | 8 +-- dlt/load/load.py | 67 ++++++++++++++--------- tests/load/test_job_client.py | 6 +- 9 files changed, 82 insertions(+), 50 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index b3d54571b0..a4567a5b3e 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -247,7 +247,7 @@ def restore_file_load(self, file_path: str) -> LoadJob: """Finds and restores already started loading job identified by 
`file_path` if destination supports it.""" pass - def table_needs_truncating(self, table: TTableSchema) -> bool: + def should_truncate_table_before_load(self, table: TTableSchema) -> bool: return table["write_disposition"] == "replace" def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: @@ -324,7 +324,7 @@ class WithStagingDataset(ABC): """Adds capability to use staging dataset and request it from the loader""" @abstractmethod - def table_needs_staging_dataset(self, table: TTableSchema) -> bool: + def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: return False @abstractmethod @@ -332,6 +332,15 @@ def with_staging_dataset(self)-> ContextManager["JobClientBase"]: """Executes job client methods on staging dataset""" return self # type: ignore +class SupportsStagingDestination(): + """Adds capability to support a staging destination for the load""" + + def should_load_data_to_staging_dataset_on_staging_destination(self, table: TTableSchema) -> bool: + return False + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + # the default is to truncate the tables on the staging destination... + return True TDestinationReferenceArg = Union["DestinationReference", ModuleType, None, str] diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index e5fd71e066..3b09f47488 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -20,7 +20,7 @@ from dlt.common.schema.utils import table_schema_has_type, get_table_format from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import LoadJob, FollowupJob -from dlt.common.destination.reference import TLoadJobState, NewLoadJob +from dlt.common.destination.reference import TLoadJobState, NewLoadJob, SupportsStagingDestination from dlt.common.storages import FileStorage from dlt.common.data_writers.escape import escape_bigquery_identifier from dlt.destinations.sql_jobs import SqlStagingCopyJob @@ -286,7 +286,7 @@ def has_dataset(self) -> bool: return len(rows) > 0 -class AthenaClient(SqlJobClientWithStaging): +class AthenaClient(SqlJobClientWithStaging, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -376,11 +376,22 @@ def _is_iceberg_table(self, table: TTableSchema) -> bool: table_format = get_table_format(self.schema.tables, table["name"]) return table_format == "iceberg" or self.config.force_iceberg - def table_needs_staging_dataset(self, table: TTableSchema) -> bool: + def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: # all iceberg tables need staging if self._is_iceberg_table(table): return True - return super().table_needs_staging_dataset(table) + return super().should_load_data_to_staging_dataset(table) + + def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: + # on athena we only truncate replace tables that are not iceberg + table = self.get_load_table(table["name"]) + if table["write_disposition"] == "replace" and not self._is_iceberg_table(table): + return True + return False + + def should_load_data_to_staging_dataset_on_staging_destination(self, table: TTableSchema) -> bool: + """iceberg table data goes into staging on staging destination""" + return self._is_iceberg_table(table) def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: table = 
super().get_load_table(table_name, staging) diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/bigquery/bigquery.py index eceb2ed57a..f1478697fc 100644 --- a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/bigquery/bigquery.py @@ -7,7 +7,7 @@ from dlt.common import json, logger from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import FollowupJob, NewLoadJob, TLoadJobState, LoadJob +from dlt.common.destination.reference import FollowupJob, NewLoadJob, TLoadJobState, LoadJob, SupportsStagingDestination from dlt.common.data_types import TDataType from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns @@ -151,7 +151,7 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient sql.append(f"CREATE TABLE {table_name} CLONE {staging_table_name};") return sql -class BigQueryClient(SqlJobClientWithStaging): +class BigQueryClient(SqlJobClientWithStaging, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/filesystem/filesystem.py index 5adf6c618d..ecadb74f8e 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/filesystem/filesystem.py @@ -177,8 +177,5 @@ def __enter__(self) -> "FilesystemClient": def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: pass - def table_needs_staging_dataset(self, table: TTableSchema) -> bool: - # not so nice, how to do it better, collect this info from the main destination as before? - if table.get("table_format") == "iceberg" or (self.config.force_iceberg is True): - return True - return super().table_needs_staging_dataset(table) + def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: + return False diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index fa657db39e..8b26ac06ee 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -140,7 +140,7 @@ def maybe_ddl_transaction(self) -> Iterator[None]: else: yield - def table_needs_truncating(self, table: TTableSchema) -> bool: + def should_truncate_table_before_load(self, table: TTableSchema) -> bool: return table["write_disposition"] == "replace" and self.config.replace_strategy == "truncate-and-insert" def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: @@ -440,7 +440,7 @@ def with_staging_dataset(self)-> Iterator["SqlJobClientBase"]: finally: self.in_staging_mode = False - def table_needs_staging_dataset(self, table: TTableSchema) -> bool: + def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: if table["write_disposition"] == "merge": return True elif table["write_disposition"] == "replace" and (self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]): diff --git a/dlt/destinations/redshift/redshift.py b/dlt/destinations/redshift/redshift.py index f210d757c1..77ef05de74 100644 --- a/dlt/destinations/redshift/redshift.py +++ b/dlt/destinations/redshift/redshift.py @@ -14,7 +14,7 @@ from typing import ClassVar, Dict, List, Optional, Sequence, Any from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import NewLoadJob, CredentialsConfiguration +from dlt.common.destination.reference import NewLoadJob, 
CredentialsConfiguration, SupportsStagingDestination from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.schema.typing import TTableSchema, TColumnType @@ -187,7 +187,7 @@ def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: st return SqlMergeJob.gen_key_table_clauses(root_table_name, staging_root_table_name, key_clauses, for_delete) -class RedshiftClient(InsertValuesJobClient): +class RedshiftClient(InsertValuesJobClient, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() diff --git a/dlt/destinations/snowflake/snowflake.py b/dlt/destinations/snowflake/snowflake.py index 58d81a602d..5aa721dd03 100644 --- a/dlt/destinations/snowflake/snowflake.py +++ b/dlt/destinations/snowflake/snowflake.py @@ -1,9 +1,9 @@ -from typing import ClassVar, Dict, Optional, Sequence, Tuple, List, Any +from typing import ClassVar, Optional, Sequence, Tuple, List, Any from urllib.parse import urlparse, urlunparse from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import FollowupJob, NewLoadJob, TLoadJobState, LoadJob, CredentialsConfiguration -from dlt.common.configuration.specs import AwsCredentialsWithoutDefaults, AzureCredentials, AzureCredentialsWithoutDefaults +from dlt.common.destination.reference import FollowupJob, NewLoadJob, TLoadJobState, LoadJob, CredentialsConfiguration, SupportsStagingDestination +from dlt.common.configuration.specs import AwsCredentialsWithoutDefaults, AzureCredentialsWithoutDefaults from dlt.common.data_types import TDataType from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns @@ -169,7 +169,7 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient return sql -class SnowflakeClient(SqlJobClientWithStaging): +class SnowflakeClient(SqlJobClientWithStaging, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: SnowflakeClientConfiguration) -> None: diff --git a/dlt/load/load.py b/dlt/load/load.py index 73746b3684..1248274a8f 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -20,7 +20,7 @@ from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TTableSchema, TWriteDisposition from dlt.common.storages import LoadStorage -from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, DestinationReference, LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration +from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, DestinationReference, LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination from dlt.destinations.job_impl import EmptyLoadJob @@ -78,9 +78,9 @@ def is_staging_destination_job(self, file_path: str) -> bool: return self.staging_destination is not None and os.path.splitext(file_path)[1][1:] in self.staging_destination.capabilities().supported_loader_file_formats @contextlib.contextmanager - def maybe_with_staging_dataset(self, job_client: JobClientBase, table: TTableSchema) -> Iterator[None]: + def maybe_with_staging_dataset(self, job_client: JobClientBase, use_staging: bool) -> Iterator[None]: """Executes job client methods in context of staging dataset if 
`table` has `write_disposition` that requires it""" - if isinstance(job_client, WithStagingDataset) and job_client.table_needs_staging_dataset(table): + if isinstance(job_client, WithStagingDataset) and use_staging: with job_client.with_staging_dataset(): yield else: @@ -91,18 +91,26 @@ def maybe_with_staging_dataset(self, job_client: JobClientBase, table: TTableSch def w_spool_job(self: "Load", file_path: str, load_id: str, schema: Schema) -> Optional[LoadJob]: job: LoadJob = None try: + is_staging_destination_job = self.is_staging_destination_job(file_path) + job_client = self.get_destination_client(schema) + # if we have a staging destination and the file is not a reference, send to staging - job_client = self.get_staging_destination_client(schema) if self.is_staging_destination_job(file_path) else self.get_destination_client(schema) - with job_client as job_client: + with (self.get_staging_destination_client(schema) if is_staging_destination_job else job_client) as client: job_info = self.load_storage.parse_job_file_name(file_path) if job_info.file_format not in self.load_storage.supported_file_formats: raise LoadClientUnsupportedFileFormats(job_info.file_format, self.capabilities.supported_loader_file_formats, file_path) logger.info(f"Will load file {file_path} with table name {job_info.table_name}") - table = job_client.get_load_table(job_info.table_name) + table = client.get_load_table(job_info.table_name) if table["write_disposition"] not in ["append", "replace", "merge"]: raise LoadClientUnsupportedWriteDisposition(job_info.table_name, table["write_disposition"], file_path) - with self.maybe_with_staging_dataset(job_client, table): - job = job_client.start_file_load(table, self.load_storage.storage.make_full_path(file_path), load_id) + + if is_staging_destination_job: + use_staging_dataset = isinstance(job_client, SupportsStagingDestination) and job_client.should_load_data_to_staging_dataset_on_staging_destination(table) + else: + use_staging_dataset = isinstance(job_client, WithStagingDataset) and job_client.should_load_data_to_staging_dataset(table) + + with self.maybe_with_staging_dataset(client, use_staging_dataset): + job = client.start_file_load(table, self.load_storage.storage.make_full_path(file_path), load_id) except (DestinationTerminalException, TerminalValueError): # if job irreversibly cannot be started, mark it as failed logger.exception(f"Terminal problem when adding job {file_path}") @@ -272,7 +280,7 @@ def _get_table_chain_tables_with_filter(schema: Schema, f: Callable[[TTableSchem return result @staticmethod - def _init_client_and_update_schema(job_client: JobClientBase, expected_update: TSchemaTables, update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False) -> TSchemaTables: + def _init_dataset_and_update_schema(job_client: JobClientBase, expected_update: TSchemaTables, update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False) -> TSchemaTables: staging_text = "for staging dataset" if staging_info else "" logger.info(f"Client for {job_client.config.destination_name} will start initialize storage {staging_text}") job_client.initialize_storage() @@ -282,31 +290,38 @@ def _init_client_and_update_schema(job_client: JobClientBase, expected_update: T job_client.initialize_storage(truncate_tables=truncate_tables) return applied_update + + def _init_client(self, job_client: JobClientBase, schema: Schema, expected_update: TSchemaTables, load_id: str, truncate_filter: Callable[[TTableSchema], 
bool], truncate_staging_filter: Callable[[TTableSchema], bool]) -> TSchemaTables: + + tables_with_jobs = set(job.table_name for job in self.get_new_jobs_info(load_id)) + dlt_tables = set(t["name"] for t in schema.dlt_tables()) + + # update the default dataset + truncate_tables = self._get_table_chain_tables_with_filter(schema, truncate_filter, tables_with_jobs) + applied_update = self._init_dataset_and_update_schema(job_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) + + # update the staging dataset if client supports this + if isinstance(job_client, WithStagingDataset): + if staging_tables := self._get_table_chain_tables_with_filter(schema, truncate_staging_filter, tables_with_jobs): + with job_client.with_staging_dataset(): + self._init_dataset_and_update_schema(job_client, expected_update, staging_tables | {schema.version_table_name}, staging_tables, staging_info=True) + + return applied_update + + def load_single_package(self, load_id: str, schema: Schema) -> None: # initialize analytical storage ie. create dataset required by passed schema with self.get_destination_client(schema) as job_client: if (expected_update := self.load_storage.begin_schema_update(load_id)) is not None: - tables_with_jobs = set(job.table_name for job in self.get_new_jobs_info(load_id)) - dlt_tables = set(t["name"] for t in schema.dlt_tables()) - - # update the default dataset - truncate_tables = self._get_table_chain_tables_with_filter(schema, job_client.table_needs_truncating, tables_with_jobs) - applied_update = self._init_client_and_update_schema(job_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) - - # update the staging dataset if client supports this - if isinstance(job_client, WithStagingDataset): - if staging_tables := self._get_table_chain_tables_with_filter(schema, job_client.table_needs_staging_dataset, tables_with_jobs): - with job_client.with_staging_dataset(): - self._init_client_and_update_schema(job_client, expected_update, staging_tables | {schema.version_table_name}, staging_tables, staging_info=True) + # init job client + applied_update = self._init_client(job_client, schema, expected_update, load_id, job_client.should_truncate_table_before_load, job_client.should_load_data_to_staging_dataset if isinstance(job_client, WithStagingDataset) else None) - # only update tables that are present in the load package - if self.staging_destination and isinstance(job_client, WithStagingDataset): + # init staging client + if self.staging_destination and isinstance(job_client, SupportsStagingDestination): with self.get_staging_destination_client(schema) as staging_client: - # truncate all the tables in staging that are requested by the job client (TODO: make this better...) 
- truncate_tables = self._get_table_chain_tables_with_filter(schema, job_client.table_needs_staging_dataset, tables_with_jobs) - self._init_client_and_update_schema(staging_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) + self._init_client(staging_client, schema, expected_update, load_id, job_client.should_truncate_table_before_load_on_staging_destination, job_client.should_load_data_to_staging_dataset_on_staging_destination) self.load_storage.commit_schema_update(load_id, applied_update) diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 918eac67a8..f18eb738c5 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -428,14 +428,14 @@ def test_load_with_all_types(client: SqlJobClientBase, write_disposition: TWrite client.schema.bump_version() client.update_stored_schema() - if client.table_needs_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] + if client.should_load_data_to_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] with client.with_staging_dataset(): # type: ignore[attr-defined] # create staging for merge dataset client.initialize_storage() client.update_stored_schema() with client.sql_client.with_staging_dataset( - client.table_needs_staging_dataset(client.schema.tables[table_name]) # type: ignore[attr-defined] + client.should_load_data_to_staging_dataset(client.schema.tables[table_name]) # type: ignore[attr-defined] ): canonical_name = client.sql_client.make_qualified_table_name(table_name) # write row @@ -493,7 +493,7 @@ def test_write_dispositions(client: SqlJobClientBase, write_disposition: TWriteD with io.BytesIO() as f: write_dataset(client, f, [table_row], TABLE_UPDATE_COLUMNS_SCHEMA) query = f.getvalue().decode() - if client.table_needs_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] + if client.should_load_data_to_staging_dataset(client.schema.tables[table_name]): # type: ignore[attr-defined] # load to staging dataset on merge with client.with_staging_dataset(): # type: ignore[attr-defined] expect_load_file(client, file_storage, query, t) From 00e474ce9ec8d1a89b1f021d46027d44a9c4d58a Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 13 Oct 2023 08:48:46 +0200 Subject: [PATCH 27/36] fix some tests --- dlt/destinations/athena/athena.py | 4 ++++ dlt/load/load.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 3b09f47488..c78b9ba2d7 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -371,6 +371,10 @@ def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> if self._is_iceberg_table(table_chain[0]): return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})] return super()._create_replace_followup_jobs(table_chain) + + def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + # fall back to append jobs for merge + return self._create_append_followup_jobs(table_chain) def _is_iceberg_table(self, table: TTableSchema) -> bool: table_format = get_table_format(self.schema.tables, table["name"]) diff --git a/dlt/load/load.py b/dlt/load/load.py index 1248274a8f..b32156ae6e 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -276,6 +276,11 @@ def _get_table_chain_tables_with_filter(schema: Schema, f: Callable[[TTableSchem if not f(top_job_table): continue for table in 
get_child_tables(schema.tables, top_job_table["name"]): + # only add tables for tables that have jobs unless the disposition is replace + # TODO: this is a (formerly used) hack to make test_merge_on_keys_in_schema, + # we should change that test + if not table["name"] in tables_with_jobs and top_job_table["write_disposition"] != "replace": + continue result.add(table["name"]) return result From acfcd166b93eed8ff12e80e8bbc3dc91816cb562 Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 13 Oct 2023 11:22:04 +0200 Subject: [PATCH 28/36] small changes --- dlt/destinations/athena/athena.py | 23 +++++++++++--------- dlt/destinations/filesystem/configuration.py | 1 - dlt/load/load.py | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index c78b9ba2d7..9aa0493a4e 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -325,7 +325,8 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries # or if we are in iceberg mode, we create iceberg tables for all tables - is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or (self._is_iceberg_table(self.schema.tables[table_name]) and not self.in_staging_mode) + table = self.get_load_table(table_name, self.in_staging_mode) + is_iceberg = self._is_iceberg_table(table) or table.get("write_disposition", None) == "skip" columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) # this will fail if the table prefix is not properly defined @@ -359,7 +360,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> ) job = super().start_file_load(table, file_path, load_id) if not job: - job = DoNothingFollowupJob(file_path) if self._is_iceberg_table(table) else DoNothingJob(file_path) + job = DoNothingFollowupJob(file_path) if self._is_iceberg_table(self.get_load_table(table["name"])) else DoNothingJob(file_path) return job def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: @@ -371,36 +372,38 @@ def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> if self._is_iceberg_table(table_chain[0]): return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})] return super()._create_replace_followup_jobs(table_chain) - + def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: # fall back to append jobs for merge return self._create_append_followup_jobs(table_chain) def _is_iceberg_table(self, table: TTableSchema) -> bool: - table_format = get_table_format(self.schema.tables, table["name"]) - return table_format == "iceberg" or self.config.force_iceberg + table_format = table.get("table_format") + return table_format == "iceberg" def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: # all iceberg tables need staging - if self._is_iceberg_table(table): + if self._is_iceberg_table(self.get_load_table(table["name"])): return True return super().should_load_data_to_staging_dataset(table) def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: # on athena we only truncate replace tables that are not iceberg table = self.get_load_table(table["name"]) - if table["write_disposition"] == "replace" and not self._is_iceberg_table(table): + if 
table["write_disposition"] == "replace" and not self._is_iceberg_table(self.get_load_table(table["name"])): return True return False def should_load_data_to_staging_dataset_on_staging_destination(self, table: TTableSchema) -> bool: """iceberg table data goes into staging on staging destination""" - return self._is_iceberg_table(table) + return self._is_iceberg_table(self.get_load_table(table["name"])) def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: table = super().get_load_table(table_name, staging) - # if staging and table.get("table_format", None) == "iceberg": - # table.pop("table_format") + if staging and table.get("table_format", None) == "iceberg": + table.pop("table_format") + elif self.config.force_iceberg: + table["table_format"] = "iceberg" return table @staticmethod diff --git a/dlt/destinations/filesystem/configuration.py b/dlt/destinations/filesystem/configuration.py index 8939060231..174dfafb1a 100644 --- a/dlt/destinations/filesystem/configuration.py +++ b/dlt/destinations/filesystem/configuration.py @@ -10,7 +10,6 @@ @configspec class FilesystemDestinationClientConfiguration(FilesystemConfiguration, DestinationClientStagingConfiguration): # type: ignore[misc] destination_name: Final[str] = "filesystem" # type: ignore - force_iceberg: Optional[bool] = False @resolve_type('credentials') def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: diff --git a/dlt/load/load.py b/dlt/load/load.py index b32156ae6e..ca8fff66df 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -277,7 +277,7 @@ def _get_table_chain_tables_with_filter(schema: Schema, f: Callable[[TTableSchem continue for table in get_child_tables(schema.tables, top_job_table["name"]): # only add tables for tables that have jobs unless the disposition is replace - # TODO: this is a (formerly used) hack to make test_merge_on_keys_in_schema, + # TODO: this is a (formerly used) hack to make test_merge_on_keys_in_schema, # we should change that test if not table["name"] in tables_with_jobs and top_job_table["write_disposition"] != "replace": continue From 07076293bec0aa8f48dc3a31d2f7bb49859de3be Mon Sep 17 00:00:00 2001 From: Dave Date: Fri, 13 Oct 2023 15:58:25 +0200 Subject: [PATCH 29/36] small changes --- dlt/destinations/athena/athena.py | 19 +++++++++---------- dlt/destinations/job_client_impl.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 9aa0493a4e..6e032a5acf 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -69,18 +69,17 @@ class AthenaTypeMapper(TypeMapper): "int": "bigint", } - def __init__(self, capabilities: DestinationCapabilitiesContext, iceberg_mode: bool): + def __init__(self, capabilities: DestinationCapabilitiesContext): super().__init__(capabilities) - self.iceberg_mode = iceberg_mode def to_db_integer_type(self, precision: Optional[int]) -> str: if precision is None: return "bigint" - # iceberg does not support smallint and tinyint + # TODO: iceberg does not support smallint and tinyint if precision <= 8: - return "int" if self.iceberg_mode else "tinyint" + return "int" elif precision <= 16: - return "int" if self.iceberg_mode else "smallint" + return "int" elif precision <= 32: return "int" return "bigint" @@ -303,7 +302,7 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: super().__init__(schema, config, sql_client) self.sql_client: AthenaSQLClient = sql_client # type: ignore 
self.config: AthenaClientConfiguration = config - self.type_mapper = AthenaTypeMapper(self.capabilities, True) + self.type_mapper = AthenaTypeMapper(self.capabilities) def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # only truncate tables in iceberg mode @@ -364,12 +363,12 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> return job def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: - if self._is_iceberg_table(table_chain[0]): + if self._is_iceberg_table(self.get_load_table(table_chain[0]["name"])): return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": False})] return super()._create_append_followup_jobs(table_chain) def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: - if self._is_iceberg_table(table_chain[0]): + if self._is_iceberg_table(self.get_load_table(table_chain[0]["name"])): return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})] return super()._create_replace_followup_jobs(table_chain) @@ -400,10 +399,10 @@ def should_load_data_to_staging_dataset_on_staging_destination(self, table: TTab def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: table = super().get_load_table(table_name, staging) + if self.config.force_iceberg: + table["table_format"] ="iceberg" if staging and table.get("table_format", None) == "iceberg": table.pop("table_format") - elif self.config.force_iceberg: - table["table_format"] = "iceberg" return table @staticmethod diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 8b26ac06ee..cfde6625d5 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -311,7 +311,7 @@ def _build_schema_update_sql(self, only_tables: Iterable[str]) -> Tuple[List[str sql += ";" sql_updates.append(sql) # create a schema update for particular table - partial_table = copy(self.schema.get_table(table_name)) + partial_table = copy(self.get_load_table(table_name)) # keep only new columns partial_table["columns"] = {c["name"]: c for c in new_columns} schema_update[table_name] = partial_table From 4692e373f28fb95529633bb120536554d5780013 Mon Sep 17 00:00:00 2001 From: Dave Date: Sat, 14 Oct 2023 10:25:57 +0200 Subject: [PATCH 30/36] make type mapper table format sensitive --- dlt/destinations/athena/athena.py | 15 +++++++-------- dlt/destinations/bigquery/bigquery.py | 6 +++--- dlt/destinations/duckdb/duck.py | 6 +++--- dlt/destinations/job_client_impl.py | 13 +++++++------ dlt/destinations/mssql/mssql.py | 10 +++++----- dlt/destinations/postgres/postgres.py | 6 +++--- dlt/destinations/redshift/redshift.py | 6 +++--- dlt/destinations/snowflake/snowflake.py | 8 ++++---- dlt/destinations/type_mapping.py | 8 ++++---- 9 files changed, 39 insertions(+), 39 deletions(-) diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py index 6e032a5acf..44d020c127 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/athena/athena.py @@ -16,7 +16,7 @@ from dlt.common.utils import without_none from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, Schema, TSchemaTables, TTableSchema -from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition +from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition, TTableFormat from dlt.common.schema.utils import 
table_schema_has_type, get_table_format from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import LoadJob, FollowupJob @@ -72,14 +72,13 @@ class AthenaTypeMapper(TypeMapper): def __init__(self, capabilities: DestinationCapabilitiesContext): super().__init__(capabilities) - def to_db_integer_type(self, precision: Optional[int]) -> str: + def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: if precision is None: return "bigint" - # TODO: iceberg does not support smallint and tinyint if precision <= 8: - return "int" + return "int" if table_format == "iceberg" else "tinyint" elif precision <= 16: - return "int" + return "int" if table_format == "iceberg" else "smallint" elif precision <= 32: return "int" return "bigint" @@ -312,8 +311,8 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: def _from_db_type(self, hive_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: return self.type_mapper.from_db_type(hive_t, precision, scale) - def _get_column_def_sql(self, c: TColumnSchema) -> str: - return f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c)}" + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: + return f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}" def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]: @@ -326,7 +325,7 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # or if we are in iceberg mode, we create iceberg tables for all tables table = self.get_load_table(table_name, self.in_staging_mode) is_iceberg = self._is_iceberg_table(table) or table.get("write_disposition", None) == "skip" - columns = ", ".join([self._get_column_def_sql(c) for c in new_columns]) + columns = ", ".join([self._get_column_def_sql(c, table.get("table_format")) for c in new_columns]) # this will fail if the table prefix is not properly defined table_prefix = self.table_prefix_layout.format(table_name=table_name) diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/bigquery/bigquery.py index f1478697fc..4cb467a7af 100644 --- a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/bigquery/bigquery.py @@ -11,7 +11,7 @@ from dlt.common.data_types import TDataType from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns -from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.common.schema.exceptions import UnknownTableException from dlt.destinations.job_client_impl import SqlJobClientWithStaging @@ -250,9 +250,9 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc return sql - def _get_column_def_sql(self, c: TColumnSchema) -> str: + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: name = self.capabilities.escape_identifier(c["name"]) - return f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" + return f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}" def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: schema_table: TTableSchemaColumns = {} diff --git 
a/dlt/destinations/duckdb/duck.py b/dlt/destinations/duckdb/duck.py index c40abd56a0..fe4ebac37e 100644 --- a/dlt/destinations/duckdb/duck.py +++ b/dlt/destinations/duckdb/duck.py @@ -5,7 +5,7 @@ from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.destination.reference import LoadJob, FollowupJob, TLoadJobState -from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.common.storages.file_storage import FileStorage from dlt.common.utils import maybe_context @@ -65,7 +65,7 @@ class DuckDbTypeMapper(TypeMapper): "HUGEINT": "bigint", } - def to_db_integer_type(self, precision: Optional[int]) -> str: + def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: if precision is None: return "BIGINT" # Precision is number of bits @@ -141,7 +141,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> job = DuckDbCopyJob(table["name"], file_path, self.sql_client) return job - def _get_column_def_sql(self, c: TColumnSchema) -> str: + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: hints_str = " ".join(self.active_hints.get(h, "") for h in self.active_hints.keys() if c.get(h, False) is True) column_name = self.capabilities.escape_identifier(c["name"]) return f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index cfde6625d5..0124a277d3 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -12,7 +12,7 @@ from dlt.common import json, pendulum, logger from dlt.common.data_types import TDataType -from dlt.common.schema.typing import COLUMN_HINTS, TColumnType, TColumnSchemaBase, TTableSchema, TWriteDisposition +from dlt.common.schema.typing import COLUMN_HINTS, TColumnType, TColumnSchemaBase, TTableSchema, TWriteDisposition, TTableFormat from dlt.common.storages import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables from dlt.common.destination.reference import StateInfo, StorageSchemaInfo,WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, DestinationClientDwhWithStagingConfiguration, NewLoadJob, WithStagingDataset, TLoadJobState, LoadJob, JobClientBase, FollowupJob, CredentialsConfiguration @@ -318,23 +318,24 @@ def _build_schema_update_sql(self, only_tables: Iterable[str]) -> Tuple[List[str return sql_updates, schema_update - def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema]) -> List[str]: + def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None) -> List[str]: """Make one or more ADD COLUMN sql clauses to be joined in ALTER TABLE statement(s)""" - return [f"ADD COLUMN {self._get_column_def_sql(c)}" for c in new_columns] + return [f"ADD COLUMN {self._get_column_def_sql(c, table_format)}" for c in new_columns] def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]: # build sql canonical_name = self.sql_client.make_qualified_table_name(table_name) + table = self.get_load_table(table_name) sql_result: List[str] = [] if not generate_alter: # build CREATE sql = f"CREATE TABLE {canonical_name} (\n" - sql += ",\n".join([self._get_column_def_sql(c) 
for c in new_columns]) + sql += ",\n".join([self._get_column_def_sql(c, table.get("table_format")) for c in new_columns]) sql += ")" sql_result.append(sql) else: sql_base = f"ALTER TABLE {canonical_name}\n" - add_column_statements = self._make_add_column_sql(new_columns) + add_column_statements = self._make_add_column_sql(new_columns, table.get("table_format")) if self.capabilities.alter_add_multi_column: column_sql = ",\n" sql_result.append(sql_base + column_sql.join(add_column_statements)) @@ -357,7 +358,7 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc return sql_result @abstractmethod - def _get_column_def_sql(self, c: TColumnSchema) -> str: + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: pass @staticmethod diff --git a/dlt/destinations/mssql/mssql.py b/dlt/destinations/mssql/mssql.py index c06ddeadbd..cd999441ff 100644 --- a/dlt/destinations/mssql/mssql.py +++ b/dlt/destinations/mssql/mssql.py @@ -5,7 +5,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, TColumnHint, Schema -from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.common.utils import uniq_id from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlMergeJob, SqlJobParams @@ -62,7 +62,7 @@ class MsSqlTypeMapper(TypeMapper): "int": "bigint", } - def to_db_integer_type(self, precision: Optional[int]) -> str: + def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: if precision is None: return "bigint" if precision <= 8: @@ -136,11 +136,11 @@ def __init__(self, schema: Schema, config: MsSqlClientConfiguration) -> None: def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [MsSqlMergeJob.from_table_chain(table_chain, self.sql_client)] - def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema]) -> List[str]: + def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None) -> List[str]: # Override because mssql requires multiple columns in a single ADD COLUMN clause - return ["ADD \n" + ",\n".join(self._get_column_def_sql(c) for c in new_columns)] + return ["ADD \n" + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns)] - def _get_column_def_sql(self, c: TColumnSchema) -> str: + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: sc_type = c["data_type"] if sc_type == "text" and c.get("unique"): # MSSQL does not allow index on large TEXT columns diff --git a/dlt/destinations/postgres/postgres.py b/dlt/destinations/postgres/postgres.py index 72837b42b3..2812d1d4c4 100644 --- a/dlt/destinations/postgres/postgres.py +++ b/dlt/destinations/postgres/postgres.py @@ -5,7 +5,7 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, TColumnHint, Schema -from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams @@ -59,7 +59,7 @@ class PostgresTypeMapper(TypeMapper): "integer": "bigint", } - def to_db_integer_type(self, precision: Optional[int]) -> str: + def to_db_integer_type(self, precision: 
Optional[int], table_format: TTableFormat = None) -> str: if precision is None: return "bigint" # Precision is number of bits @@ -109,7 +109,7 @@ def __init__(self, schema: Schema, config: PostgresClientConfiguration) -> None: self.active_hints = HINT_TO_POSTGRES_ATTR if self.config.create_indexes else {} self.type_mapper = PostgresTypeMapper(self.capabilities) - def _get_column_def_sql(self, c: TColumnSchema) -> str: + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: hints_str = " ".join(self.active_hints.get(h, "") for h in self.active_hints.keys() if c.get(h, False) is True) column_name = self.capabilities.escape_identifier(c["name"]) return f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" diff --git a/dlt/destinations/redshift/redshift.py b/dlt/destinations/redshift/redshift.py index 77ef05de74..888f27ae7c 100644 --- a/dlt/destinations/redshift/redshift.py +++ b/dlt/destinations/redshift/redshift.py @@ -17,7 +17,7 @@ from dlt.common.destination.reference import NewLoadJob, CredentialsConfiguration, SupportsStagingDestination from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, TColumnHint, Schema -from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.common.configuration.specs import AwsCredentialsWithoutDefaults from dlt.destinations.insert_job_client import InsertValuesJobClient @@ -76,7 +76,7 @@ class RedshiftTypeMapper(TypeMapper): "integer": "bigint", } - def to_db_integer_type(self, precision: Optional[int]) -> str: + def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: if precision is None: return "bigint" if precision <= 16: @@ -204,7 +204,7 @@ def __init__(self, schema: Schema, config: RedshiftClientConfiguration) -> None: def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)] - def _get_column_def_sql(self, c: TColumnSchema) -> str: + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: hints_str = " ".join(HINT_TO_REDSHIFT_ATTR.get(h, "") for h in HINT_TO_REDSHIFT_ATTR.keys() if c.get(h, False) is True) column_name = self.capabilities.escape_identifier(c["name"]) return f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" diff --git a/dlt/destinations/snowflake/snowflake.py b/dlt/destinations/snowflake/snowflake.py index 5aa721dd03..f433ec7e7d 100644 --- a/dlt/destinations/snowflake/snowflake.py +++ b/dlt/destinations/snowflake/snowflake.py @@ -7,7 +7,7 @@ from dlt.common.data_types import TDataType from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns -from dlt.common.schema.typing import TTableSchema, TColumnType +from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat from dlt.destinations.job_client_impl import SqlJobClientWithStaging @@ -200,9 +200,9 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> def restore_file_load(self, file_path: str) -> LoadJob: return EmptyLoadJob.from_file_path(file_path, "completed") - def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema]) -> List[str]: + def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema], 
table_format: TTableFormat = None) -> List[str]: # Override because snowflake requires multiple columns in a single ADD COLUMN clause - return ["ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c) for c in new_columns)] + return ["ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns)] def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: if self.config.replace_strategy == "staging-optimized": @@ -222,7 +222,7 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc def _from_db_type(self, bq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: return self.type_mapper.from_db_type(bq_t, precision, scale) - def _get_column_def_sql(self, c: TColumnSchema) -> str: + def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: name = self.capabilities.escape_identifier(c["name"]) return f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" diff --git a/dlt/destinations/type_mapping.py b/dlt/destinations/type_mapping.py index dcbb1a4261..3ddfee5904 100644 --- a/dlt/destinations/type_mapping.py +++ b/dlt/destinations/type_mapping.py @@ -1,6 +1,6 @@ from typing import Tuple, ClassVar, Dict, Optional -from dlt.common.schema.typing import TColumnSchema, TDataType, TColumnType +from dlt.common.schema.typing import TColumnSchema, TDataType, TColumnType, TTableFormat from dlt.common.destination.capabilities import DestinationCapabilitiesContext from dlt.common.utils import without_none @@ -20,15 +20,15 @@ class TypeMapper: def __init__(self, capabilities: DestinationCapabilitiesContext) -> None: self.capabilities = capabilities - def to_db_integer_type(self, precision: Optional[int]) -> str: + def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: # Override in subclass if db supports other integer types (e.g. smallint, integer, tinyint, etc.) 
return self.sct_to_unbound_dbt["bigint"] - def to_db_type(self, column: TColumnSchema) -> str: + def to_db_type(self, column: TColumnSchema, table_format: TTableFormat = None) -> str: precision, scale = column.get("precision"), column.get("scale") sc_t = column["data_type"] if sc_t == "bigint": - return self.to_db_integer_type(precision) + return self.to_db_integer_type(precision, table_format) bounded_template = self.sct_to_dbt.get(sc_t) if not bounded_template: return self.sct_to_unbound_dbt[sc_t] From 10131e49d39f501899591372a3a80e9d8a9f0692 Mon Sep 17 00:00:00 2001 From: Dave Date: Sat, 14 Oct 2023 10:34:43 +0200 Subject: [PATCH 31/36] disable dbt tests for athena iceberg --- tests/load/pipeline/test_dbt_helper.py | 4 ++++ tests/load/utils.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index e55f5b2964..d6e9167555 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -58,6 +58,8 @@ def test_run_jaffle_package(destination_config: DestinationTestConfiguration, db @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: from docs.examples.chess.chess import chess + if not destination_config.supports_dbt: + pytest.mark.skip("dbt is not supported for this destination configuration") # provide chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" @@ -95,6 +97,8 @@ def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_ven @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) def test_run_chess_dbt_to_other_dataset(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: from docs.examples.chess.chess import chess + if not destination_config.supports_dbt: + pytest.mark.skip("dbt is not supported for this destination configuration") # provide chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" diff --git a/tests/load/utils.py b/tests/load/utils.py index 813bd69ec3..b03e325cc0 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -54,6 +54,7 @@ class DestinationTestConfiguration: extra_info: Optional[str] = None supports_merge: bool = True # TODO: take it from client base class force_iceberg: bool = False + supports_dbt: bool = True @property def name(self) -> str: @@ -110,7 +111,7 @@ def destinations_configs( destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though destination_configs += [DestinationTestConfiguration(destination="athena", supports_merge=False, bucket_url=AWS_BUCKET)] - destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, extra_info="iceberg")] + destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, supports_dbt=False, extra_info="iceberg")] if default_vector_configs: # for now only weaviate From 243246e280ad8d619250981311e770ba6058eaaf Mon Sep 17 00:00:00 2001 From: Dave 
Date: Sat, 14 Oct 2023 10:57:57 +0200
Subject: [PATCH 32/36] update doc

---
 .../docs/dlt-ecosystem/destinations/athena.md | 24 +++++++++++++++++--
 tests/utils.py                                |  2 --
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md
index 74771ba74f..4a24122220 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/athena.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md
@@ -6,7 +6,7 @@ keywords: [aws, athena, glue catalog]

 # AWS Athena / Glue Catalog

-The athena destination stores data as parquet files in s3 buckets and creates [external tables in aws athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with athena sql commands which will then scan the whole folder of parquet files and return the results. This destination works very similar to other sql based destinations, with the exception of the merge write disposition not being supported at this time. dlt metadata will be stored in the same bucket as the parquet files, but as iceberg tables.
+The athena destination stores data as parquet files in s3 buckets and creates [external tables in aws athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with athena sql commands which will then scan the whole folder of parquet files and return the results. This destination works very similar to other sql based destinations, with the exception of the merge write disposition not being supported at this time. dlt metadata will be stored in the same bucket as the parquet files, but as iceberg tables. Athena additionally supports writing individual data tables as iceberg tables, so they may be manipulated later; a common use case is to strip GDPR data from them.

 ## Setup Guide
 ### 1. Initialize the dlt project
@@ -110,11 +110,31 @@ Using a staging destination is mandatory when using the athena destination. If y
 If you decide to change the [filename layout](./filesystem#data-loading) from the default value, keep the following in mind so that athena can reliable build your tables:
 - You need to provide the `{table_name}` placeholder and this placeholder needs to be followed by a forward slash
 - You need to provide the `{file_id}` placeholder and it needs to be somewhere after the `{table_name}` placeholder.
- - {table_name} must be a first placeholder in the layout.
+ - {table_name} must be the first placeholder in the layout.

 ## Additional destination options

+### iceberg data tables
+You can save your tables as iceberg tables in athena. This enables you to, for example, delete data from them later if you need to. To switch a resource to the iceberg table format,
+supply the table_format argument like this:
+
+```python
+@dlt.resource(table_format="iceberg")
+def data() -> Iterable[TDataItem]:
+    ...
+```
+
+Alternatively you can set all tables to use the iceberg format with a config variable:
+
+```toml
+[destination.athena]
+force_iceberg = "True"
+```
+
+For every table created as an iceberg table, the athena destination will create a regular athena table in the staging dataset of both the filesystem as well as the athena glue catalog and then
+copy all data into the final iceberg table that lives with the non-iceberg tables in the same dataset on both filesystem and the glue catalog. Switching from iceberg to regular table or vice versa
+is not supported.
### dbt support diff --git a/tests/utils.py b/tests/utils.py index 00be15ffb2..2eba788542 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -38,8 +38,6 @@ # filter out active destinations for current tests ACTIVE_DESTINATIONS = set(dlt.config.get("ACTIVE_DESTINATIONS", list) or IMPLEMENTED_DESTINATIONS) -# ACTIVE_DESTINATIONS = {"duckdb"} - ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) From ba0c593abc8678bd9008f275ee51b7725e29b2f6 Mon Sep 17 00:00:00 2001 From: Dave Date: Sat, 14 Oct 2023 11:15:07 +0200 Subject: [PATCH 33/36] small fix --- dlt/common/destination/reference.py | 2 ++ dlt/destinations/job_client_impl.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index a4567a5b3e..13172b41e9 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -290,6 +290,8 @@ def _verify_schema(self) -> None: logger.warning(f"A column {column_name} in table {table_name} in schema {self.schema.name} is incomplete. It was not bound to the data during normalizations stage and its data type is unknown. Did you add this column manually in code ie. as a merge key?") def get_load_table(self, table_name: str, prepare_for_staging: bool = False) -> TTableSchema: + if table_name not in self.schema.tables: + return None try: # make a copy of the schema so modifications do not affect the original document table = deepcopy(self.schema.tables[table_name]) diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 0124a277d3..b54748eca2 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -326,16 +326,17 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # build sql canonical_name = self.sql_client.make_qualified_table_name(table_name) table = self.get_load_table(table_name) + table_format = table.get("table_format") if table else None sql_result: List[str] = [] if not generate_alter: # build CREATE sql = f"CREATE TABLE {canonical_name} (\n" - sql += ",\n".join([self._get_column_def_sql(c, table.get("table_format")) for c in new_columns]) + sql += ",\n".join([self._get_column_def_sql(c, table_format) for c in new_columns]) sql += ")" sql_result.append(sql) else: sql_base = f"ALTER TABLE {canonical_name}\n" - add_column_statements = self._make_add_column_sql(new_columns, table.get("table_format")) + add_column_statements = self._make_add_column_sql(new_columns, table_format) if self.capabilities.alter_add_multi_column: column_sql = ",\n" sql_result.append(sql_base + column_sql.join(add_column_statements)) From 1e8605c89d09619d898d33e4241cd6135d3f89c7 Mon Sep 17 00:00:00 2001 From: Dave Date: Mon, 16 Oct 2023 11:51:20 +0200 Subject: [PATCH 34/36] pr changes --- dlt/destinations/filesystem/filesystem.py | 13 +++++++++---- .../docs/dlt-ecosystem/destinations/athena.md | 4 ++-- tests/load/pipeline/test_dbt_helper.py | 4 ++-- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/filesystem/filesystem.py index ecadb74f8e..49ad36dd16 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/filesystem/filesystem.py @@ -83,21 +83,26 @@ def __init__(self, schema: Schema, config: FilesystemDestinationClientConfigurat # verify files layout. 
we need {table_name} and only allow {schema_name} before it, otherwise tables # cannot be replaced and we cannot initialize folders consistently self.table_prefix_layout = path_utils.get_table_prefix_layout(config.layout) - self.dataset_path = posixpath.join(self.fs_path, self.config.normalize_dataset_name(self.schema)) + self._dataset_path = self.config.normalize_dataset_name(self.schema) def drop_storage(self) -> None: if self.is_storage_initialized(): self.fs_client.rm(self.dataset_path, recursive=True) + @property + def dataset_path(self) -> str: + return posixpath.join(self.fs_path, self._dataset_path) + + @contextmanager def with_staging_dataset(self) -> Iterator["FilesystemClient"]: - current_dataset_path = self.dataset_path + current_dataset_path = self._dataset_path try: - self.dataset_path = posixpath.join(self.fs_path, self.config.normalize_dataset_name(self.schema)) + "_staging" + self._dataset_path = self.schema.naming.normalize_table_identifier(current_dataset_path + "_staging") yield self finally: # restore previous dataset name - self.dataset_path = current_dataset_path + self._dataset_path = current_dataset_path def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # clean up existing files for tables selected for truncating diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 4a24122220..74da2e8a6e 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -138,8 +138,8 @@ is not supported. ### dbt support -Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of generated dbt profile. -Athena adapter requires that you setup **region_name** in Athena configuration below. You can also setup table catalog name to change the default: **awsdatacatalog** +Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of generated dbt profile. At this point dbt is not supported for iceberg tables +on Athena. The Athena adapter requires that you setup **region_name** in Athena configuration below. 
You can also setup table catalog name to change the default: **awsdatacatalog** ```toml [destination.athena] aws_data_catalog="awsdatacatalog" diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index d6e9167555..37c1f0c607 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -59,7 +59,7 @@ def test_run_jaffle_package(destination_config: DestinationTestConfiguration, db def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: from docs.examples.chess.chess import chess if not destination_config.supports_dbt: - pytest.mark.skip("dbt is not supported for this destination configuration") + pytest.skip("dbt is not supported for this destination configuration") # provide chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" @@ -98,7 +98,7 @@ def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_ven def test_run_chess_dbt_to_other_dataset(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: from docs.examples.chess.chess import chess if not destination_config.supports_dbt: - pytest.mark.skip("dbt is not supported for this destination configuration") + pytest.skip("dbt is not supported for this destination configuration") # provide chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" From 78fc17a59b9ba9fb0c2ab9358fc6f33e1ed9d809 Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 16 Oct 2023 19:56:10 +0200 Subject: [PATCH 35/36] updates athena dbt docs --- docs/website/docs/dlt-ecosystem/destinations/athena.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 74da2e8a6e..9bd1682e97 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -132,14 +132,15 @@ Alternatively you can set all tables to use the iceberg format with a config var force_iceberg = "True" ``` -For every table created as an iceberg table, the athena destination will create a regular athena table in the staging dataset of both the filesystem as well as the athena glue catalog and then +For every table created as an iceberg table, the athena destination will create a regular athena table in the staging dataset of both the filesystem as well as the athena glue catalog and then copy all data into the final iceberg table that lives with the non-iceberg tables in the same dataset on both filesystem and the glue catalog. Switching from iceberg to regular table or vice versa is not supported. ### dbt support -Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of generated dbt profile. At this point dbt is not supported for iceberg tables -on Athena. The Athena adapter requires that you setup **region_name** in Athena configuration below. You can also setup table catalog name to change the default: **awsdatacatalog** +Athena is supported via `dbt-athena-community`. Credentials are passed into `aws_access_key_id` and `aws_secret_access_key` of generated dbt profile. Iceberg tables are supported but you need to make sure that you materialize your models as iceberg tables if your source table is iceberg. 
We encountered problems with materializing +date time columns due to different precision on iceberg (nanosecond) and regular Athena tables (millisecond). +The Athena adapter requires that you setup **region_name** in Athena configuration below. You can also setup table catalog name to change the default: **awsdatacatalog** ```toml [destination.athena] aws_data_catalog="awsdatacatalog" From 918c4d52d7a9c5b7416a9809f0119369f09d448e Mon Sep 17 00:00:00 2001 From: Marcin Rudolf Date: Mon, 16 Oct 2023 20:04:08 +0200 Subject: [PATCH 36/36] adds docsting on table format to decorators --- dlt/extract/decorators.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index b036bdc4a6..2125d0855e 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -317,6 +317,8 @@ def resource( merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. + table_format (Literal["iceberg"], optional): Defines the storage format of the table. Currently only "iceberg" is supported on Athena, other destinations ignore this hint. + selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source.