athena iceberg #659

Merged: 38 commits, Oct 16, 2023

Commits (38)

0e25102  first iceberg prototype (sh-rp, Sep 28, 2023)
26f9e41  fix linting and clearing of staging tables (sh-rp, Sep 28, 2023)
e199bd1  disable tests (sh-rp, Sep 28, 2023)
b768a63  enable iceberg tests for athena (sh-rp, Sep 29, 2023)
119ad6e  Merge branch 'devel' into d#/athena-iceberg (sh-rp, Oct 4, 2023)
29a6d06  fix iceberg detection (sh-rp, Oct 4, 2023)
439c72f  move athena tests to default sql configs (sh-rp, Oct 4, 2023)
b964388  finally fix regular athena tests... (sh-rp, Oct 4, 2023)
2b5f004  some more work (sh-rp, Oct 4, 2023)
7e82de7  fix replace disposition (sh-rp, Oct 5, 2023)
202466f  fix datatype support (sh-rp, Oct 5, 2023)
bd9744c  fix append for merge in iceberg (sh-rp, Oct 5, 2023)
f627a0f  fix merge jobs for iceberg (sh-rp, Oct 5, 2023)
9a94d4a  clean up followup jobs code (sh-rp, Oct 5, 2023)
8682350  set iceberg tests to merge supported (sh-rp, Oct 5, 2023)
1560768  fix sql merge syntax for iceberg (sh-rp, Oct 6, 2023)
3f4fb1e  separate regular athena and iceberg tests (sh-rp, Oct 6, 2023)
92613ec  remove some iceberg specific code (sh-rp, Oct 9, 2023)
0924bc5  new iceberg approach (sh-rp, Oct 9, 2023)
122d035  PR changes (sh-rp, Oct 11, 2023)
7750318  small changes (sh-rp, Oct 11, 2023)
0deecda  small changes (sh-rp, Oct 11, 2023)
702fd4b  fix two tests (sh-rp, Oct 11, 2023)
d70985d  add missing athena fixes (sh-rp, Oct 11, 2023)
95adc93  Merge branch 'devel' into d#/athena-iceberg (sh-rp, Oct 11, 2023)
06dbaeb  small changes (sh-rp, Oct 12, 2023)
baa5e44  fixes (sh-rp, Oct 12, 2023)
ad8dc9b  update (sh-rp, Oct 12, 2023)
00e474c  fix some tests (sh-rp, Oct 13, 2023)
acfcd16  small changes (sh-rp, Oct 13, 2023)
0707629  small changes (sh-rp, Oct 13, 2023)
4692e37  make type mapper table format sensitive (sh-rp, Oct 14, 2023)
10131e4  disable dbt tests for athena iceberg (sh-rp, Oct 14, 2023)
243246e  update doc (sh-rp, Oct 14, 2023)
ba0c593  small fix (sh-rp, Oct 14, 2023)
1e8605c  pr changes (sh-rp, Oct 16, 2023)
78fc17a  updates athena dbt docs (rudolfix, Oct 16, 2023)
918c4d5  adds docsting on table format to decorators (rudolfix, Oct 16, 2023)
4 changes: 4 additions & 0 deletions dlt/common/destination/reference.py
@@ -248,6 +248,10 @@ def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition]
# in the base job, all replace strategies are treated the same, see filesystem for example
return ["replace"]

def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]:
# some clients need to additionally be able to get the staging destination to truncate tables
return []

def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
"""Creates a list of followup jobs that should be executed after a table chain is completed"""
return []
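The hook above returns an empty list by default; a destination that loads through staging tables can override it so the loader also truncates tables on the staging destination for the listed write dispositions. A minimal sketch of such an override, using a hypothetical client class (the Athena override later in this PR is the real one):

```python
# Minimal sketch, assuming a hypothetical client class; the Athena client
# further down in this PR contains the real override.
from typing import List

from dlt.common.schema.typing import TWriteDisposition


class StagingTruncatingClient:
    def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]:
        # ask the loader to also truncate staging-destination tables for these dispositions
        return ["append", "replace", "merge"]
```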
1 change: 1 addition & 0 deletions dlt/destinations/athena/__init__.py
@@ -36,6 +36,7 @@ def capabilities() -> DestinationCapabilitiesContext:
caps.alter_add_multi_column = True
caps.schema_supports_numeric_precision = False
caps.timestamp_precision = 3
caps.supports_truncate_command = False
return caps


77 changes: 63 additions & 14 deletions dlt/destinations/athena/athena.py
@@ -16,21 +16,21 @@
from dlt.common.utils import without_none
from dlt.common.data_types import TDataType
from dlt.common.schema import TColumnSchema, Schema
from dlt.common.schema.typing import TTableSchema, TColumnType
from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition
from dlt.common.schema.utils import table_schema_has_type
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.reference import LoadJob
from dlt.common.destination.reference import TLoadJobState
from dlt.common.destination.reference import LoadJob, FollowupJob
from dlt.common.destination.reference import TLoadJobState, NewLoadJob
from dlt.common.storages import FileStorage
from dlt.common.data_writers.escape import escape_bigquery_identifier

from dlt.destinations.sql_jobs import SqlStagingCopyJob

from dlt.destinations.typing import DBApi, DBTransaction
from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, LoadJobTerminalException
from dlt.destinations.athena import capabilities
from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error
from dlt.destinations.typing import DBApiCursor
from dlt.destinations.job_client_impl import SqlJobClientBase, StorageSchemaInfo
from dlt.destinations.job_client_impl import SqlJobClientWithStaging
from dlt.destinations.athena.configuration import AthenaClientConfiguration
from dlt.destinations.type_mapping import TypeMapper
from dlt.destinations import path_utils
@@ -69,13 +69,18 @@ class AthenaTypeMapper(TypeMapper):
"int": "bigint",
}

def __init__(self, capabilities: DestinationCapabilitiesContext, iceberg_mode: bool):
super().__init__(capabilities)
self.iceberg_mode = iceberg_mode
Review comment (Collaborator): I do not think we need iceberg_mode; you can just set it up per table.

def to_db_integer_type(self, precision: Optional[int]) -> str:
if precision is None:
return "bigint"
# iceberg does not support smallint and tinyint
Review comment (Collaborator): FYI: TIMESTAMP is precision 6 on iceberg, 3 on parquet.

if precision <= 8:
return "tinyint"
return "int" if self.iceberg_mode else "tinyint"
Review comment (Collaborator): That's why the JobClient should create/modify the table schema; you can adjust the precision there and not hack the type mapper...

Review comment (Collaborator, PR author): Wouldn't it be cleanest to have a subclass for iceberg and set it before the table SQL is generated? I don't feel that changing the type mapper is hacking at all; that is what it is there for: changing the mapping of types depending on the database / table format you are storing into.

Review comment (Collaborator, PR author): Actually, we could extend the type mapper to carry the info about which table_format is currently being processed. That might be nice?

elif precision <= 16:
return "smallint"
return "int" if self.iceberg_mode else "smallint"
elif precision <= 32:
return "int"
return "bigint"
@@ -135,6 +140,11 @@ def exception(self) -> str:
# this part of code should be never reached
raise NotImplementedError()

class DoNothingFollowupJob(DoNothingJob, FollowupJob):
"""The second most lazy class of dlt"""
pass


class AthenaSQLClient(SqlClientBase[Connection]):

capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
@@ -276,7 +286,7 @@ def has_dataset(self) -> bool:
return len(rows) > 0


class AthenaClient(SqlJobClientBase):
class AthenaClient(SqlJobClientWithStaging):

capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()

@@ -293,11 +303,14 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None:
super().__init__(schema, config, sql_client)
self.sql_client: AthenaSQLClient = sql_client # type: ignore
self.config: AthenaClientConfiguration = config
self.type_mapper = AthenaTypeMapper(self.capabilities)
self.iceberg_mode = not (not self.config.iceberg_bucket_url)
self.type_mapper = AthenaTypeMapper(self.capabilities, self.iceberg_mode)

def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
# never truncate tables in athena
super().initialize_storage([])
# only truncate tables in iceberg mode
if not self.iceberg_mode or self.in_staging_mode:
truncate_tables = []
super().initialize_storage(truncate_tables)

def _from_db_type(self, hive_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType:
return self.type_mapper.from_db_type(hive_t, precision, scale)
@@ -307,12 +320,19 @@ def _get_column_def_sql(self, c: TColumnSchema) -> str:

def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]:

create_data_iceberg_tables = self.iceberg_mode and not self.in_staging_mode

bucket = self.config.staging_config.bucket_url
dataset = self.sql_client.dataset_name
if create_data_iceberg_tables:
bucket = self.config.iceberg_bucket_url

# TODO: we need to strip the staging layout from the table name, find a better way!
dataset = self.sql_client.dataset_name.replace("_staging", "")
sql: List[str] = []

# for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries
is_iceberg = self.schema.tables[table_name].get("write_disposition", None) == "skip"
# or if we are in iceberg mode, we create iceberg tables for all tables
is_iceberg = create_data_iceberg_tables or (self.schema.tables[table_name].get("write_disposition", None) == "skip")
columns = ", ".join([self._get_column_def_sql(c) for c in new_columns])

# this will fail if the table prefix is not properly defined
@@ -345,9 +365,38 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
)
job = super().start_file_load(table, file_path, load_id)
if not job:
job = DoNothingJob(file_path)
job = DoNothingFollowupJob(file_path) if self.iceberg_mode else DoNothingJob(file_path)
return job

def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
"""Creates a list of followup jobs for merge write disposition and staging replace strategies"""
jobs = super().create_table_chain_completed_followup_jobs(table_chain)

# add some additional jobs
write_disposition = table_chain[0]["write_disposition"]
if write_disposition == "append":
jobs.append(self._create_staging_copy_job(table_chain, False))
elif write_disposition == "replace" and self.config.replace_strategy == "truncate-and-insert":
jobs.append(self._create_staging_copy_job(table_chain, False))
return jobs

def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob:
"""update destination tables from staging tables"""
if self.iceberg_mode:
return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": replace})
return super()._create_staging_copy_job(table_chain, replace=replace)

def get_stage_dispositions(self) -> List[TWriteDisposition]:
# in iceberg mode, we always use staging tables
if self.iceberg_mode:
return ["append", "replace", "merge"]
return super().get_stage_dispositions()

def get_truncate_staging_destination_table_dispositions(self) -> List[TWriteDisposition]:
if self.iceberg_mode:
return ["append", "replace", "merge"]
return []

@staticmethod
def is_dbapi_exception(ex: Exception) -> bool:
return isinstance(ex, Error)
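In iceberg mode every write disposition is routed through staging: files are loaded into staging tables and a SqlStagingCopyJob then copies the rows into the iceberg tables. A rough sketch of the kind of SQL such a copy job could emit, with illustrative dataset and table names and the replace flag standing in for the new SqlJobParams:

```python
# Rough sketch only; the real statements are produced by SqlStagingCopyJob
# using the client's qualified table names.
from typing import List


def staging_copy_sql(dataset: str, staging_dataset: str, table: str, replace: bool) -> List[str]:
    destination = f'"{dataset}"."{table}"'
    staging = f'"{staging_dataset}"."{table}"'
    sql: List[str] = []
    if replace:
        # replace disposition: clear the destination iceberg table before copying
        sql.append(f"DELETE FROM {destination};")
    # append and replace both insert the staged rows into the destination table
    sql.append(f"INSERT INTO {destination} SELECT * FROM {staging};")
    return sql


# e.g. staging_copy_sql("analytics", "analytics_staging", "events", replace=False)
```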
2 changes: 2 additions & 0 deletions dlt/destinations/athena/configuration.py
@@ -9,9 +9,11 @@
class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration):
destination_name: Final[str] = "athena" # type: ignore[misc]
query_result_bucket: str = None
iceberg_bucket_url: Optional[str] = None
credentials: AwsCredentials = None
athena_work_group: Optional[str] = None
aws_data_catalog: Optional[str] = "awsdatacatalog"
supports_truncate_command: bool = False

__config_gen_annotations__: ClassVar[List[str]] = ["athena_work_group"]
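For illustration, the new iceberg_bucket_url option could be supplied like any other destination setting, assuming dlt's usual SECTION__SUBSECTION__KEY environment-variable convention (the exact key paths below are an assumption, not taken from this PR):

```python
import os

# Assumed key paths following dlt's config naming convention; verify against
# the Athena destination docs before relying on them.
os.environ["DESTINATION__ATHENA__ICEBERG_BUCKET_URL"] = "s3://example-bucket/iceberg"
os.environ["DESTINATION__ATHENA__QUERY_RESULT_BUCKET"] = "s3://example-bucket/athena-results"
os.environ["DESTINATION__ATHENA__ATHENA_WORK_GROUP"] = "primary"
```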

4 changes: 2 additions & 2 deletions dlt/destinations/bigquery/bigquery.py
@@ -19,7 +19,7 @@
from dlt.destinations.bigquery import capabilities
from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration
from dlt.destinations.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS
from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob
from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob, SqlJobParams
from dlt.destinations.job_impl import NewReferenceJob
from dlt.destinations.sql_client import SqlClientBase
from dlt.destinations.type_mapping import TypeMapper
@@ -138,7 +138,7 @@ def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: st
class BigqueryStagingCopyJob(SqlStagingCopyJob):

@classmethod
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]:
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]:
sql: List[str] = []
for table in table_chain:
with sql_client.with_staging_dataset(staging=True):
23 changes: 16 additions & 7 deletions dlt/destinations/job_client_impl.py
@@ -148,24 +148,26 @@ def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition]
def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob:
return SqlMergeJob.from_table_chain(table_chain, self.sql_client)

def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob:
def _create_staging_copy_job(self, table_chain: Sequence[TTableSchema], replace: bool) -> NewLoadJob:
"""update destination tables from staging tables"""
return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client)
return SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})

def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob:
"""optimized replace strategy, defaults to _create_staging_copy_job for the basic client
for some destinations there are much faster destination updates at the cost of
dropping tables possible"""
return self._create_staging_copy_job(table_chain)
return self._create_staging_copy_job(table_chain, True)

def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
"""Creates a list of followup jobs for merge write disposition and staging replace strategies"""
jobs = super().create_table_chain_completed_followup_jobs(table_chain)
write_disposition = table_chain[0]["write_disposition"]
if write_disposition == "merge":
if write_disposition == "append":
pass
elif write_disposition == "merge":
jobs.append(self._create_merge_job(table_chain))
elif write_disposition == "replace" and self.config.replace_strategy == "insert-from-staging":
jobs.append(self._create_staging_copy_job(table_chain))
jobs.append(self._create_staging_copy_job(table_chain, True))
elif write_disposition == "replace" and self.config.replace_strategy == "staging-optimized":
jobs.append(self._create_optimized_replace_job(table_chain))
return jobs
@@ -431,10 +433,17 @@ def _commit_schema_update(self, schema: Schema, schema_str: str) -> None:


class SqlJobClientWithStaging(SqlJobClientBase, WithStagingDataset):

in_staging_mode: bool = False

@contextlib.contextmanager
def with_staging_dataset(self)-> Iterator["SqlJobClientBase"]:
with self.sql_client.with_staging_dataset(True):
yield self
try:
with self.sql_client.with_staging_dataset(True):
self.in_staging_mode = True
yield self
finally:
self.in_staging_mode = False

def get_stage_dispositions(self) -> List[TWriteDisposition]:
"""Returns a list of dispositions that require staging tables to be populated"""
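The change above is a flag-plus-context-manager pattern: in_staging_mode is only true while the with_staging_dataset() block is active, and the try/finally resets it even if a job raises. A stripped-down illustration:

```python
import contextlib
from typing import Iterator


class StagingAwareClient:  # simplified stand-in for SqlJobClientWithStaging
    in_staging_mode: bool = False

    @contextlib.contextmanager
    def with_staging_dataset(self) -> Iterator["StagingAwareClient"]:
        try:
            self.in_staging_mode = True
            yield self
        finally:
            self.in_staging_mode = False


client = StagingAwareClient()
with client.with_staging_dataset():
    assert client.in_staging_mode  # e.g. skip truncating iceberg data tables here
assert not client.in_staging_mode  # always reset, even if the block raised
```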
4 changes: 2 additions & 2 deletions dlt/destinations/mssql/mssql.py
@@ -8,7 +8,7 @@
from dlt.common.schema.typing import TTableSchema, TColumnType
from dlt.common.utils import uniq_id

from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlMergeJob
from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlMergeJob, SqlJobParams

from dlt.destinations.insert_job_client import InsertValuesJobClient

@@ -83,7 +83,7 @@ def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[i
class MsSqlStagingCopyJob(SqlStagingCopyJob):

@classmethod
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]:
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]:
sql: List[str] = []
for table in table_chain:
with sql_client.with_staging_dataset(staging=True):
4 changes: 2 additions & 2 deletions dlt/destinations/postgres/postgres.py
@@ -7,7 +7,7 @@
from dlt.common.schema import TColumnSchema, TColumnHint, Schema
from dlt.common.schema.typing import TTableSchema, TColumnType

from dlt.destinations.sql_jobs import SqlStagingCopyJob
from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams

from dlt.destinations.insert_job_client import InsertValuesJobClient

@@ -79,7 +79,7 @@ def from_db_type(self, db_type: str, precision: Optional[int] = None, scale: Opt
class PostgresStagingCopyJob(SqlStagingCopyJob):

@classmethod
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]:
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]:
sql: List[str] = []
for table in table_chain:
with sql_client.with_staging_dataset(staging=True):
7 changes: 3 additions & 4 deletions dlt/destinations/snowflake/snowflake.py
@@ -17,7 +17,7 @@
from dlt.destinations.snowflake import capabilities
from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration
from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient
from dlt.destinations.sql_jobs import SqlStagingCopyJob
from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams
from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient
from dlt.destinations.job_impl import NewReferenceJob
from dlt.destinations.sql_client import SqlClientBase
@@ -157,13 +157,12 @@ def exception(self) -> str:
class SnowflakeStagingCopyJob(SqlStagingCopyJob):

@classmethod
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]:
def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]:
sql: List[str] = []
for table in table_chain:
with sql_client.with_staging_dataset(staging=True):
staging_table_name = sql_client.make_qualified_table_name(table["name"])
table_name = sql_client.make_qualified_table_name(table["name"])
# drop destination table
sql.append(f"DROP TABLE IF EXISTS {table_name};")
# recreate destination table with data cloned from staging table
sql.append(f"CREATE TABLE {table_name} CLONE {staging_table_name};")
@@ -206,7 +205,7 @@ def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema]) -> List[str
return ["ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c) for c in new_columns)]

def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob:
return SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client)
return SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})

def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool, separate_alters: bool = False) -> List[str]:
sql = super()._get_table_update_sql(table_name, new_columns, generate_alter)