dlt-hub · rudolfix · Oct 16, 2023 · Sep 28, 2023 · Sep 28, 2023 · Sep 28, 2023
diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py
@@ -10,6 +10,7 @@
 from dlt.common.schema import Schema, TTableSchema, TSchemaTables
 from dlt.common.schema.typing import TWriteDisposition
 from dlt.common.schema.exceptions import InvalidDatasetName
+from dlt.common.schema.utils import get_load_table
 from dlt.common.configuration import configspec
 from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration
 from dlt.common.configuration.accessors import config
@@ -244,9 +245,8 @@ def restore_file_load(self, file_path: str) -> LoadJob:
         """Finds and restores already started loading job identified by `file_path` if destination supports it."""
         pass
 
-    def get_truncate_destination_table_dispositions(self) -> List[TWriteDisposition]:
-        # in the base job, all replace strategies are treated the same, see filesystem for example
-        return ["replace"]
+    def table_needs_truncating(self, table: TTableSchema) -> bool:
+        """Returns True if `table` has the "replace" write disposition.
+
+        Reads `table["write_disposition"]` directly, so a KeyError is raised when the key is not set on `table`.
+        """
+        return table["write_disposition"] == "replace"
 
     def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
         """Creates a list of followup jobs that should be executed after a table chain is completed"""
@@ -309,9 +309,8 @@ class WithStagingDataset(ABC):
     """Adds capability to use staging dataset and request it from the loader"""
 
     @abstractmethod
-    def get_stage_dispositions(self) -> List[TWriteDisposition]:
-        """Returns a list of write dispositions that require staging dataset"""
-        return []
+    def table_needs_staging(self, table: TTableSchema) -> bool:
+        """Returns True if `table` requires the staging dataset. This base implementation always returns False"""
+        return False
 
     @abstractmethod
     def with_staging_dataset(self)-> ContextManager["JobClientBase"]:

diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py
@@ -69,3 +69,8 @@ def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engi
         self.from_engine = from_engine
         self.to_engine = to_engine
         super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}")
+
+class UnknownTableException(SchemaException):
+    """Raised when an unknown table is accessed. The offending name is available as `table_name`."""
+    def __init__(self, table_name: str) -> None:
+        self.table_name = table_name
+        super().__init__(f"Trying to access unknown table {table_name}.")
diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py
@@ -24,6 +24,7 @@
 TColumnHint = Literal["not_null", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique", "root_key", "merge_key"]
 """Known hints of a column used to declare hint regexes."""
 TWriteDisposition = Literal["skip", "append", "replace", "merge"]
+TTableFormat = Literal["iceberg"]
 TTypeDetections = Literal["timestamp", "iso_timestamp", "large_integer", "hexbytes_to_text", "wei_to_double"]
 TTypeDetectionFunc = Callable[[Type[Any], Any], Optional[TDataType]]
 TColumnNames = Union[str, Sequence[str]]
@@ -86,6 +87,7 @@ class TTableSchema(TypedDict, total=False):
     filters: Optional[TRowFilters]
     columns: TTableSchemaColumns
     resource: Optional[str]
+    table_format: Optional[TTableFormat]
 
 
 class TPartialTableSchema(TTableSchema):

diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py
@@ -15,10 +15,10 @@
 from dlt.common.validation import TCustomValidator, validate_dict, validate_dict_ignoring_xkeys
 from dlt.common.schema import detections
 from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate,
-                                      TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp,
+                                      TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, TTableFormat, 
                                       TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition)
 from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException,
-                                          TablePropertiesConflictException, InvalidSchemaName)
+                                          TablePropertiesConflictException, InvalidSchemaName, UnknownTableException)
 
 from dlt.common.normalizers.utils import import_normalizers
 from dlt.common.schema.typing import TAnySchemaColumns
@@ -493,18 +493,29 @@ def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTabl
     return aggregated_update
 
 
-def get_write_disposition(tables: TSchemaTables, table_name: str) -> TWriteDisposition:
-    """Returns write disposition of a table if present. If not, looks up into parent table"""
+def get_inherited_table_hint(tables: TSchemaTables, table_name: str, table_hint_name: str, allow_none: bool = False) -> Any:
+    """Returns hint `table_hint_name` of table `table_name`. If it is not set (or falsy), looks it up in the chain of parent tables.
+
+    When no table in the chain defines the hint, returns None if `allow_none` is True and raises ValueError otherwise.
+    A KeyError propagates when `table_name` or a referenced parent is not present in `tables`.
+    """
     table = tables[table_name]
-    w_d = table.get("write_disposition")
-    if w_d:
-        return w_d
+    hint = table.get(table_hint_name)
+    if hint:
+        return hint
 
     parent = table.get("parent")
     if parent:
-        return get_write_disposition(tables, parent)
+        return get_inherited_table_hint(tables, parent, table_hint_name, allow_none)
+
+    if allow_none:
+        return None
+
+    raise ValueError(f"No table hint '{table_hint_name}' found in the chain of tables for '{table_name}'.")
+
+
+def get_write_disposition(tables: TSchemaTables, table_name: str) -> TWriteDisposition:
+    """Returns write disposition of a table if present. If not, looks up into parent table. Raises ValueError if none is found in the chain."""
+    return get_inherited_table_hint(tables, table_name, "write_disposition", allow_none=False)
+
 
-    raise ValueError(f"No write disposition found in the chain of tables for '{table_name}'.")
+def get_table_format(tables: TSchemaTables, table_name: str) -> TTableFormat:
+    """Returns table format of a table if present. If not, looks up into parent table. Returns None if no table in the chain defines it."""
+    # NOTE(review): `allow_none=True` means None may be returned although the annotation is `TTableFormat`
+    return get_inherited_table_hint(tables, table_name, "table_format", allow_none=True)
 
 
 def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool:
@@ -525,6 +536,18 @@ def get_top_level_table(tables: TSchemaTables, table_name: str) -> TTableSchema:
         return get_top_level_table(tables, parent)
     return table
 
+def get_load_table(tables: TSchemaTables, table_name: str) -> TTableSchema:
+    """Returns a copy of table `table_name` with `write_disposition` and `table_format` filled in when the table does not define them.
+
+    Missing values are resolved with `get_write_disposition` and `get_table_format`.
+    Any KeyError raised inside the `try` block - including one raised by those helpers - is converted to UnknownTableException for `table_name`.
+    """
+    try:
+        # make a copy of the schema so modifications do not affect the original document
+        # NOTE(review): presumably `copy` is `copy.copy` (shallow), in which case nested values are still shared with the original - confirm
+        table = copy(tables[table_name])
+        # add write disposition if not specified - in child tables
+        if "write_disposition" not in table:
+            table["write_disposition"] = get_write_disposition(tables, table_name)
+        # same for table format, the resolved value may be None
+        if "table_format" not in table:
+            table["table_format"] = get_table_format(tables, table_name)
+        return table
+    except KeyError:
+        raise UnknownTableException(table_name)
 
 def get_child_tables(tables: TSchemaTables, table_name: str) -> List[TTableSchema]:
     """Get child tables for table name and return a list of tables ordered by ancestry so the child tables are always after their parents"""
@@ -637,7 +660,8 @@ def new_table(
     write_disposition: TWriteDisposition = None,
     columns: Sequence[TColumnSchema] = None,
     validate_schema: bool = False,
-    resource: str = None
+    resource: str = None,
+    table_format: TTableFormat = None
 ) -> TTableSchema:
 
     table: TTableSchema = {
@@ -652,6 +676,8 @@ def new_table(
         # set write disposition only for root tables
         table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION
         table["resource"] = resource or table_name
+        if table_format:
+            table["table_format"] = table_format
     if validate_schema:
         validate_dict_ignoring_xkeys(
             spec=TColumnSchema,

diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py
@@ -237,8 +237,11 @@ def list_failed_jobs(self, load_id: str) -> Sequence[str]:
         return self.storage.list_folder_files(self._get_job_folder_path(load_id, LoadStorage.FAILED_JOBS_FOLDER))
 
     def list_jobs_for_table(self, load_id: str, table_name: str) -> Sequence[LoadJobInfo]:
+        """Returns the jobs from `list_all_jobs(load_id)` whose `job_file_info.table_name` equals `table_name`."""
+        return [job for job in self.list_all_jobs(load_id) if job.job_file_info.table_name == table_name]
+
+    def list_all_jobs(self, load_id: str) -> Sequence[LoadJobInfo]:
+        """Returns all jobs of load package `load_id` as one list: the values of `info.jobs` passed through `flatten_list_or_items`."""
         info = self.get_load_package_info(load_id)
-        return [job for job in flatten_list_or_items(iter(info.jobs.values())) if job.job_file_info.table_name == table_name]  # type: ignore
+        return [job for job in flatten_list_or_items(iter(info.jobs.values()))]  # type: ignore
 
     def list_completed_failed_jobs(self, load_id: str) -> Sequence[str]:
         return self.storage.list_folder_files(self._get_job_folder_completed_path(load_id, LoadStorage.FAILED_JOBS_FOLDER))

diff --git a/dlt/destinations/athena/__init__.py b/dlt/destinations/athena/__init__.py
@@ -36,6 +36,7 @@ def capabilities() -> DestinationCapabilitiesContext:
     caps.alter_add_multi_column = True
     caps.schema_supports_numeric_precision = False
     caps.timestamp_precision = 3
+    caps.supports_truncate_command = False
     return caps
 
 

diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/athena/athena.py
@@ -16,21 +16,21 @@
 from dlt.common.utils import without_none
 from dlt.common.data_types import TDataType
 from dlt.common.schema import TColumnSchema, Schema
-from dlt.common.schema.typing import TTableSchema, TColumnType
-from dlt.common.schema.utils import table_schema_has_type
+from dlt.common.schema.typing import TTableSchema, TColumnType, TWriteDisposition
+from dlt.common.schema.utils import table_schema_has_type, get_table_format
 from dlt.common.destination import DestinationCapabilitiesContext
-from dlt.common.destination.reference import LoadJob
-from dlt.common.destination.reference import TLoadJobState
+from dlt.common.destination.reference import LoadJob, FollowupJob
+from dlt.common.destination.reference import TLoadJobState, NewLoadJob
 from dlt.common.storages import FileStorage
 from dlt.common.data_writers.escape import escape_bigquery_identifier
-
+from dlt.destinations.sql_jobs import SqlStagingCopyJob
 
 from dlt.destinations.typing import DBApi, DBTransaction
 from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, LoadJobTerminalException
 from dlt.destinations.athena import capabilities
 from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error
 from dlt.destinations.typing import DBApiCursor
-from dlt.destinations.job_client_impl import SqlJobClientBase, StorageSchemaInfo
+from dlt.destinations.job_client_impl import SqlJobClientWithStaging
 from dlt.destinations.athena.configuration import AthenaClientConfiguration
 from dlt.destinations.type_mapping import TypeMapper
 from dlt.destinations import path_utils
@@ -69,13 +69,18 @@ class AthenaTypeMapper(TypeMapper):
         "int": "bigint",
     }
 
+    def __init__(self, capabilities: DestinationCapabilitiesContext, iceberg_mode: bool):
+        """Initializes the base mapper with `capabilities` and stores the `iceberg_mode` flag that `to_db_integer_type` reads."""
+        super().__init__(capabilities)
+        self.iceberg_mode = iceberg_mode
+
     def to_db_integer_type(self, precision: Optional[int]) -> str:
+        """Maps integer `precision` to a database type name.
+
+        None -> "bigint", <= 8 -> "tinyint", <= 16 -> "smallint", <= 32 -> "int", anything larger -> "bigint".
+        When `self.iceberg_mode` is set, "int" is returned in place of "tinyint" and "smallint".
+        """
         if precision is None:
             return "bigint"
+        # iceberg does not support smallint and tinyint
         if precision <= 8:
-            return "tinyint"
+            return "int" if self.iceberg_mode else "tinyint"
         elif precision <= 16:
-            return "smallint"
+            return "int" if self.iceberg_mode else "smallint"
         elif precision <= 32:
             return "int"
         return "bigint"
@@ -135,6 +140,11 @@ def exception(self) -> str:
         # this part of code should be never reached
         raise NotImplementedError()
 
+class DoNothingFollowupJob(DoNothingJob, FollowupJob):
+    """A `DoNothingJob` that is also a `FollowupJob`. Adds no behavior of its own."""
+    pass
+
+
 class AthenaSQLClient(SqlClientBase[Connection]):
 
     capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
@@ -276,7 +286,7 @@ def has_dataset(self) -> bool:
         return len(rows) > 0
 
 
-class AthenaClient(SqlJobClientBase):
+class AthenaClient(SqlJobClientWithStaging):
 
     capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities()
 
@@ -293,11 +303,12 @@ def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None:
         super().__init__(schema, config, sql_client)
         self.sql_client: AthenaSQLClient = sql_client  # type: ignore
         self.config: AthenaClientConfiguration = config
-        self.type_mapper = AthenaTypeMapper(self.capabilities)
+        self.type_mapper = AthenaTypeMapper(self.capabilities, True)
 
     def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None:
-        # never truncate tables in athena
-        super().initialize_storage([])
+        # NOTE: the `truncate_tables` argument is discarded - an empty list is always passed on, so no tables are truncated here
+        truncate_tables = []
+        super().initialize_storage(truncate_tables)
 
     def _from_db_type(self, hive_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType:
         return self.type_mapper.from_db_type(hive_t, precision, scale)
@@ -309,15 +320,18 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc
 
         bucket = self.config.staging_config.bucket_url
         dataset = self.sql_client.dataset_name
+
         sql: List[str] = []
 
         # for the system tables we need to create empty iceberg tables to be able to run, DELETE and UPDATE queries
-        is_iceberg = self.schema.tables[table_name].get("write_disposition", None) == "skip"
+        # or if we are in iceberg mode, we create iceberg tables for all tables
+        is_iceberg = (self.schema.tables[table_name].get("write_disposition", None) == "skip") or (self._is_iceberg_table(self.schema.tables[table_name]) and not self.in_staging_mode)
         columns = ", ".join([self._get_column_def_sql(c) for c in new_columns])
 
         # this will fail if the table prefix is not properly defined
         table_prefix = self.table_prefix_layout.format(table_name=table_name)
         location = f"{bucket}/{dataset}/{table_prefix}"
+
         # use qualified table names
         qualified_table_name = self.sql_client.make_qualified_ddl_table_name(table_name)
         if is_iceberg and not generate_alter:
@@ -345,9 +359,29 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
             )
         job = super().start_file_load(table, file_path, load_id)
         if not job:
-            job = DoNothingJob(file_path)
+            job = DoNothingFollowupJob(file_path) if self._is_iceberg_table(table) else DoNothingJob(file_path)
         return job
 
+    def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
+        """Returns a single `SqlStagingCopyJob` created with `{"replace": False}` when the first table of `table_chain` is an iceberg table, otherwise defers to the base class."""
+        # NOTE(review): indexes `table_chain[0]`, so this assumes the chain is never empty - confirm with callers
+        if self._is_iceberg_table(table_chain[0]):
+            return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": False})]
+        return super()._create_append_followup_jobs(table_chain)
+
+    def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
+        """Returns a single `SqlStagingCopyJob` created with `{"replace": True}` when the first table of `table_chain` is an iceberg table, otherwise defers to the base class."""
+        # NOTE(review): indexes `table_chain[0]`, so this assumes the chain is never empty - confirm with callers
+        if self._is_iceberg_table(table_chain[0]):
+            return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})]
+        return super()._create_replace_followup_jobs(table_chain)
+
+    def _is_iceberg_table(self, table: TTableSchema) -> bool:
+        """Returns True if the table format resolved for `table["name"]` is "iceberg"."""
+        # the format is looked up by name in `self.schema.tables`, only the "name" key of the passed `table` is read
+        table_format = get_table_format(self.schema.tables, table["name"])
+        return table_format == "iceberg"
+
+    def table_needs_staging(self, table: TTableSchema) -> bool:
+        """Returns True for iceberg tables, for all other tables the decision is left to the base class."""
+        # all iceberg tables need staging
+        if self._is_iceberg_table(table):
+            return True
+        return super().table_needs_staging(table)
+
     @staticmethod
     def is_dbapi_exception(ex: Exception) -> bool:
         return isinstance(ex, Error)
diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/athena/configuration.py
@@ -12,6 +12,7 @@ class AthenaClientConfiguration(DestinationClientDwhWithStagingConfiguration):
     credentials: AwsCredentials = None
     athena_work_group: Optional[str] = None
     aws_data_catalog: Optional[str] = "awsdatacatalog"
+    supports_truncate_command: bool = False
 
     __config_gen_annotations__: ClassVar[List[str]] = ["athena_work_group"]
 

diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/bigquery/bigquery.py
@@ -12,14 +12,15 @@
 from dlt.common.storages.file_storage import FileStorage
 from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns
 from dlt.common.schema.typing import TTableSchema, TColumnType
+from dlt.common.schema.exceptions import UnknownTableException
 
 from dlt.destinations.job_client_impl import SqlJobClientWithStaging
-from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException, LoadJobUnknownTableException
+from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException
 
 from dlt.destinations.bigquery import capabilities
 from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration
 from dlt.destinations.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS
-from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob
+from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob, SqlJobParams
 from dlt.destinations.job_impl import NewReferenceJob
 from dlt.destinations.sql_client import SqlClientBase
 from dlt.destinations.type_mapping import TypeMapper
@@ -138,7 +139,7 @@ def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: st
 class BigqueryStagingCopyJob(SqlStagingCopyJob):
 
     @classmethod
-    def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]:
+    def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]:
         sql: List[str] = []
         for table in table_chain:
             with sql_client.with_staging_dataset(staging=True):
@@ -167,11 +168,13 @@ def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None:
         self.sql_client: BigQuerySqlClient = sql_client  # type: ignore
         self.type_mapper = BigQueryTypeMapper(self.capabilities)
 
-    def _create_merge_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob:
-        return BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)
+    def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
+        """Returns a list with a single `BigQueryMergeJob` built from `table_chain`."""
+        return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)]
 
-    def _create_optimized_replace_job(self, table_chain: Sequence[TTableSchema]) -> NewLoadJob:
-        return BigqueryStagingCopyJob.from_table_chain(table_chain, self.sql_client)
+    def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]:
+        """Returns a single `BigqueryStagingCopyJob` when `replace_strategy` is "staging-optimized", any other strategy is handled by the base class."""
+        if self.config.replace_strategy == "staging-optimized":
+            return [BigqueryStagingCopyJob.from_table_chain(table_chain, self.sql_client)]
+        return super()._create_replace_followup_jobs(table_chain)
 
     def restore_file_load(self, file_path: str) -> LoadJob:
         """Returns a completed SqlLoadJob or restored BigQueryLoadJob
@@ -218,7 +221,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) ->
                 reason = BigQuerySqlClient._get_reason_from_errors(gace)
                 if reason == "notFound":
                     # google.api_core.exceptions.NotFound: 404 - table not found
-                    raise LoadJobUnknownTableException(table["name"], file_path)
+                    raise UnknownTableException(table["name"])
                 elif reason == "duplicate":
                     # google.api_core.exceptions.Conflict: 409 PUT - already exists
                     return self.restore_file_load(file_path)

diff --git a/dlt/destinations/exceptions.py b/dlt/destinations/exceptions.py
@@ -63,12 +63,6 @@ def __init__(self, file_path: str, message: str) -> None:
         super().__init__(f"Job with id/file name {file_path} encountered unrecoverable problem: {message}")
 
 
-class LoadJobUnknownTableException(DestinationTerminalException):
-    def __init__(self, table_name: str, file_name: str) -> None:
-        self.table_name = table_name
-        super().__init__(f"Client does not know table {table_name} for load file {file_name}")
-
-
 class LoadJobInvalidStateTransitionException(DestinationTerminalException):
     def __init__(self, from_state: TLoadJobState, to_state: TLoadJobState) -> None:
         self.from_state = from_state