Introduce hard_delete and dedup_sort column hints for merge #960

Merged: 32 commits, merged Feb 24, 2024

Changes from 6 commits

Commits (32)
82c3634
black formatting
Feb 12, 2024
97c5512
remove unused exception
Feb 12, 2024
400d84b
add initial support for replicate write disposition
Feb 12, 2024
24f362e
add hard_delete hint and sorted deduplication for merge
Feb 14, 2024
f3a4878
undo config change
Feb 14, 2024
deb816f
undo unintentional changes
Feb 14, 2024
4a38d56
refactor hard_delete handling and introduce dedup_sort hint
Feb 15, 2024
0d1c977
update docstring
Feb 15, 2024
474d8bc
replace dialect-specific SQL
Feb 16, 2024
568ef26
add parentheses to ensure proper clause evaluation order
Feb 16, 2024
81ea426
add escape defaults and temp tables for non-primary key case
Feb 16, 2024
a04a238
exclude destinations that don't support merge from test
Feb 17, 2024
8ac0f9c
correct typo
Feb 20, 2024
ec115e9
extend docstring
Feb 20, 2024
a1afeb8
remove redundant copies for immutable strings
Feb 20, 2024
f07205d
simplify boolean logic
Feb 20, 2024
a64580d
add more test cases for hard_delete and dedup_sort hints
Feb 20, 2024
3308549
refactor table chain resolution
Feb 21, 2024
189c2fb
marks tables that seen data in normalizer, skips empty jobs if never …
rudolfix Feb 22, 2024
a649b0e
ignores tables that didn't seen data when loading, tests edge cases
rudolfix Feb 22, 2024
9778f0e
Merge branch 'devel' into 947-core-extensions-to-support-database-rep…
rudolfix Feb 22, 2024
4b3c59b
add sort order configuration option
Feb 22, 2024
c984c4e
bumps schema engine to v9, adds migrations
rudolfix Feb 22, 2024
935748a
filters tables without data properly in load
rudolfix Feb 22, 2024
d125556
converts seen-data to boolean, fixes tests
rudolfix Feb 22, 2024
ecaf6ef
Merge branch '947-core-extensions-to-support-database-replication' of…
rudolfix Feb 22, 2024
af0b344
disables filesystem tests config due to merge present
rudolfix Feb 22, 2024
262018b
add docs for hard_delete and dedup_sort column hints
Feb 22, 2024
0814bb0
Merge branch '947-core-extensions-to-support-database-replication' of…
Feb 22, 2024
44a9ff2
fixes extending table chains in load
rudolfix Feb 23, 2024
9384148
Merge branch '947-core-extensions-to-support-database-replication' of…
rudolfix Feb 23, 2024
9921b89
refactors load and adds unit tests with dummy
rudolfix Feb 24, 2024
dlt/common/schema/typing.py (1 addition, 0 deletions)
@@ -112,6 +112,7 @@ class TColumnSchema(TColumnSchemaBase, total=False):
root_key: Optional[bool]
merge_key: Optional[bool]
variant: Optional[bool]
hard_delete: Optional[bool]


TTableSchemaColumns = Dict[str, TColumnSchema]
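For context, the new property is meant to be attached through dlt's regular column-hint mechanism. A minimal sketch of how a resource might declare it, assuming the columns argument of dlt.resource accepts the new hint like any other column hint; the resource, column names, and data are illustrative:

import dlt

@dlt.resource(
    primary_key="id",
    write_disposition="merge",
    columns={"deleted": {"hard_delete": True}},  # assumed hint placement
)
def change_feed():
    # illustrative change events; the row with deleted=True is the one the
    # merge job should remove from the destination
    yield [
        {"id": 1, "value": "a", "deleted": False},
        {"id": 2, "value": "b", "deleted": True},
    ]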
dlt/common/schema/utils.py (7 additions, 0 deletions)
@@ -573,6 +573,13 @@ def get_columns_names_with_prop(
]


def has_column_with_prop(
table: TTableSchema, column_prop: Union[TColumnProp, str], include_incomplete: bool = False
) -> bool:
"""Checks if `table` schema contains column with property `column_prop`."""
return len(get_columns_names_with_prop(table, column_prop, include_incomplete)) > 0


def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTables:
aggregated_update: TSchemaTables = {}
for schema_update in schema_updates:
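A quick sketch of how the new helper composes with get_columns_names_with_prop, using a hand-built table dict trimmed to the keys the helpers read (the schema fragment is an assumption for illustration, not a real dlt schema):

from dlt.common.schema.utils import get_columns_names_with_prop, has_column_with_prop

table = {
    "name": "users",
    "columns": {
        "id": {"name": "id", "data_type": "bigint", "primary_key": True},
        "deleted": {"name": "deleted", "data_type": "bool", "hard_delete": True},
    },
}

if has_column_with_prop(table, "hard_delete"):
    # get_columns_names_with_prop returns ["deleted"] here
    hard_delete_column = get_columns_names_with_prop(table, "hard_delete")[0]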
dlt/destinations/impl/athena/athena.py (7 additions, 9 deletions)
@@ -351,7 +351,9 @@ def _from_db_type(
return self.type_mapper.from_db_type(hive_t, precision, scale)

def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
return f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}"
return (
f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}"
)

def _get_table_update_sql(
self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool
@@ -376,19 +378,15 @@ def _get_table_update_sql(
# use qualified table names
qualified_table_name = self.sql_client.make_qualified_ddl_table_name(table_name)
if is_iceberg and not generate_alter:
sql.append(
f"""CREATE TABLE {qualified_table_name}
sql.append(f"""CREATE TABLE {qualified_table_name}
({columns})
LOCATION '{location}'
TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');"""
)
TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');""")
elif not generate_alter:
sql.append(
f"""CREATE EXTERNAL TABLE {qualified_table_name}
sql.append(f"""CREATE EXTERNAL TABLE {qualified_table_name}
({columns})
STORED AS PARQUET
LOCATION '{location}';"""
)
LOCATION '{location}';""")
# alter table to add new columns at the end
else:
sql.append(f"""ALTER TABLE {qualified_table_name} ADD COLUMNS ({columns});""")
dlt/destinations/impl/bigquery/bigquery.py (6 additions, 4 deletions)
@@ -252,9 +252,9 @@ def _get_table_update_sql(
elif (c := partition_list[0])["data_type"] == "date":
sql[0] = f"{sql[0]}\nPARTITION BY {self.capabilities.escape_identifier(c['name'])}"
elif (c := partition_list[0])["data_type"] == "timestamp":
sql[
0
] = f"{sql[0]}\nPARTITION BY DATE({self.capabilities.escape_identifier(c['name'])})"
sql[0] = (
f"{sql[0]}\nPARTITION BY DATE({self.capabilities.escape_identifier(c['name'])})"
)
# Automatic partitioning of an INT64 type requires us to be prescriptive - we treat the column as a UNIX timestamp.
# This is due to the bounds requirement of GENERATE_ARRAY function for partitioning.
# The 10,000 partitions limit makes it infeasible to cover the entire `bigint` range.
@@ -272,7 +272,9 @@

def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str:
name = self.capabilities.escape_identifier(c["name"])
return f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}"
return (
f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}"
)

def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]:
schema_table: TTableSchemaColumns = {}
dlt/destinations/impl/databricks/databricks.py (18 additions, 10 deletions)
@@ -166,12 +166,14 @@ def __init__(
else:
raise LoadJobTerminalException(
file_path,
f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and azure buckets are supported",
f"Databricks cannot load data from staging bucket {bucket_path}. Only s3 and"
" azure buckets are supported",
)
else:
raise LoadJobTerminalException(
file_path,
"Cannot load from local file. Databricks does not support loading from local files. Configure staging with an s3 or azure storage bucket.",
"Cannot load from local file. Databricks does not support loading from local files."
" Configure staging with an s3 or azure storage bucket.",
)

# decide on source format, stage_file_path will either be a local file or a bucket path
@@ -181,27 +183,33 @@
if not config.get("data_writer.disable_compression"):
raise LoadJobTerminalException(
file_path,
"Databricks loader does not support gzip compressed JSON files. Please disable compression in the data writer configuration: https://dlthub.com/docs/reference/performance#disabling-and-enabling-file-compression",
"Databricks loader does not support gzip compressed JSON files. Please disable"
" compression in the data writer configuration:"
" https://dlthub.com/docs/reference/performance#disabling-and-enabling-file-compression",
)
if table_schema_has_type(table, "decimal"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load DECIMAL type columns from json files. Switch to parquet format to load decimals.",
"Databricks loader cannot load DECIMAL type columns from json files. Switch to"
" parquet format to load decimals.",
)
if table_schema_has_type(table, "binary"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load BINARY type columns from json files. Switch to parquet format to load byte values.",
"Databricks loader cannot load BINARY type columns from json files. Switch to"
" parquet format to load byte values.",
)
if table_schema_has_type(table, "complex"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load complex columns (lists and dicts) from json files. Switch to parquet format to load complex types.",
"Databricks loader cannot load complex columns (lists and dicts) from json"
" files. Switch to parquet format to load complex types.",
)
if table_schema_has_type(table, "date"):
raise LoadJobTerminalException(
file_path,
"Databricks loader cannot load DATE type columns from json files. Switch to parquet format to load dates.",
"Databricks loader cannot load DATE type columns from json files. Switch to"
" parquet format to load dates.",
)

source_format = "JSON"
@@ -311,7 +319,7 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non

def _get_storage_table_query_columns(self) -> List[str]:
fields = super()._get_storage_table_query_columns()
fields[
1
] = "full_data_type" # Override because this is the only way to get data type with precision
fields[1] = ( # Override because this is the only way to get data type with precision
"full_data_type"
)
return fields
dlt/destinations/impl/snowflake/snowflake.py (2 additions, 4 deletions)
@@ -175,15 +175,13 @@ def __init__(
f'PUT file://{file_path} @{stage_name}/"{load_id}" OVERWRITE = TRUE,'
" AUTO_COMPRESS = FALSE"
)
client.execute_sql(
f"""COPY INTO {qualified_table_name}
client.execute_sql(f"""COPY INTO {qualified_table_name}
{from_clause}
{files_clause}
{credentials_clause}
FILE_FORMAT = {source_format}
MATCH_BY_COLUMN_NAME='CASE_INSENSITIVE'
"""
)
""")
if stage_file_path and not keep_staged_files:
client.execute_sql(f"REMOVE {stage_file_path}")

dlt/destinations/job_client_impl.py (13 additions, 0 deletions)
@@ -35,6 +35,7 @@
)
from dlt.common.storages import FileStorage
from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables
from dlt.common.schema.utils import get_columns_names_with_prop, has_column_with_prop
from dlt.common.destination.reference import (
StateInfo,
StorageSchemaInfo,
@@ -588,3 +589,15 @@ def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool:
):
return True
return False

def _create_table_update(
self, table_name: str, storage_columns: TTableSchemaColumns
) -> Sequence[TColumnSchema]:
updates = super()._create_table_update(table_name, storage_columns)
table = self.schema.get_table(table_name)
if has_column_with_prop(table, "hard_delete"):
Collaborator:
This is a really good point, but my take would be to have identical schemas in the staging and destination datasets. Also: what about append and replace? This data won't be dropped from the parquet/json files, so just dropping it from the schema won't help.

I'd say let's remove it, along with all the code in the merge job that skips deleted columns.

Collaborator (Author):
Removed. Staging and destination datasets now have identical schemas.

# hard_delete column should only be present in staging table, not in final table
if not self.in_staging_mode:
hard_delete_column = get_columns_names_with_prop(table, "hard_delete")[0]
updates = [d for d in updates if d["name"] != hard_delete_column]
return updates
dlt/destinations/sql_jobs.py (54 additions, 9 deletions)
@@ -4,7 +4,7 @@
from dlt.common.runtime.logger import pretty_format_exception

from dlt.common.schema.typing import TTableSchema
from dlt.common.schema.utils import get_columns_names_with_prop
from dlt.common.schema.utils import get_columns_names_with_prop, has_column_with_prop
from dlt.common.storages.load_storage import ParsedLoadJobFileName
from dlt.common.utils import uniq_id
from dlt.destinations.exceptions import MergeDispositionException
@@ -147,6 +147,8 @@ def generate_sql(

First we store the root_keys of root table elements to be deleted in the temp table. Then we use the temp table to delete records from root and all child tables in the destination dataset.
At the end we copy the data from the staging dataset into destination dataset.

If sort and/or hard_delete column hints are provided, records are deleted from the staging dataset before its data is copied to the destination dataset.
"""
return cls.gen_merge_sql(table_chain, sql_client)

@@ -252,6 +254,8 @@ def gen_merge_sql(
) -> List[str]:
sql: List[str] = []
root_table = table_chain[0]
escape_identifier = sql_client.capabilities.escape_identifier
escape_literal = sql_client.capabilities.escape_literal

# get top level table full identifiers
root_table_name = sql_client.make_qualified_table_name(root_table["name"])
@@ -260,13 +264,13 @@
# get merge and primary keys from top level
primary_keys = list(
map(
sql_client.capabilities.escape_identifier,
escape_identifier,
get_columns_names_with_prop(root_table, "primary_key"),
)
)
merge_keys = list(
map(
sql_client.capabilities.escape_identifier,
escape_identifier,
get_columns_names_with_prop(root_table, "merge_key"),
)
)
@@ -298,7 +302,7 @@ def gen_merge_sql(
" it is not possible to link child tables to it.",
)
# get first unique column
unique_column = sql_client.capabilities.escape_identifier(unique_columns[0])
unique_column = escape_identifier(unique_columns[0])
# create temp table with unique identifier
create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql(
unique_column, key_table_clauses
@@ -319,7 +323,7 @@
f" {table['name']} so it is not possible to refer to top level table"
f" {root_table['name']} unique column {unique_column}",
)
root_key_column = sql_client.capabilities.escape_identifier(root_key_columns[0])
root_key_column = escape_identifier(root_key_columns[0])
sql.append(
cls.gen_delete_from_sql(
table_name, root_key_column, delete_temp_table_name, unique_column
@@ -333,6 +337,44 @@
)
)

# remove "non-latest" records from staging table (deduplicate) if a sort column is provided
if len(primary_keys) > 0:
if has_column_with_prop(root_table, "sort"):
Collaborator:
As you point out, this does deduplication on top of the dedup done when generating temp tables (or when inserting at the end when there are no child tables). My take: use the sort column in those clauses below if a sort column is present, otherwise ORDER BY (SELECT NULL).

Collaborator (Author):
Done.

sort_column = escape_identifier(get_columns_names_with_prop(root_table, "sort")[0])
sql.append(f"""
DELETE FROM {staging_root_table_name}
WHERE {sort_column} IN (
SELECT {sort_column} FROM (
SELECT {sort_column}, ROW_NUMBER() OVER (partition BY {", ".join(primary_keys)} ORDER BY {sort_column} DESC) AS _rn
FROM {staging_root_table_name}
) AS a
WHERE a._rn > 1
);
""")

# remove deleted records from staging tables if a hard_delete column is provided
if has_column_with_prop(root_table, "hard_delete"):
Collaborator:
I think (I hope) there's a simpler way to handle hard deletes. The code below does not need any modifications. It will delete all rows from the destination dataset (using primary and merge keys) that are present in the staging dataset. It does not matter whether the hard delete flag is set or not; we must delete those rows anyway.

We only need to change how we insert, from here:

# insert from staging to dataset, truncate staging table
        for table in table_chain:

The only thing you need to do is filter out rows that have the deleted flag set, so this is another clause in the WHERE.

Collaborator:
Overall it should be way less code, we do not interfere with any edge cases by deleting and deduplicating the staging dataset, and it looks like fewer row reads.

Collaborator (Author):
I have changed the approach to extending the WHERE clause in the insert stage, rather than deleting from the staging dataset. It didn't turn out to be less code, but it makes more sense nonetheless.

hard_delete_column = escape_identifier(
get_columns_names_with_prop(root_table, "hard_delete")[0]
)
# first delete from root staging table
sql.append(f"""
DELETE FROM {staging_root_table_name}
WHERE {hard_delete_column} IS NOT DISTINCT FROM {escape_literal(True)};
Collaborator:
OK, so you assume that the hard delete column is boolean, which probably makes the most sense, but then you must check the type somewhere. My take: delete if the value IS NOT NULL, or (only in the case of a boolean) when it is true, as above. Maybe someone wants to have the deleted flag as a timestamp?

Collaborator (Author):
Good point, I implemented your suggestion.

""")
# then delete from child staging tables
for table in table_chain[1:]:
with sql_client.with_staging_dataset(staging=True):
staging_table_name = sql_client.make_qualified_table_name(table["name"])
sql.append(f"""
DELETE FROM {staging_table_name}
WHERE NOT EXISTS (
SELECT 1 FROM {staging_root_table_name} AS p
WHERE {staging_table_name}.{root_key_column} = p.{unique_column}
);
""")

if len(table_chain) > 1:
# create temp table used to deduplicate, only when we have primary keys
if primary_keys:
(
@@ -343,15 +385,19 @@
)
sql.extend(create_insert_temp_table_sql)

# insert from staging to dataset, truncate staging table
# insert from staging to dataset
for table in table_chain:
table_name = sql_client.make_qualified_table_name(table["name"])
with sql_client.with_staging_dataset(staging=True):
staging_table_name = sql_client.make_qualified_table_name(table["name"])
columns = ", ".join(
map(
sql_client.capabilities.escape_identifier,
get_columns_names_with_prop(table, "name"),
escape_identifier,
[
c
for c in get_columns_names_with_prop(table, "name")
if c not in get_columns_names_with_prop(table, "hard_delete")
],
)
)
insert_sql = (
@@ -374,6 +420,5 @@
if insert_sql.strip()[-1] != ";":
insert_sql += ";"
sql.append(insert_sql)
# -- DELETE FROM {staging_table_name} WHERE 1=1;

return sql
dlt/extract/exceptions.py (0 additions, 7 deletions)
@@ -300,13 +300,6 @@ def __init__(self, resource_name: str, msg: str) -> None:
super().__init__(resource_name, f"This resource is not a transformer: {msg}")


class TableNameMissing(DltSourceException):
def __init__(self) -> None:
super().__init__(
"""Table name is missing in table template. Please provide a string or a function that takes a data item as an argument"""
)


class InconsistentTableTemplate(DltSourceException):
def __init__(self, reason: str) -> None:
msg = f"A set of table hints provided to the resource is inconsistent: {reason}"
dlt/load/load.py (9 additions, 2 deletions)
@@ -21,6 +21,7 @@
)
from dlt.common.schema import Schema, TSchemaTables
from dlt.common.schema.typing import TTableSchema, TWriteDisposition
from dlt.common.schema.utils import has_column_with_prop
from dlt.common.storages import LoadStorage
from dlt.common.destination.reference import (
DestinationClientDwhConfiguration,
@@ -246,8 +247,14 @@ def get_completed_table_chain(
for job in table_jobs
):
return None
# if there are no jobs for the table, skip it, unless the write disposition is replace, as we need to create and clear the child tables
if not table_jobs and top_merged_table["write_disposition"] != "replace":
# if there are no jobs for the table, skip it, unless child tables need to be replaced
needs_replacement = False
if top_merged_table["write_disposition"] == "replace" or (
Collaborator:
Why is this changed?

Collaborator (Author):
This is needed to propagate deletes to child tables. If we provide only a primary key and the hard_delete column for a nested table, such as happens on lines 584 and 599 of test_merge_disposition.py, the child tables wouldn't get included in the table chain, and those deletes would only be executed on the root table.

Collaborator:
I still do not get it. We have jobs for this table because in both those lines we declare some data. The exception for replace is only for the case where there is no data at all, which does not happen here. IMO you should try to remove it and find the problem elsewhere, or ping me on Slack to discuss it.

top_merged_table["write_disposition"] == "merge"
and has_column_with_prop(top_merged_table, "hard_delete")
):
needs_replacement = True
if not table_jobs and not needs_replacement:
continue
table_chain.append(table)
# there must be at least table
dlt/pipeline/pipeline.py (3 additions, 3 deletions)
@@ -1163,9 +1163,9 @@ def _set_context(self, is_active: bool) -> None:
# set destination context on activation
if self.destination:
# inject capabilities context
self._container[
DestinationCapabilitiesContext
] = self._get_destination_capabilities()
self._container[DestinationCapabilitiesContext] = (
self._get_destination_capabilities()
)
else:
# remove destination context on deactivation
if DestinationCapabilitiesContext in self._container: