Extend custom destination #1107
@@ -1,14 +1,17 @@
from typing import Optional
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.data_writers import TLoaderFileFormat


def capabilities(
    preferred_loader_file_format: TLoaderFileFormat = "puae-jsonl",
    naming_convention: str = "direct",
    max_table_nesting: Optional[int] = 0,
) -> DestinationCapabilitiesContext:
    caps = DestinationCapabilitiesContext.generic_capabilities(preferred_loader_file_format)
    caps.supported_loader_file_formats = ["puae-jsonl", "parquet"]
    caps.supports_ddl_transactions = False
    caps.supports_transactions = False
    caps.naming_convention = naming_convention
    caps.max_table_nesting = max_table_nesting
    return caps
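As a quick sanity check (not part of this diff), the factory above can be called directly to see what a custom destination would advertise; the argument values here are illustrative only:

# Illustrative only: exercise the capabilities() factory shown above.
caps = capabilities(
    preferred_loader_file_format="parquet",
    naming_convention="direct",
    max_table_nesting=0,
)
assert "parquet" in caps.supported_loader_file_formats
assert caps.naming_convention == "direct"
assert caps.max_table_nesting == 0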
@@ -1,7 +1,9 @@
from abc import ABC, abstractmethod
from types import TracebackType
-from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict
+from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict, List
from copy import deepcopy

from dlt.common.destination.reference import LoadJob
from dlt.destinations.job_impl import EmptyLoadJob
from dlt.common.typing import TDataItems, AnyFun
from dlt.common import json
@@ -18,6 +20,7 @@
from dlt.common.destination.reference import (
    TLoadJobState,
    LoadJob,
+    DoNothingJob,
    JobClientBase,
)
@@ -27,6 +30,8 @@
    TDestinationCallable,
)

+INTERNAL_MARKER = "_dlt"
Review comment: you must use schema._dlt_tables_prefix (which may be normalized) to detect dlt identifiers. You may add such a method to Schema (but calling a method will be slower).
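A minimal sketch of that suggestion, assuming the Schema object exposes the _dlt_tables_prefix attribute the comment refers to; the helper name is hypothetical and not part of this PR:

# Hypothetical helper: detect dlt-internal identifiers via the schema's
# (possibly normalized) prefix instead of a hard-coded "_dlt" marker.
def is_dlt_identifier(schema: Schema, identifier: str) -> bool:
    return identifier.startswith(schema._dlt_tables_prefix)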
class DestinationLoadJob(LoadJob, ABC):
    def __init__(
@@ -37,6 +42,7 @@ def __init__(
        schema: Schema,
        destination_state: Dict[str, int],
        destination_callable: TDestinationCallable,
+        skipped_columns: List[str],
    ) -> None:
        super().__init__(FileStorage.get_file_name_from_file_path(file_path))
        self._file_path = file_path
@@ -47,6 +53,7 @@ def __init__(
        self._callable = destination_callable
        self._state: TLoadJobState = "running"
        self._storage_id = f"{self._parsed_file_name.table_name}.{self._parsed_file_name.file_id}"
+        self.skipped_columns = skipped_columns
        try:
            if self._config.batch_size == 0:
                # on batch size zero we only call the callable with the filename
@@ -93,9 +100,14 @@ def run(self, start_index: int) -> Iterable[TDataItems]:
            start_index % self._config.batch_size
        ) == 0, "Batch size was changed during processing of one load package"

+        # on record batches we cannot drop columns, we need to
+        # select the ones we want to keep
+        keep_columns = list(self._table["columns"].keys())
        start_batch = start_index / self._config.batch_size
        with pyarrow.parquet.ParquetFile(self._file_path) as reader:
-            for record_batch in reader.iter_batches(batch_size=self._config.batch_size):
+            for record_batch in reader.iter_batches(
+                batch_size=self._config.batch_size, columns=keep_columns
+            ):
                if start_batch > 0:
                    start_batch -= 1
                    continue
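A self-contained illustration (separate from this diff) of the column selection above: a parquet record batch cannot have columns popped per item the way a JSON row can, so the columns to keep are passed to iter_batches at read time. The file name and column names are made up:

import pyarrow as pa
import pyarrow.parquet as pq

# Write a tiny parquet file with one dlt-internal column (illustrative data).
pq.write_table(
    pa.table({"id": [1, 2, 3], "_dlt_load_id": ["a", "a", "a"]}),
    "example.parquet",
)

keep_columns = ["id"]  # everything except the internal columns
with pq.ParquetFile("example.parquet") as reader:
    for batch in reader.iter_batches(batch_size=2, columns=keep_columns):
        print(batch.to_pylist())  # [{'id': 1}, {'id': 2}], then [{'id': 3}]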
@@ -115,6 +127,9 @@ def run(self, start_index: int) -> Iterable[TDataItems]:
                if start_index > 0:
                    start_index -= 1
                    continue
+                # skip internal columns
+                for column in self.skipped_columns:
+                    item.pop(column, None)
                current_batch.append(item)
                if len(current_batch) == self._config.batch_size:
                    yield current_batch
@@ -150,6 +165,17 @@ def update_stored_schema(
        return super().update_stored_schema(only_tables, expected_update)

    def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
+        # skip internal tables and remove columns from schema if so configured
+        skipped_columns: List[str] = []
+        if self.config.skip_dlt_columns_and_tables:
+            if table["name"].startswith(INTERNAL_MARKER):
+                return DoNothingJob(file_path)
+            table = deepcopy(table)
+            for column in list(table["columns"].keys()):
+                if column.startswith(INTERNAL_MARKER):
+                    table["columns"].pop(column)
+                    skipped_columns.append(column)
+
        # save our state in destination name scope
        load_state = destination_state()
        if file_path.endswith("parquet"):
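The same filtering pattern shown standalone with a made-up table schema dict, to highlight why deepcopy is needed: the stored schema must not be mutated, and the removed column names are recorded so the load jobs can drop them from data items as well:

from copy import deepcopy

table = {"name": "users", "columns": {"id": {}, "_dlt_id": {}, "_dlt_load_id": {}}}

skipped_columns = []
filtered = deepcopy(table)  # leave the original schema entry untouched
for column in list(filtered["columns"].keys()):
    if column.startswith("_dlt"):
        filtered["columns"].pop(column)
        skipped_columns.append(column)

print(list(filtered["columns"]))  # ['id']
print(skipped_columns)            # ['_dlt_id', '_dlt_load_id']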
@@ -160,6 +186,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
                self.schema,
                load_state,
                self.destination_callable,
+                skipped_columns,
            )
        if file_path.endswith("jsonl"):
            return DestinationJsonlLoadJob(
@@ -169,6 +196,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
                self.schema,
                load_state,
                self.destination_callable,
+                skipped_columns,
            )
        return None
@@ -456,7 +456,18 @@ def normalize(
            return None

        # make sure destination capabilities are available
-        self._get_destination_capabilities()
+        caps = self._get_destination_capabilities()
Review comment: @rudolfix I need some guidance on where to inject / overwrite the max_nesting_level coming from a destination. I realize this place is very likely not the right one, but I am not sure where and how to do it. Should I get the capabilities context in the RelationalNormalizer and not persist this setting to the schema at all, or what is the best way?

Review comment (reply): the only thing you need to do is to fix
detect the type of the json normalizer and apply the settings to it like the below. You can override existing settings. I think capabilities (if not None) should have precedence over the source settings. What happens later:

Review comment (reply): since this is set on the nested json normalizer settings I had to change a bit more, but not much. I hope it is ok to change the type from Mapping to Dict in there.
+        if caps.max_table_nesting is not None:
+            # destination settings override normalizer settings in schema
+            from dlt.common.normalizers.json.relational import (
+                DataItemNormalizer as RelationalNormalizer,
+            )
+
+            RelationalNormalizer.update_normalizer_config(
+                self.default_schema, {"max_nesting": caps.max_table_nesting}
+            )
+            self._schema_storage.save_schema(self.default_schema)

        # create default normalize config
        normalize_config = NormalizeConfiguration(
            workers=workers,
Review comment: good! Please add this to our docs: the default settings are such that data comes to the sink without changed identifiers, un-nested, and with dlt identifiers removed, which makes it a good fit for pushing data to queues and REST APIs.
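A hedged usage sketch of what that documentation paragraph describes. The @dlt.destination decorator and its (items, table) callable interface exist in dlt; the naming_convention, max_table_nesting and skip_dlt_columns_and_tables parameters are assumed from this PR's feature set rather than confirmed API:

import dlt

@dlt.destination(
    batch_size=100,
    naming_convention="direct",        # keep identifiers as they arrive (assumed parameter)
    max_table_nesting=0,               # do not split nested data into child tables (assumed parameter)
    skip_dlt_columns_and_tables=True,  # drop _dlt tables and columns (assumed parameter)
)
def queue_sink(items, table) -> None:
    # `items` is a batch of plain data rows, `table` the schema they belong to;
    # a real sink would push them to a queue or a REST API here.
    print(f"sending {len(items)} rows to {table['name']}")

Running a pipeline with destination=queue_sink would then deliver each batch of normalized rows to this function.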