Extend custom destination #1107
@@ -1,14 +1,17 @@
from typing import Optional
from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.data_writers import TLoaderFileFormat


def capabilities(
    preferred_loader_file_format: TLoaderFileFormat = "puae-jsonl",
    naming_convention: str = "direct",
    max_table_nesting: Optional[int] = 0,
) -> DestinationCapabilitiesContext:
    caps = DestinationCapabilitiesContext.generic_capabilities(preferred_loader_file_format)
    caps.supported_loader_file_formats = ["puae-jsonl", "parquet"]
    caps.supports_ddl_transactions = False
    caps.supports_transactions = False
    caps.naming_convention = naming_convention
    caps.max_table_nesting = max_table_nesting
    return caps
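As a quick sanity check (not part of this diff), the factory above can be called directly to see what a custom destination would advertise; the argument values here are illustrative only:

# Illustrative only: exercise the capabilities() factory shown above.
caps = capabilities(
    preferred_loader_file_format="parquet",
    naming_convention="direct",
    max_table_nesting=0,
)
assert "parquet" in caps.supported_loader_file_formats
assert caps.naming_convention == "direct"
assert caps.max_table_nesting == 0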
@@ -1,7 +1,9 @@
from abc import ABC, abstractmethod
from types import TracebackType
-from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict
+from typing import ClassVar, Dict, Optional, Type, Iterable, Iterable, cast, Dict, List
from copy import deepcopy

from dlt.common.destination.reference import LoadJob
from dlt.destinations.job_impl import EmptyLoadJob
from dlt.common.typing import TDataItems, AnyFun
from dlt.common import json
@@ -18,6 +20,7 @@
from dlt.common.destination.reference import (
    TLoadJobState,
    LoadJob,
+    DoNothingJob,
    JobClientBase,
)
@@ -27,6 +30,8 @@
    TDestinationCallable,
)

+INTERNAL_MARKER = "_dlt"
Review comment: you must use schema._dlt_tables_prefix (which may be normalized) to detect dlt identifiers. You may add such a method to Schema (but calling a method will be slower).
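A minimal sketch of that suggestion, assuming the Schema object exposes the _dlt_tables_prefix attribute the comment refers to; the helper name is hypothetical and not part of this PR:

# Hypothetical helper: detect dlt-internal identifiers via the schema's
# (possibly normalized) prefix instead of a hard-coded "_dlt" marker.
def is_dlt_identifier(schema: Schema, identifier: str) -> bool:
    return identifier.startswith(schema._dlt_tables_prefix)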
class DestinationLoadJob(LoadJob, ABC):
    def __init__(
@@ -37,6 +42,7 @@ def __init__(
        schema: Schema,
        destination_state: Dict[str, int],
        destination_callable: TDestinationCallable,
+        skipped_columns: List[str],
    ) -> None:
        super().__init__(FileStorage.get_file_name_from_file_path(file_path))
        self._file_path = file_path
@@ -47,6 +53,7 @@ def __init__(
        self._callable = destination_callable
        self._state: TLoadJobState = "running"
        self._storage_id = f"{self._parsed_file_name.table_name}.{self._parsed_file_name.file_id}"
+        self.skipped_columns = skipped_columns
        try:
            if self._config.batch_size == 0:
                # on batch size zero we only call the callable with the filename
@@ -93,9 +100,14 @@ def run(self, start_index: int) -> Iterable[TDataItems]:
            start_index % self._config.batch_size
        ) == 0, "Batch size was changed during processing of one load package"

+        # on record batches we cannot drop columns, we need to
+        # select the ones we want to keep
+        keep_columns = list(self._table["columns"].keys())
        start_batch = start_index / self._config.batch_size
        with pyarrow.parquet.ParquetFile(self._file_path) as reader:
-            for record_batch in reader.iter_batches(batch_size=self._config.batch_size):
+            for record_batch in reader.iter_batches(
+                batch_size=self._config.batch_size, columns=keep_columns
+            ):
                if start_batch > 0:
                    start_batch -= 1
                    continue
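A self-contained illustration (separate from this diff) of the column selection above: a parquet record batch cannot have columns popped per item the way a JSON row can, so the columns to keep are passed to iter_batches at read time. The file name and column names are made up:

import pyarrow as pa
import pyarrow.parquet as pq

# Write a tiny parquet file with one dlt-internal column (illustrative data).
pq.write_table(
    pa.table({"id": [1, 2, 3], "_dlt_load_id": ["a", "a", "a"]}),
    "example.parquet",
)

keep_columns = ["id"]  # everything except the internal columns
with pq.ParquetFile("example.parquet") as reader:
    for batch in reader.iter_batches(batch_size=2, columns=keep_columns):
        print(batch.to_pylist())  # [{'id': 1}, {'id': 2}], then [{'id': 3}]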
@@ -115,6 +127,9 @@ def run(self, start_index: int) -> Iterable[TDataItems]:
                if start_index > 0:
                    start_index -= 1
                    continue
+                # skip internal columns
+                for column in self.skipped_columns:
+                    item.pop(column, None)
                current_batch.append(item)
                if len(current_batch) == self._config.batch_size:
                    yield current_batch
@@ -150,6 +165,17 @@ def update_stored_schema(
        return super().update_stored_schema(only_tables, expected_update)

    def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
+        # skip internal tables and remove columns from schema if so configured
+        skipped_columns: List[str] = []
+        if self.config.skip_dlt_columns_and_tables:
+            if table["name"].startswith(INTERNAL_MARKER):
+                return DoNothingJob(file_path)
+            table = deepcopy(table)
+            for column in list(table["columns"].keys()):
+                if column.startswith(INTERNAL_MARKER):
+                    table["columns"].pop(column)
+                    skipped_columns.append(column)
+
        # save our state in destination name scope
        load_state = destination_state()
        if file_path.endswith("parquet"):
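The same filtering pattern shown standalone with a made-up table schema dict, to highlight why deepcopy is needed: the stored schema must not be mutated, and the removed column names are recorded so the load jobs can drop them from data items as well:

from copy import deepcopy

table = {"name": "users", "columns": {"id": {}, "_dlt_id": {}, "_dlt_load_id": {}}}

skipped_columns = []
filtered = deepcopy(table)  # leave the original schema entry untouched
for column in list(filtered["columns"].keys()):
    if column.startswith("_dlt"):
        filtered["columns"].pop(column)
        skipped_columns.append(column)

print(list(filtered["columns"]))  # ['id']
print(skipped_columns)            # ['_dlt_id', '_dlt_load_id']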
@@ -160,6 +186,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
                self.schema,
                load_state,
                self.destination_callable,
+                skipped_columns,
            )
        if file_path.endswith("jsonl"):
            return DestinationJsonlLoadJob(
@@ -169,6 +196,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob:
                self.schema,
                load_state,
                self.destination_callable,
+                skipped_columns,
            )
        return None
@@ -456,7 +456,18 @@ def normalize(
            return None

        # make sure destination capabilities are available
-        self._get_destination_capabilities()
+        caps = self._get_destination_capabilities()
Review comment: @rudolfix I need some guidance on where to inject / overwrite the max_nesting_level coming from a destination. I realize this place is very likely not the right one, but I am not sure where and how to do it. Should I get the capabilities context in the RelationalNormalizer and not persist this setting to the schema at all, or what is the best way?

Review comment (reply): the only thing you need to do is to fix
detect the type of the json normalizer and apply the settings to it like the below. You can override existing settings. I think capabilities (if not None) should have precedence over the source settings. What happens later:

Review comment (reply): since this is set on the nested json normalizer settings I had to change a bit more, but not much. I hope it is ok to change the type from Mapping to Dict in there.
+        if caps.max_table_nesting is not None:
+            # destination settings override normalizer settings in schema
+            from dlt.common.normalizers.json.relational import (
+                DataItemNormalizer as RelationalNormalizer,
+            )
+
+            RelationalNormalizer.update_normalizer_config(
+                self.default_schema, {"max_nesting": caps.max_table_nesting}
+            )
+            self._schema_storage.save_schema(self.default_schema)

        # create default normalize config
        normalize_config = NormalizeConfiguration(
            workers=workers,
Review comment: good! Please add this to our docs: the default settings are such that data comes to the sink without changed identifiers, un-nested, and with dlt identifiers removed, which makes it a good fit for pushing data to queues and REST APIs.
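A hedged usage sketch of what that documentation paragraph describes. The @dlt.destination decorator and its (items, table) callable interface exist in dlt; the naming_convention, max_table_nesting and skip_dlt_columns_and_tables parameters are assumed from this PR's feature set rather than confirmed API:

import dlt

@dlt.destination(
    batch_size=100,
    naming_convention="direct",        # keep identifiers as they arrive (assumed parameter)
    max_table_nesting=0,               # do not split nested data into child tables (assumed parameter)
    skip_dlt_columns_and_tables=True,  # drop _dlt tables and columns (assumed parameter)
)
def queue_sink(items, table) -> None:
    # `items` is a batch of plain data rows, `table` the schema they belong to;
    # a real sink would push them to a queue or a REST API here.
    print(f"sending {len(items)} rows to {table['name']}")

Running a pipeline with destination=queue_sink would then deliver each batch of normalized rows to this function.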