From 6602f70dd63703b9953d43edb9aca1a719a8f7a3 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 10 Dec 2024 21:41:17 +0100 Subject: [PATCH 01/12] checks notebook presence before finding userdata (#2117) --- dlt/common/configuration/providers/toml.py | 6 ++++++ .../configuration/test_toml_provider.py | 21 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py index 3636565fae..e586fef225 100644 --- a/dlt/common/configuration/providers/toml.py +++ b/dlt/common/configuration/providers/toml.py @@ -124,6 +124,12 @@ def _read_google_colab_secrets(self, name: str, file_name: str) -> tomlkit.TOMLD """Try to load the toml from google colab userdata object""" try: from google.colab import userdata + from dlt.common.runtime.exec_info import is_notebook + + # make sure we work in interactive mode (get_ipython() is available) + # when dlt cli is run, userdata is available but without a kernel + if not is_notebook(): + return None try: return tomlkit.loads(userdata.get(file_name)) diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index 481c21b7bb..9538849976 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -4,6 +4,7 @@ import yaml from typing import Any, Dict, Type import datetime # noqa: I251 +from unittest.mock import Mock import dlt from dlt.common import pendulum, json @@ -538,11 +539,28 @@ def loader() -> Dict[str, Any]: def test_colab_toml() -> None: + import builtins + # use a path without any settings files try: sys.path.append("tests/common/cases/modules") - # secrets are in user data + + # ipython not present provider: SettingsTomlProvider = SecretsTomlProvider("tests/common/null", global_dir=None) + assert provider.is_empty + + get_ipython_m = Mock() + get_ipython_m.return_value = "google.colab.Shell" + # make it available 
to all modules + builtins.get_ipython = get_ipython_m # type: ignore[attr-defined] + # test mock + assert get_ipython() == "google.colab.Shell" # type: ignore[name-defined] # noqa + from dlt.common.runtime.exec_info import is_notebook + + assert is_notebook() + + # secrets are in user data + provider = SecretsTomlProvider("tests/common/null", global_dir=None) assert provider.to_toml() == 'api_key="api"' # config is not in userdata provider = ConfigTomlProvider("tests/common/null", "unknown") @@ -551,4 +569,5 @@ def test_colab_toml() -> None: provider = SecretsTomlProvider("tests/common/cases/configuration/.dlt", global_dir=None) assert provider.get_value("secret_value", str, None) == ("2137", "secret_value") finally: + delattr(builtins, "get_ipython") sys.path.pop() From 51b11d24acf579d4f12abc15f2b661778f2995d9 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 10 Dec 2024 17:35:22 -0500 Subject: [PATCH 02/12] Add open/closed range arguments for incremental (#1991) * Add open/closed range arguments for incremental * Docs for incremental range args * Docstring * Typo * Ensure deduplication is disabled when range_start=='open' * Cache transformer settings --- dlt/common/incremental/typing.py | 4 + dlt/extract/incremental/__init__.py | 60 +++-- dlt/extract/incremental/transform.py | 75 ++++-- dlt/sources/sql_database/helpers.py | 12 +- .../verified-sources/sql_database/advanced.md | 49 +++- .../docs/general-usage/incremental-loading.md | 5 +- tests/extract/test_incremental.py | 111 +++++++- .../load/sources/sql_database/test_helpers.py | 237 ++++++++++++------ .../sql_database/test_sql_database_source.py | 17 +- 9 files changed, 434 insertions(+), 136 deletions(-) diff --git a/dlt/common/incremental/typing.py b/dlt/common/incremental/typing.py index 460e2f234b..2ca981bff0 100644 --- a/dlt/common/incremental/typing.py +++ b/dlt/common/incremental/typing.py @@ -8,6 +8,8 @@ LastValueFunc = Callable[[Sequence[TCursorValue]], Any] OnCursorValueMissing = 
Literal["raise", "include", "exclude"] +TIncrementalRange = Literal["open", "closed"] + class IncrementalColumnState(TypedDict): initial_value: Optional[Any] @@ -26,3 +28,5 @@ class IncrementalArgs(TypedDict, total=False): allow_external_schedulers: Optional[bool] lag: Optional[Union[float, int]] on_cursor_value_missing: Optional[OnCursorValueMissing] + range_start: Optional[TIncrementalRange] + range_end: Optional[TIncrementalRange] diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 28d33bb71f..5e7bae49c6 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -42,6 +42,7 @@ LastValueFunc, OnCursorValueMissing, IncrementalArgs, + TIncrementalRange, ) from dlt.extract.items import SupportsPipe, TTableHintTemplate, ItemTransform from dlt.extract.incremental.transform import ( @@ -104,6 +105,11 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded on_cursor_value_missing: Specify what happens when the cursor_path does not exist in a record or a record has `None` at the cursor_path: raise, include, exclude lag: Optional value used to define a lag or attribution window. For datetime cursors, this is interpreted as seconds. For other types, it uses the + or - operator depending on the last_value_func. + range_start: Decide whether the incremental filtering range is `open` or `closed` on the start value side. Default is `closed`. + Setting this to `open` means that items with the same cursor value as the last value from the previous run (or `initial_value`) are excluded from the result. + The `open` range disables deduplication logic so it can serve as an optimization when you know cursors don't overlap between pipeline runs. 
+ range_end: Decide whether the incremental filtering range is `open` or `closed` on the end value side. Default is `open` (exact `end_value` is excluded). + Setting this to `closed` means that items with the exact same cursor value as the `end_value` are included in the result. """ # this is config/dataclass so declare members @@ -116,6 +122,8 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa on_cursor_value_missing: OnCursorValueMissing = "raise" lag: Optional[float] = None duplicate_cursor_warning_threshold: ClassVar[int] = 200 + range_start: TIncrementalRange = "closed" + range_end: TIncrementalRange = "open" # incremental acting as empty EMPTY: ClassVar["Incremental[Any]"] = None @@ -132,6 +140,8 @@ def __init__( allow_external_schedulers: bool = False, on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: # make sure that path is valid if cursor_path: @@ -174,9 +184,11 @@ def __init__( self.start_out_of_range: bool = False """Becomes true on the first item that is out of range of `start_value`. I.e. 
when using `max` this is a value that is lower than `start_value`""" - self._transformers: Dict[str, IncrementalTransform] = {} + self._transformers: Dict[Type[IncrementalTransform], IncrementalTransform] = {} self._bound_pipe: SupportsPipe = None """Bound pipe""" + self.range_start = range_start + self.range_end = range_end @property def primary_key(self) -> Optional[TTableHintTemplate[TColumnNames]]: @@ -190,22 +202,6 @@ def primary_key(self, value: str) -> None: for transform in self._transformers.values(): transform.primary_key = value - def _make_transforms(self) -> None: - types = [("arrow", ArrowIncremental), ("json", JsonIncremental)] - for dt, kls in types: - self._transformers[dt] = kls( - self.resource_name, - self.cursor_path, - self.initial_value, - self.start_value, - self.end_value, - self.last_value_func, - self._primary_key, - set(self._cached_state["unique_hashes"]), - self.on_cursor_value_missing, - self.lag, - ) - @classmethod def from_existing_state( cls, resource_name: str, cursor_path: str @@ -489,7 +485,8 @@ def bind(self, pipe: SupportsPipe) -> "Incremental[TCursorValue]": ) # cache state self._cached_state = self.get_state() - self._make_transforms() + # Clear transforms so we get new instances + self._transformers.clear() return self def can_close(self) -> bool: @@ -520,15 +517,34 @@ def __str__(self) -> str: f" {self.last_value_func}" ) + def _make_or_get_transformer(self, cls: Type[IncrementalTransform]) -> IncrementalTransform: + if transformer := self._transformers.get(cls): + return transformer + transformer = self._transformers[cls] = cls( + self.resource_name, + self.cursor_path, + self.initial_value, + self.start_value, + self.end_value, + self.last_value_func, + self._primary_key, + set(self._cached_state["unique_hashes"]), + self.on_cursor_value_missing, + self.lag, + self.range_start, + self.range_end, + ) + return transformer + def _get_transformer(self, items: TDataItems) -> IncrementalTransform: # Assume list is all of the 
same type for item in items if isinstance(items, list) else [items]: if is_arrow_item(item): - return self._transformers["arrow"] + return self._make_or_get_transformer(ArrowIncremental) elif pandas is not None and isinstance(item, pandas.DataFrame): - return self._transformers["arrow"] - return self._transformers["json"] - return self._transformers["json"] + return self._make_or_get_transformer(ArrowIncremental) + return self._make_or_get_transformer(JsonIncremental) + return self._make_or_get_transformer(JsonIncremental) def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: if rows is None: diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 22b1194b51..1d213e26c2 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -13,7 +13,12 @@ IncrementalPrimaryKeyMissing, IncrementalCursorPathHasValueNone, ) -from dlt.common.incremental.typing import TCursorValue, LastValueFunc, OnCursorValueMissing +from dlt.common.incremental.typing import ( + TCursorValue, + LastValueFunc, + OnCursorValueMissing, + TIncrementalRange, +) from dlt.extract.utils import resolve_column_value from dlt.extract.items import TTableHintTemplate @@ -57,6 +62,8 @@ def __init__( unique_hashes: Set[str], on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: self.resource_name = resource_name self.cursor_path = cursor_path @@ -71,6 +78,9 @@ def __init__( self.start_unique_hashes = set(unique_hashes) self.on_cursor_value_missing = on_cursor_value_missing self.lag = lag + self.range_start = range_start + self.range_end = range_end + # compile jsonpath self._compiled_cursor_path = compile_path(cursor_path) # for simple column name we'll fallback to search in dict @@ -107,6 +117,8 @@ def __call__( def deduplication_disabled(self) -> bool: """Skip deduplication when 
length of the key is 0 or if lag is applied.""" # disable deduplication if end value is set - state is not saved + if self.range_start == "open": + return True if self.end_value is not None: return True # disable deduplication if lag is applied - destination must deduplicate ranges @@ -191,10 +203,10 @@ def __call__( # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value if self.end_value is not None: try: - if ( - last_value_func((row_value, self.end_value)) != self.end_value - or last_value_func((row_value,)) == self.end_value - ): + if last_value_func((row_value, self.end_value)) != self.end_value: + return None, False, True + + if self.range_end == "open" and last_value_func((row_value,)) == self.end_value: return None, False, True except Exception as ex: raise IncrementalCursorInvalidCoercion( @@ -221,6 +233,9 @@ def __call__( ) from ex # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: + if self.range_start == "open": + # We only want greater than last_value + return None, False, False # use func to compute row_value into last_value compatible processed_row_value = last_value_func((row_value,)) # skip the record that is not a start_value or new_value: that record was already processed @@ -258,6 +273,31 @@ def __call__( class ArrowIncremental(IncrementalTransform): _dlt_index = "_dlt_index" + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + if self.last_value_func is max: + self.compute = pa.compute.max + self.end_compare = ( + pa.compute.less if self.range_end == "open" else pa.compute.less_equal + ) + self.last_value_compare = ( + pa.compute.greater_equal if self.range_start == "closed" else pa.compute.greater + ) + self.new_value_compare = pa.compute.greater + elif self.last_value_func is min: + self.compute = pa.compute.min + self.end_compare = ( + pa.compute.greater if self.range_end == "open" else pa.compute.greater_equal + ) + 
self.last_value_compare = ( + pa.compute.less_equal if self.range_start == "closed" else pa.compute.less + ) + self.new_value_compare = pa.compute.less + else: + raise NotImplementedError( + "Only min or max last_value_func is supported for arrow tables" + ) + def compute_unique_values(self, item: "TAnyArrowItem", unique_columns: List[str]) -> List[str]: if not unique_columns: return [] @@ -312,28 +352,13 @@ def __call__( if not tbl: # row is None or empty arrow table return tbl, start_out_of_range, end_out_of_range - if self.last_value_func is max: - compute = pa.compute.max - end_compare = pa.compute.less - last_value_compare = pa.compute.greater_equal - new_value_compare = pa.compute.greater - elif self.last_value_func is min: - compute = pa.compute.min - end_compare = pa.compute.greater - last_value_compare = pa.compute.less_equal - new_value_compare = pa.compute.less - else: - raise NotImplementedError( - "Only min or max last_value_func is supported for arrow tables" - ) - # TODO: Json path support. For now assume the cursor_path is a column name cursor_path = self.cursor_path # The new max/min value try: # NOTE: datetimes are always pendulum in UTC - row_value = from_arrow_scalar(compute(tbl[cursor_path])) + row_value = from_arrow_scalar(self.compute(tbl[cursor_path])) cursor_data_type = tbl.schema.field(cursor_path).type row_value_scalar = to_arrow_scalar(row_value, cursor_data_type) except KeyError as e: @@ -364,10 +389,10 @@ def __call__( cursor_data_type, str(ex), ) from ex - tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar)) + tbl = tbl.filter(self.end_compare(tbl[cursor_path], end_value_scalar)) # Is max row value higher than end value? # NOTE: pyarrow bool *always* evaluates to python True. 
`as_py()` is necessary - end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() + end_out_of_range = not self.end_compare(row_value_scalar, end_value_scalar).as_py() if self.start_value is not None: try: @@ -383,7 +408,7 @@ def __call__( str(ex), ) from ex # Remove rows lower or equal than the last start value - keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) + keep_filter = self.last_value_compare(tbl[cursor_path], start_value_scalar) start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) tbl = tbl.filter(keep_filter) if not self.deduplication_disabled: @@ -407,7 +432,7 @@ def __call__( if ( self.last_value is None - or new_value_compare( + or self.new_value_compare( row_value_scalar, to_arrow_scalar(self.last_value, cursor_data_type) ).as_py() ): # Last value has changed diff --git a/dlt/sources/sql_database/helpers.py b/dlt/sources/sql_database/helpers.py index a8be2a6427..ee38c7dd98 100644 --- a/dlt/sources/sql_database/helpers.py +++ b/dlt/sources/sql_database/helpers.py @@ -94,12 +94,16 @@ def __init__( self.end_value = incremental.end_value self.row_order: TSortOrder = self.incremental.row_order self.on_cursor_value_missing = self.incremental.on_cursor_value_missing + self.range_start = self.incremental.range_start + self.range_end = self.incremental.range_end else: self.cursor_column = None self.last_value = None self.end_value = None self.row_order = None self.on_cursor_value_missing = None + self.range_start = None + self.range_end = None def _make_query(self) -> SelectAny: table = self.table @@ -110,11 +114,11 @@ def _make_query(self) -> SelectAny: # generate where if last_value_func is max: # Query ordered and filtered according to last_value function - filter_op = operator.ge - filter_op_end = operator.lt + filter_op = operator.ge if self.range_start == "closed" else operator.gt + filter_op_end = operator.lt if self.range_end == "open" else operator.le elif last_value_func is 
min: - filter_op = operator.le - filter_op_end = operator.gt + filter_op = operator.le if self.range_start == "closed" else operator.lt + filter_op_end = operator.gt if self.range_end == "open" else operator.ge else: # Custom last_value, load everything and let incremental handle filtering return query # type: ignore[no-any-return] diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md index 6ff3a267d2..c532f6d357 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md @@ -16,7 +16,7 @@ Efficient data management often requires loading only new or updated data from y Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](../../../walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`. -#### How to configure +### How to configure 1. **Choose a cursor column**: Identify a column in your SQL table that can serve as a reliable indicator of new or updated rows. Common choices include timestamp columns or auto-incrementing IDs. 1. **Set an initial value**: Choose a starting value for the cursor to begin loading data. This could be a specific timestamp or ID from which you wish to start loading data. 1. **Deduplication**: When using incremental loading, the system automatically handles the deduplication of rows based on the primary key (if available) or row hash for tables without a primary key. @@ -27,7 +27,7 @@ Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing I If your cursor column name contains special characters (e.g., `$`) you need to escape it when passing it to the `incremental` function. 
For example, if your cursor column is `example_$column`, you should pass it as `"'example_$column'"` or `'"example_$column"'` to the `incremental` function: `incremental("'example_$column'", initial_value=...)`. ::: -#### Examples +### Examples 1. **Incremental loading with the resource `sql_table`**. @@ -52,7 +52,7 @@ If your cursor column name contains special characters (e.g., `$`) you need to e print(extract_info) ``` - Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). + Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater or equal to the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). In subsequent runs, it is the latest value of `last_modified` that `dlt` stores in [state](../../../general-usage/state). 2. **Incremental loading with the source `sql_database`**. @@ -78,6 +78,49 @@ If your cursor column name contains special characters (e.g., `$`) you need to e * `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appended, or replaced resources. ::: +### Inclusive and exclusive filtering + +By default the incremental filtering is inclusive on the start value side so that +rows with cursor equal to the last run's cursor are fetched again from the database. + +The SQL query generated looks something like this (assuming `last_value_func` is `max`): + +```sql +SELECT * FROM family +WHERE last_modified >= :start_value +ORDER BY last_modified ASC +``` + +That means some rows overlapping with the previous load are fetched from the database. 
+Duplicates are then filtered out by dlt using either the primary key or a hash of the row's contents. + +This ensures there are no gaps in the extracted sequence. But it does come with some performance overhead, +both due to the deduplication processing and the cost of fetching redundant records from the database. + +This is not always needed. If you know that your data does not contain overlapping cursor values then you +can optimize extraction by passing `range_start="open"` to incremental. + +This both disables the deduplication process and changes the operator used in the SQL `WHERE` clause from `>=` (greater-or-equal) to `>` (greater than), so that no overlapping rows are fetched. + +E.g. + +```py +table = sql_table( + table='family', + incremental=dlt.sources.incremental( + 'last_modified', # Cursor column name + initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0), # Initial cursor value + range_start="open", # exclude the start value + ) +) +``` + +It's a good option if: + +* The cursor is an auto incrementing ID +* The cursor is a high precision timestamp and two records are never created at exactly the same time +* Your pipeline runs are timed in such a way that new data is not generated during the load + ## Parallelized extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. To enable this, declare your sources/resources as follows: diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 3f452f0d16..5008795ed4 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -693,7 +693,7 @@ august_issues = repo_issues( ... ``` -Note that dlt's incremental filtering considers the ranges half-closed. 
`initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. +Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. This behaviour can be changed with the `range_start` (default `"closed"`) and `range_end` (default `"open"`) arguments. ### Declare row order to not request unnecessary data @@ -793,6 +793,9 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) yield {"delta": i, "item": {"ts": pendulum.now().timestamp()}} ``` +This deduplication process is always enabled when `range_start` is set to `"closed"` (default). +When you pass `range_start="open"` no deduplication is done as it is not needed as rows with the previous cursor value are excluded. This can be a useful optimization to avoid the performance overhead of deduplication if the cursor field is guaranteed to be unique. + ### Using `dlt.sources.incremental` with dynamically created resources When resources are [created dynamically](source.md#create-resources-dynamically), it is possible to use the `dlt.sources.incremental` definition as well. 
diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 725872b621..3ebc9d1201 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -5,7 +5,7 @@ from datetime import datetime, date # noqa: I251 from itertools import chain, count from time import sleep -from typing import Any, Optional, Literal, Sequence, Dict +from typing import Any, Optional, Literal, Sequence, Dict, Iterable from unittest import mock import duckdb @@ -1522,6 +1522,7 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_apply_hints_incremental(item_type: TestDataItemFormat) -> None: + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p = dlt.pipeline(pipeline_name=uniq_id(), destination="dummy") data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] source_items = data_to_item_format(item_type, data) @@ -3851,3 +3852,111 @@ def some_data(): for col in table_schema["columns"].values(): assert "incremental" not in col + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_start_range_open(item_type: TestDataItemFormat, last_value_func: Any) -> None: + data_range: Iterable[int] = range(1, 12) + if last_value_func == max: + initial_value = 5 + # Only items higher than inital extracted + expected_items = list(range(6, 12)) + order_dir = "ASC" + elif last_value_func == min: + data_range = reversed(data_range) # type: ignore[call-overload] + initial_value = 5 + # Only items lower than inital extracted + expected_items = list(reversed(range(1, 5))) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=initial_value, + range_start="open", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in 
data_range] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + assert items == expected_items + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +def test_start_range_open_no_deduplication(item_type: TestDataItemFormat) -> None: + @dlt.source + def dummy(): + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + range_start="open", + ) + ): + yield [{"updated_at": i} for i in range(3)] + + yield some_data + + pipeline = dlt.pipeline(pipeline_name=uniq_id()) + pipeline.extract(dummy()) + + state = pipeline.state["sources"]["dummy"]["resources"]["some_data"]["incremental"][ + "updated_at" + ] + + # No unique values should be computed + assert state["unique_hashes"] == [] + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_end_range_closed(item_type: TestDataItemFormat, last_value_func: Any) -> None: + values = [5, 10] + expected_items = list(range(5, 11)) + if last_value_func == max: + order_dir = "ASC" + elif last_value_func == min: + values = list(reversed(values)) + expected_items = list(reversed(expected_items)) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=values[0], + end_value=values[1], + range_end="closed", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in range(1, 12)] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + 
row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + # Includes values 5-10 inclusive + assert items == expected_items diff --git a/tests/load/sources/sql_database/test_helpers.py b/tests/load/sources/sql_database/test_helpers.py index def5430146..43da9c955f 100644 --- a/tests/load/sources/sql_database/test_helpers.py +++ b/tests/load/sources/sql_database/test_helpers.py @@ -1,3 +1,6 @@ +from typing import Callable, Any, TYPE_CHECKING +from dataclasses import dataclass + import pytest import dlt @@ -14,6 +17,18 @@ pytest.skip("Tests require sql alchemy", allow_module_level=True) +@dataclass +class MockIncremental: + last_value: Any + last_value_func: Callable[[Any], Any] + cursor_path: str + row_order: str = None + end_value: Any = None + on_cursor_value_missing: str = "raise" + range_start: str = "closed" + range_end: str = "open" + + @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) def test_cursor_or_unique_column_not_in_table( sql_source_db: SQLAlchemySourceDB, backend: TableBackend @@ -36,13 +51,12 @@ def test_make_query_incremental_max( ) -> None: """Verify query is generated according to incremental settings""" - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -50,14 +64,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) - .where(table.c.created_at >= MockIncremental.last_value) 
+ .where(table.c.created_at >= incremental.last_value) ) assert query.compare(expected) @@ -67,13 +81,14 @@ class MockIncremental: def test_make_query_incremental_min( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = min - cursor_path = "created_at" - row_order = "desc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=min, + cursor_path="created_at", + row_order="desc", + end_value=None, + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -81,14 +96,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) # `min` func swaps order - .where(table.c.created_at <= MockIncremental.last_value) + .where(table.c.created_at <= incremental.last_value) ) assert query.compare(expected) @@ -103,13 +118,14 @@ def test_make_query_incremental_on_cursor_value_missing_set( with_end_value: bool, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None if not with_end_value else dlt.common.pendulum.now().add(hours=1) - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None if not with_end_value else dlt.common.pendulum.now().add(hours=1), + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -117,7 +133,7 @@ class MockIncremental: backend, table, 
table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -131,14 +147,14 @@ class MockIncremental: if with_end_value: where_clause = operator( sa.and_( - table.c.created_at >= MockIncremental.last_value, - table.c.created_at < MockIncremental.end_value, + table.c.created_at >= incremental.last_value, + table.c.created_at < incremental.end_value, ), missing_cond, ) else: where_clause = operator( - table.c.created_at >= MockIncremental.last_value, + table.c.created_at >= incremental.last_value, missing_cond, ) expected = table.select().order_by(table.c.created_at.asc()).where(where_clause) @@ -152,13 +168,14 @@ def test_make_query_incremental_on_cursor_value_missing_no_last_value( backend: TableBackend, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = None - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=None, + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None, + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -166,7 +183,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -189,13 +206,14 @@ def test_make_query_incremental_end_value( ) -> None: now = dlt.common.pendulum.now() - class MockIncremental: - last_value = now - last_value_func = min - cursor_path = "created_at" - end_value = now.add(hours=1) - row_order = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=now, + last_value_func=min, + cursor_path="created_at", + end_value=now.add(hours=1), + row_order=None, + 
on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -203,14 +221,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = table.select().where( sa.and_( - table.c.created_at <= MockIncremental.last_value, - table.c.created_at > MockIncremental.end_value, + table.c.created_at <= incremental.last_value, + table.c.created_at > incremental.end_value, ) ) @@ -221,13 +239,14 @@ class MockIncremental: def test_make_query_incremental_any_fun( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = lambda x: x[-1] - cursor_path = "created_at" - row_order = "asc" - end_value = dlt.common.pendulum.now() - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=lambda x: x[-1], + cursor_path="created_at", + row_order="asc", + end_value=dlt.common.pendulum.now(), + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -235,7 +254,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -256,12 +275,11 @@ def test_cursor_path_field_name_with_a_special_chars( if special_field_name not in table.c: table.append_column(sa.Column(special_field_name, sa.String)) - class MockIncremental: - cursor_path = "'id$field'" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="'id$field'", + last_value=None, + last_value_func=max, + ) # Should not raise any exception loader = TableLoader( @@ -269,7 +287,7 @@ class 
MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert loader.cursor_column == table.c[special_field_name] @@ -281,12 +299,11 @@ def test_cursor_path_multiple_fields( """Test that a cursor_path with multiple fields raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "created_at,updated_at" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="created_at,updated_at", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -294,7 +311,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) @@ -306,12 +323,11 @@ def test_cursor_path_complex_expression( """Test that a complex JSONPath expression in cursor_path raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "$.users[0].id" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="$.users[0].id", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -319,11 +335,80 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_start_open( + sql_source_db: SQLAlchemySourceDB, 
backend: TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_start="open", + ) + + table = sql_source_db.get_table("chat_message") + + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at < incremental.last_value) + else: + expected = expected.where(table.c.created_at > incremental.last_value) + + assert query.compare(expected) + + +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_end_closed( + sql_source_db: SQLAlchemySourceDB, backend: TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_end="closed", + ) + + table = sql_source_db.get_table("chat_message") + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at <= incremental.last_value) + else: + expected = expected.where(table.c.created_at >= incremental.last_value) + + assert query.compare(expected) + + def mock_json_column(field: str) -> TDataItem: """""" import pyarrow as pa diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py 
index 9079638586..00257471e0 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -13,6 +13,7 @@ from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceExtractionError +from dlt.extract.incremental.transform import JsonIncremental, ArrowIncremental from dlt.sources import DltResource from tests.pipeline.utils import ( @@ -831,8 +832,12 @@ def _assert_incremental(item): else: assert _r.incremental.primary_key == ["id"] assert _r.incremental._incremental.primary_key == ["id"] - assert _r.incremental._incremental._transformers["json"].primary_key == ["id"] - assert _r.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + ArrowIncremental + ).primary_key == ["id"] return item pipeline = make_pipeline("duckdb") @@ -841,8 +846,12 @@ def _assert_incremental(item): assert resource.incremental.primary_key == ["id"] assert resource.incremental._incremental.primary_key == ["id"] - assert resource.incremental._incremental._transformers["json"].primary_key == ["id"] - assert resource.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + ArrowIncremental + ).primary_key == ["id"] @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) From 80ef80401b97646901b48e15dade262ef5c3fd52 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 10 Dec 2024 23:44:01 +0100 Subject: [PATCH 03/12] bump semver to minimum version 3.0.0 (#2132) --- poetry.lock | 104 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 +- 2 files changed, 103 insertions(+), 3 
deletions(-) diff --git a/poetry.lock b/poetry.lock index 732ba0e219..6232b383c8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "about-time" @@ -3900,6 +3900,106 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, + {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, + {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, + {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, + {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = 
"sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, + {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, + {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = 
"sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, + {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, + {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, + {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, + {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, + {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, + {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = 
"sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, + {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, + {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = 
"sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, + {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, + {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = 
"sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, + {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, + {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, + {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = 
"sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, + {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -10518,4 +10618,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "c0607d05ab37a1a6addf3ae7264bf5972cb6ce6e46df1dcdc2da3cff72e5008e" +content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" diff --git a/pyproject.toml b/pyproject.toml index 7377b03fde..f736fc65ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ requests = ">=2.26.0" pendulum = ">=2.1.2" simplejson = ">=3.17.5" PyYAML = ">=5.4.1" -semver = ">=2.13.0" +semver = ">=3.0.0" hexbytes = ">=0.2.2" tzdata = ">=2022.1" tomlkit = ">=0.11.3" From 77d8ab6ee23518213fe9da60c4275784450f98fa Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 11 Dec 2024 00:43:32 +0100 Subject: [PATCH 04/12] leverage ibis expression for getting readablerelations (#2046) * add ibis dataset in own class for now * make error clearer * fix some linting and fix broken test * make most destinations work with selecting the right db and catalog, transpiling sql via postgres in some cases and selecting the right dialect in others * add missing motherduck and sqlalchemy mappings * casefold identifiers for ibis wrapper calss * re-organize existing dataset code to prepare ibis relation integration * integrate ibis relation into existing code * re-order tests * fall back to default dataset if table not in schema * make dataset type selectable * add dataset type selection test and fix bug in tests * update docs for ibis expressions use * ensure a bunch of ibis operations continue working * add some more tests and typings * fix typing (with brute force get_attr typing..) 
* move ibis to dependency group * move ibis stuff to helpers * post devel merge, put in change from dataset, update lockfile * add ibis to sqlalchemy tests * improve docs a bit * fix ibis dep group * fix dataset snippets * fix ibis version * add support for column schema in certain query cases --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destination_athena.yml | 2 +- .../test_destination_athena_iceberg.yml | 2 +- .../workflows/test_destination_bigquery.yml | 2 +- .../workflows/test_destination_clickhouse.yml | 2 +- .../workflows/test_destination_databricks.yml | 2 +- .github/workflows/test_destination_dremio.yml | 2 +- .../workflows/test_destination_motherduck.yml | 2 +- .github/workflows/test_destination_mssql.yml | 2 +- .../workflows/test_destination_snowflake.yml | 2 +- .../workflows/test_destination_synapse.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- .../test_sqlalchemy_destinations.yml | 2 +- dlt/common/destination/reference.py | 10 +- dlt/destinations/dataset.py | 412 ------------------ dlt/destinations/dataset/__init__.py | 19 + dlt/destinations/dataset/dataset.py | 142 ++++++ dlt/destinations/dataset/exceptions.py | 22 + dlt/destinations/dataset/factory.py | 22 + dlt/destinations/dataset/ibis_relation.py | 224 ++++++++++ dlt/destinations/dataset/relation.py | 207 +++++++++ dlt/destinations/dataset/utils.py | 95 ++++ .../impl/sqlalchemy/db_api_client.py | 4 +- dlt/{common/libs => helpers}/ibis.py | 58 ++- dlt/pipeline/pipeline.py | 12 +- .../general-usage/dataset-access/dataset.md | 58 +++ poetry.lock | 105 ++--- pyproject.toml | 7 +- .../test_readable_dbapi_dataset.py | 30 +- tests/load/pipeline/test_duckdb.py | 8 +- tests/load/test_read_interfaces.py | 363 ++++++++++++--- 31 files changed, 1245 insertions(+), 579 deletions(-) delete mode 100644 dlt/destinations/dataset.py create mode 100644 dlt/destinations/dataset/__init__.py create mode 100644 
dlt/destinations/dataset/dataset.py create mode 100644 dlt/destinations/dataset/exceptions.py create mode 100644 dlt/destinations/dataset/factory.py create mode 100644 dlt/destinations/dataset/ibis_relation.py create mode 100644 dlt/destinations/dataset/relation.py create mode 100644 dlt/destinations/dataset/utils.py rename dlt/{common/libs => helpers}/ibis.py (74%) diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 1169fab0de..03eb7f9434 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 7ccefcc055..3412e789e3 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index 7afc9b8a00..eb8b63f757 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -66,7 +66,7 @@ jobs: - name: Install dependencies # if: 
steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_clickhouse.yml b/.github/workflows/test_destination_clickhouse.yml index 7f297db971..46464ea462 100644 --- a/.github/workflows/test_destination_clickhouse.yml +++ b/.github/workflows/test_destination_clickhouse.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_databricks.yml b/.github/workflows/test_destination_databricks.yml index 1656fe27f4..c1609de863 100644 --- a/.github/workflows/test_destination_databricks.yml +++ b/.github/workflows/test_destination_databricks.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_dremio.yml 
b/.github/workflows/test_destination_dremio.yml index 45c6d17db1..4bc48c54db 100644 --- a/.github/workflows/test_destination_dremio.yml +++ b/.github/workflows/test_destination_dremio.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - run: | poetry run pytest tests/load --ignore tests/load/sources diff --git a/.github/workflows/test_destination_motherduck.yml b/.github/workflows/test_destination_motherduck.yml index 0014b17655..db81131266 100644 --- a/.github/workflows/test_destination_motherduck.yml +++ b/.github/workflows/test_destination_motherduck.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck - name: Install dependencies - run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index 8b899e7da2..6fdd7a5bc5 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -69,7 +69,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet 
--with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index a720c479bd..73a2a8f6e7 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index be1b493916..8f6bf1eb29 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -67,7 +67,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 933248d994..cfd0a3bd56 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -78,7 +78,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E 
postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 4947a46a3b..6f44e5fd5a 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/.github/workflows/test_sqlalchemy_destinations.yml b/.github/workflows/test_sqlalchemy_destinations.yml index c2572b322d..1f00373674 100644 --- a/.github/workflows/test_sqlalchemy_destinations.yml +++ b/.github/workflows/test_sqlalchemy_destinations.yml @@ -86,7 +86,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" + run: poetry install 
--no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline,ibis && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index e27f99cde7..048fe2186f 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -67,7 +67,7 @@ TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") -TDatasetType = Literal["dbapi", "ibis"] +TDatasetType = Literal["auto", "default", "ibis"] DEFAULT_FILE_LAYOUT = "{table_name}/{load_id}.{file_id}.{ext}" @@ -76,7 +76,7 @@ try: from dlt.common.libs.pandas import DataFrame from dlt.common.libs.pyarrow import Table as ArrowTable - from dlt.common.libs.ibis import BaseBackend as IbisBackend + from dlt.helpers.ibis import BaseBackend as IbisBackend except MissingDependencyException: DataFrame = Any ArrowTable = Any @@ -535,7 +535,7 @@ def fetchone(self) -> Optional[Tuple[Any, ...]]: ... # modifying access parameters - def limit(self, limit: int) -> "SupportsReadableRelation": + def limit(self, limit: int, **kwargs: Any) -> "SupportsReadableRelation": """limit the result to 'limit' items""" ... @@ -557,6 +557,10 @@ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRe """set which columns will be selected""" ... + def __getattr__(self, attr: str) -> Any: + """get an attribute of the relation""" + ... + def __copy__(self) -> "SupportsReadableRelation": """create a copy of the relation object""" ... 
diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py deleted file mode 100644 index 27a7f5a7af..0000000000 --- a/dlt/destinations/dataset.py +++ /dev/null @@ -1,412 +0,0 @@ -from typing import Any, Generator, Sequence, Union, TYPE_CHECKING, Tuple - -from contextlib import contextmanager - -from dlt import version -from dlt.common.json import json -from dlt.common.exceptions import MissingDependencyException -from dlt.common.destination import AnyDestination -from dlt.common.destination.reference import ( - SupportsReadableRelation, - SupportsReadableDataset, - TDatasetType, - TDestinationReferenceArg, - Destination, - JobClientBase, - WithStateSync, - DestinationClientDwhConfiguration, - DestinationClientStagingConfiguration, - DestinationClientConfiguration, - DestinationClientDwhWithStagingConfiguration, -) - -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.destinations.sql_client import SqlClientBase, WithSqlClient -from dlt.common.schema import Schema -from dlt.common.exceptions import DltException - -if TYPE_CHECKING: - try: - from dlt.common.libs.ibis import BaseBackend as IbisBackend - except MissingDependencyException: - IbisBackend = Any -else: - IbisBackend = Any - - -class DatasetException(DltException): - pass - - -class ReadableRelationHasQueryException(DatasetException): - def __init__(self, attempted_change: str) -> None: - msg = ( - "This readable relation was created with a provided sql query. You cannot change" - f" {attempted_change}. Please change the orignal sql query." - ) - super().__init__(msg) - - -class ReadableRelationUnknownColumnException(DatasetException): - def __init__(self, column_name: str) -> None: - msg = ( - f"The selected column {column_name} is not known in the dlt schema for this releation." 
- ) - super().__init__(msg) - - -class ReadableDBAPIRelation(SupportsReadableRelation): - def __init__( - self, - *, - readable_dataset: "ReadableDBAPIDataset", - provided_query: Any = None, - table_name: str = None, - limit: int = None, - selected_columns: Sequence[str] = None, - ) -> None: - """Create a lazy evaluated relation to for the dataset of a destination""" - - # NOTE: we can keep an assertion here, this class will not be created by the user - assert bool(table_name) != bool( - provided_query - ), "Please provide either an sql query OR a table_name" - - self._dataset = readable_dataset - - self._provided_query = provided_query - self._table_name = table_name - self._limit = limit - self._selected_columns = selected_columns - - # wire protocol functions - self.df = self._wrap_func("df") # type: ignore - self.arrow = self._wrap_func("arrow") # type: ignore - self.fetchall = self._wrap_func("fetchall") # type: ignore - self.fetchmany = self._wrap_func("fetchmany") # type: ignore - self.fetchone = self._wrap_func("fetchone") # type: ignore - - self.iter_df = self._wrap_iter("iter_df") # type: ignore - self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore - self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore - - @property - def sql_client(self) -> SqlClientBase[Any]: - return self._dataset.sql_client - - @property - def schema(self) -> Schema: - return self._dataset.schema - - @property - def query(self) -> Any: - """build the query""" - if self._provided_query: - return self._provided_query - - table_name = self.sql_client.make_qualified_table_name( - self.schema.naming.normalize_tables_path(self._table_name) - ) - - maybe_limit_clause_1 = "" - maybe_limit_clause_2 = "" - if self._limit: - maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql( - self._limit - ) - - selector = "*" - if self._selected_columns: - selector = ",".join( - [ - self.sql_client.escape_column_name(self.schema.naming.normalize_path(c)) - for 
c in self._selected_columns - ] - ) - - return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" - - @property - def columns_schema(self) -> TTableSchemaColumns: - return self.compute_columns_schema() - - @columns_schema.setter - def columns_schema(self, new_value: TTableSchemaColumns) -> None: - raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") - - def compute_columns_schema(self) -> TTableSchemaColumns: - """provide schema columns for the cursor, may be filtered by selected columns""" - - columns_schema = ( - self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} - ) - - if not columns_schema: - return None - if not self._selected_columns: - return columns_schema - - filtered_columns: TTableSchemaColumns = {} - for sc in self._selected_columns: - sc = self.schema.naming.normalize_path(sc) - if sc not in columns_schema.keys(): - raise ReadableRelationUnknownColumnException(sc) - filtered_columns[sc] = columns_schema[sc] - - return filtered_columns - - @contextmanager - def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: - """Gets a DBApiCursor for the current relation""" - with self.sql_client as client: - # this hacky code is needed for mssql to disable autocommit, read iterators - # will not work otherwise. 
in the future we should be able to create a readony - # client which will do this automatically - if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): - self.sql_client._conn.autocommit = False - with client.execute_query(self.query) as cursor: - if columns_schema := self.columns_schema: - cursor.columns_schema = columns_schema - yield cursor - - def _wrap_iter(self, func_name: str) -> Any: - """wrap SupportsReadableRelation generators in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - yield from getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def _wrap_func(self, func_name: str) -> Any: - """wrap SupportsReadableRelation functions in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - return getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def __copy__(self) -> "ReadableDBAPIRelation": - return self.__class__( - readable_dataset=self._dataset, - provided_query=self._provided_query, - table_name=self._table_name, - limit=self._limit, - selected_columns=self._selected_columns, - ) - - def limit(self, limit: int) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("limit") - rel = self.__copy__() - rel._limit = limit - return rel - - def select(self, *columns: str) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("select") - rel = self.__copy__() - rel._selected_columns = columns - # NOTE: the line below will ensure that no unknown columns are selected if - # schema is known - rel.compute_columns_schema() - return rel - - def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": - if isinstance(columns, str): - return self.select(columns) - elif isinstance(columns, Sequence): - return self.select(*columns) - else: - raise TypeError(f"Invalid argument type: 
{type(columns).__name__}") - - def head(self, limit: int = 5) -> "ReadableDBAPIRelation": - return self.limit(limit) - - -class ReadableDBAPIDataset(SupportsReadableDataset): - """Access to dataframes and arrowtables in the destination dataset via dbapi""" - - def __init__( - self, - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - ) -> None: - self._destination = Destination.from_reference(destination) - self._provided_schema = schema - self._dataset_name = dataset_name - self._sql_client: SqlClientBase[Any] = None - self._schema: Schema = None - - def ibis(self) -> IbisBackend: - """return a connected ibis backend""" - from dlt.common.libs.ibis import create_ibis_backend - - self._ensure_client_and_schema() - return create_ibis_backend( - self._destination, - self._destination_client(self.schema), - ) - - @property - def schema(self) -> Schema: - self._ensure_client_and_schema() - return self._schema - - @property - def sql_client(self) -> SqlClientBase[Any]: - self._ensure_client_and_schema() - return self._sql_client - - def _destination_client(self, schema: Schema) -> JobClientBase: - return get_destination_clients( - schema, destination=self._destination, destination_dataset_name=self._dataset_name - )[0] - - def _ensure_client_and_schema(self) -> None: - """Lazy load schema and client""" - - # full schema given, nothing to do - if not self._schema and isinstance(self._provided_schema, Schema): - self._schema = self._provided_schema - - # schema name given, resolve it from destination by name - elif not self._schema and isinstance(self._provided_schema, str): - with self._destination_client(Schema(self._provided_schema)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema(self._provided_schema) - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - else: - self._schema = Schema(self._provided_schema) - - # no schema 
name given, load newest schema from destination - elif not self._schema: - with self._destination_client(Schema(self._dataset_name)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema() - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - - # default to empty schema with dataset name - if not self._schema: - self._schema = Schema(self._dataset_name) - - # here we create the client bound to the resolved schema - if not self._sql_client: - destination_client = self._destination_client(self._schema) - if isinstance(destination_client, WithSqlClient): - self._sql_client = destination_client.sql_client - else: - raise Exception( - f"Destination {destination_client.config.destination_type} does not support" - " SqlClient." - ) - - def __call__(self, query: Any) -> ReadableDBAPIRelation: - return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] - - def table(self, table_name: str) -> SupportsReadableRelation: - return ReadableDBAPIRelation( - readable_dataset=self, - table_name=table_name, - ) # type: ignore[abstract] - - def __getitem__(self, table_name: str) -> SupportsReadableRelation: - """access of table via dict notation""" - return self.table(table_name) - - def __getattr__(self, table_name: str) -> SupportsReadableRelation: - """access of table via property notation""" - return self.table(table_name) - - -def dataset( - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - dataset_type: TDatasetType = "dbapi", -) -> SupportsReadableDataset: - if dataset_type == "dbapi": - return ReadableDBAPIDataset(destination, dataset_name, schema) - raise NotImplementedError(f"Dataset of type {dataset_type} not implemented") - - -# helpers -def get_destination_client_initial_config( - destination: AnyDestination, - default_schema_name: str, - dataset_name: str, - as_staging: bool = False, -) -> 
DestinationClientConfiguration: - client_spec = destination.spec - - # this client supports many schemas and datasets - if issubclass(client_spec, DestinationClientDwhConfiguration): - if issubclass(client_spec, DestinationClientStagingConfiguration): - spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) - else: - spec = client_spec() - - spec._bind_dataset_name(dataset_name, default_schema_name) - return spec - - return client_spec() - - -def get_destination_clients( - schema: Schema, - destination: AnyDestination = None, - destination_dataset_name: str = None, - destination_initial_config: DestinationClientConfiguration = None, - staging: AnyDestination = None, - staging_dataset_name: str = None, - staging_initial_config: DestinationClientConfiguration = None, - # pipeline specific settings - default_schema_name: str = None, -) -> Tuple[JobClientBase, JobClientBase]: - destination = Destination.from_reference(destination) if destination else None - staging = Destination.from_reference(staging) if staging else None - - try: - # resolve staging config in order to pass it to destination client config - staging_client = None - if staging: - if not staging_initial_config: - # this is just initial config - without user configuration injected - staging_initial_config = get_destination_client_initial_config( - staging, - dataset_name=staging_dataset_name, - default_schema_name=default_schema_name, - as_staging=True, - ) - # create the client - that will also resolve the config - staging_client = staging.client(schema, staging_initial_config) - - if not destination_initial_config: - # config is not provided then get it with injected credentials - initial_config = get_destination_client_initial_config( - destination, - dataset_name=destination_dataset_name, - default_schema_name=default_schema_name, - ) - - # attach the staging client config to destination client config - if its type supports it - if ( - staging_client - and 
isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) - and isinstance(staging_client.config, DestinationClientStagingConfiguration) - ): - initial_config.staging_config = staging_client.config - # create instance with initial_config properly set - client = destination.client(schema, initial_config) - return client, staging_client - except ModuleNotFoundError: - client_spec = destination.spec() - raise MissingDependencyException( - f"{client_spec.destination_type} destination", - [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], - "Dependencies for specific destinations are available as extras of dlt", - ) diff --git a/dlt/destinations/dataset/__init__.py b/dlt/destinations/dataset/__init__.py new file mode 100644 index 0000000000..e0eef681b8 --- /dev/null +++ b/dlt/destinations/dataset/__init__.py @@ -0,0 +1,19 @@ +from dlt.destinations.dataset.factory import ( + dataset, +) +from dlt.destinations.dataset.dataset import ( + ReadableDBAPIDataset, + get_destination_clients, +) +from dlt.destinations.dataset.utils import ( + get_destination_clients, + get_destination_client_initial_config, +) + + +__all__ = [ + "dataset", + "ReadableDBAPIDataset", + "get_destination_client_initial_config", + "get_destination_clients", +] diff --git a/dlt/destinations/dataset/dataset.py b/dlt/destinations/dataset/dataset.py new file mode 100644 index 0000000000..e443045e49 --- /dev/null +++ b/dlt/destinations/dataset/dataset.py @@ -0,0 +1,142 @@ +from typing import Any, Union, TYPE_CHECKING + +from dlt.common.json import json + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination.reference import ( + SupportsReadableRelation, + SupportsReadableDataset, + TDestinationReferenceArg, + Destination, + JobClientBase, + WithStateSync, +) + +from dlt.destinations.sql_client import SqlClientBase, WithSqlClient +from dlt.common.schema import Schema +from dlt.destinations.dataset.relation import ReadableDBAPIRelation +from 
dlt.destinations.dataset.utils import get_destination_clients +from dlt.common.destination.reference import TDatasetType + +if TYPE_CHECKING: + try: + from dlt.helpers.ibis import BaseBackend as IbisBackend + except MissingDependencyException: + IbisBackend = Any +else: + IbisBackend = Any + + +class ReadableDBAPIDataset(SupportsReadableDataset): + """Access to dataframes and arrowtables in the destination dataset via dbapi""" + + def __init__( + self, + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", + ) -> None: + self._destination = Destination.from_reference(destination) + self._provided_schema = schema + self._dataset_name = dataset_name + self._sql_client: SqlClientBase[Any] = None + self._schema: Schema = None + self._dataset_type = dataset_type + + def ibis(self) -> IbisBackend: + """return a connected ibis backend""" + from dlt.helpers.ibis import create_ibis_backend + + self._ensure_client_and_schema() + return create_ibis_backend( + self._destination, + self._destination_client(self.schema), + ) + + @property + def schema(self) -> Schema: + self._ensure_client_and_schema() + return self._schema + + @property + def sql_client(self) -> SqlClientBase[Any]: + self._ensure_client_and_schema() + return self._sql_client + + def _destination_client(self, schema: Schema) -> JobClientBase: + return get_destination_clients( + schema, destination=self._destination, destination_dataset_name=self._dataset_name + )[0] + + def _ensure_client_and_schema(self) -> None: + """Lazy load schema and client""" + + # full schema given, nothing to do + if not self._schema and isinstance(self._provided_schema, Schema): + self._schema = self._provided_schema + + # schema name given, resolve it from destination by name + elif not self._schema and isinstance(self._provided_schema, str): + with self._destination_client(Schema(self._provided_schema)) as client: + if isinstance(client, 
WithStateSync): + stored_schema = client.get_stored_schema(self._provided_schema) + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + else: + self._schema = Schema(self._provided_schema) + + # no schema name given, load newest schema from destination + elif not self._schema: + with self._destination_client(Schema(self._dataset_name)) as client: + if isinstance(client, WithStateSync): + stored_schema = client.get_stored_schema() + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + + # default to empty schema with dataset name + if not self._schema: + self._schema = Schema(self._dataset_name) + + # here we create the client bound to the resolved schema + if not self._sql_client: + destination_client = self._destination_client(self._schema) + if isinstance(destination_client, WithSqlClient): + self._sql_client = destination_client.sql_client + else: + raise Exception( + f"Destination {destination_client.config.destination_type} does not support" + " SqlClient." 
+ ) + + def __call__(self, query: Any) -> ReadableDBAPIRelation: + return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] + + def table(self, table_name: str) -> SupportsReadableRelation: + # we can create an ibis powered relation if ibis is available + if table_name in self.schema.tables and self._dataset_type in ("auto", "ibis"): + try: + from dlt.helpers.ibis import create_unbound_ibis_table + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + unbound_table = create_unbound_ibis_table(self.sql_client, self.schema, table_name) + return ReadableIbisRelation(readable_dataset=self, ibis_object=unbound_table, columns_schema=self.schema.tables[table_name]["columns"]) # type: ignore[abstract] + except MissingDependencyException: + # if ibis is explicitly requested, reraise + if self._dataset_type == "ibis": + raise + + # fallback to the standard dbapi relation + return ReadableDBAPIRelation( + readable_dataset=self, + table_name=table_name, + ) # type: ignore[abstract] + + def __getitem__(self, table_name: str) -> SupportsReadableRelation: + """access of table via dict notation""" + return self.table(table_name) + + def __getattr__(self, table_name: str) -> SupportsReadableRelation: + """access of table via property notation""" + return self.table(table_name) diff --git a/dlt/destinations/dataset/exceptions.py b/dlt/destinations/dataset/exceptions.py new file mode 100644 index 0000000000..17e8f6b563 --- /dev/null +++ b/dlt/destinations/dataset/exceptions.py @@ -0,0 +1,22 @@ +from dlt.common.exceptions import DltException + + +class DatasetException(DltException): + pass + + +class ReadableRelationHasQueryException(DatasetException): + def __init__(self, attempted_change: str) -> None: + msg = ( + "This readable relation was created with a provided sql query. You cannot change" + f" {attempted_change}. Please change the orignal sql query." 
+ ) + super().__init__(msg) + + +class ReadableRelationUnknownColumnException(DatasetException): + def __init__(self, column_name: str) -> None: + msg = ( + f"The selected column {column_name} is not known in the dlt schema for this releation." + ) + super().__init__(msg) diff --git a/dlt/destinations/dataset/factory.py b/dlt/destinations/dataset/factory.py new file mode 100644 index 0000000000..8ea0ddf7a1 --- /dev/null +++ b/dlt/destinations/dataset/factory.py @@ -0,0 +1,22 @@ +from typing import Union + + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + SupportsReadableDataset, + TDatasetType, + TDestinationReferenceArg, +) + +from dlt.common.schema import Schema + +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset + + +def dataset( + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", +) -> SupportsReadableDataset: + return ReadableDBAPIDataset(destination, dataset_name, schema, dataset_type) diff --git a/dlt/destinations/dataset/ibis_relation.py b/dlt/destinations/dataset/ibis_relation.py new file mode 100644 index 0000000000..632298ad56 --- /dev/null +++ b/dlt/destinations/dataset/ibis_relation.py @@ -0,0 +1,224 @@ +from typing import TYPE_CHECKING, Any, Union, Sequence + +from functools import partial + +from dlt.common.exceptions import MissingDependencyException +from dlt.destinations.dataset.relation import BaseReadableDBAPIRelation +from dlt.common.schema.typing import TTableSchemaColumns + + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + +try: + from dlt.helpers.ibis import Expr +except MissingDependencyException: + Expr = Any + +# map dlt destination to sqlglot dialect +DIALECT_MAP = { + "dlt.destinations.duckdb": "duckdb", # works + "dlt.destinations.motherduck": "duckdb", # works + "dlt.destinations.clickhouse": 
"clickhouse", # works + "dlt.destinations.databricks": "databricks", # works + "dlt.destinations.bigquery": "bigquery", # works + "dlt.destinations.postgres": "postgres", # works + "dlt.destinations.redshift": "redshift", # works + "dlt.destinations.snowflake": "snowflake", # works + "dlt.destinations.mssql": "tsql", # works + "dlt.destinations.synapse": "tsql", # works + "dlt.destinations.athena": "trino", # works + "dlt.destinations.filesystem": "duckdb", # works + "dlt.destinations.dremio": "presto", # works + # NOTE: can we discover the current dialect in sqlalchemy? + "dlt.destinations.sqlalchemy": "mysql", # may work +} + +# NOTE: some dialects are not supported by ibis, but by sqlglot, these need to +# be transpiled with a intermediary step +TRANSPILE_VIA_MAP = { + "tsql": "postgres", + "databricks": "postgres", + "clickhouse": "postgres", + "redshift": "postgres", + "presto": "postgres", +} + + +class ReadableIbisRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: ReadableDBAPIDataset, + ibis_object: Any = None, + columns_schema: TTableSchemaColumns = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + super().__init__(readable_dataset=readable_dataset) + self._ibis_object = ibis_object + self._columns_schema = columns_schema + + @property + def query(self) -> Any: + """build the query""" + + from dlt.helpers.ibis import ibis, sqlglot + + destination_type = self._dataset._destination.destination_type + target_dialect = DIALECT_MAP[destination_type] + + # render sql directly if possible + if target_dialect not in TRANSPILE_VIA_MAP: + return ibis.to_sql(self._ibis_object, dialect=target_dialect) + + # here we need to transpile first + transpile_via = TRANSPILE_VIA_MAP[target_dialect] + sql = ibis.to_sql(self._ibis_object, dialect=transpile_via) + sql = sqlglot.transpile(sql, read=transpile_via, write=target_dialect)[0] + return sql + + @property + def columns_schema(self) -> 
TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + # TODO: provide column lineage tracing with sqlglot lineage + return self._columns_schema + + def _proxy_expression_method(self, method_name: str, *args: Any, **kwargs: Any) -> Any: + """Proxy method calls to the underlying ibis expression, allowing to wrap the resulting expression in a new relation""" + + # Get the method from the expression + method = getattr(self._ibis_object, method_name) + + # unwrap args and kwargs if they are relations + args = tuple( + arg._ibis_object if isinstance(arg, ReadableIbisRelation) else arg for arg in args + ) + kwargs = { + k: v._ibis_object if isinstance(v, ReadableIbisRelation) else v + for k, v in kwargs.items() + } + + # casefold string params, we assume these are column names + args = tuple( + self.sql_client.capabilities.casefold_identifier(arg) if isinstance(arg, str) else arg + for arg in args + ) + kwargs = { + k: self.sql_client.capabilities.casefold_identifier(v) if isinstance(v, str) else v + for k, v in kwargs.items() + } + + # Call it with provided args + result = method(*args, **kwargs) + + # calculate columns schema for the result, some operations we know will not change the schema + # and select will just reduce the number of columns + columns_schema = None + if method_name == "select": + columns_schema = self._get_filtered_columns_schema(args) + elif method_name in ["filter", "limit", "order_by", "head"]: + columns_schema = self._columns_schema + + # NOTE: the result is always wrapped in a new relation, no check for an ibis expression is made here + return self.__class__( + readable_dataset=self._dataset, ibis_object=result, 
columns_schema=columns_schema + ) + + def __getattr__(self, name: str) -> Any: + """Wrap all callable attributes of the expression""" + + attr = getattr(self._ibis_object, name, None) + + # try casefolded name for ibis columns access + if attr is None: + name = self.sql_client.capabilities.casefold_identifier(name) + attr = getattr(self._ibis_object, name, None) + + if attr is None: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + if not callable(attr): + # NOTE: we don't need to forward columns schema for non-callable attributes, these are usually columns + return self.__class__(readable_dataset=self._dataset, ibis_object=attr) + + return partial(self._proxy_expression_method, name) + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "ReadableIbisRelation": + # casefold column-names + columns = [columns] if isinstance(columns, str) else columns + columns = [self.sql_client.capabilities.casefold_identifier(col) for col in columns] + expr = self._ibis_object[columns] + return self.__class__( + readable_dataset=self._dataset, + ibis_object=expr, + columns_schema=self._get_filtered_columns_schema(columns), + ) + + def _get_filtered_columns_schema(self, columns: Sequence[str]) -> TTableSchemaColumns: + if not self._columns_schema: + return None + try: + return {col: self._columns_schema[col] for col in columns} + except KeyError: + # NOTE: select statements can contain new columns not present in the original schema + # here we just break the column schema inheritance chain + return None + + # forward ibis methods defined on interface + def limit(self, limit: int, **kwargs: Any) -> "ReadableIbisRelation": + """limit the result to 'limit' items""" + return self._proxy_expression_method("limit", limit, **kwargs) # type: ignore + + def head(self, limit: int = 5) -> "ReadableIbisRelation": + """limit the result to 5 items by default""" + return self._proxy_expression_method("head", limit) # type: ignore + + def 
select(self, *columns: str) -> "ReadableIbisRelation": + """set which columns will be selected""" + return self._proxy_expression_method("select", *columns) # type: ignore + + # forward ibis comparison and math operators + def __lt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__lt__", other) # type: ignore + + def __gt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__gt__", other) # type: ignore + + def __ge__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__ge__", other) # type: ignore + + def __le__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__le__", other) # type: ignore + + def __eq__(self, other: Any) -> bool: + return self._proxy_expression_method("__eq__", other) # type: ignore + + def __ne__(self, other: Any) -> bool: + return self._proxy_expression_method("__ne__", other) # type: ignore + + def __and__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__and__", other) # type: ignore + + def __or__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__or__", other) # type: ignore + + def __mul__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__mul__", other) # type: ignore + + def __div__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__div__", other) # type: ignore + + def __add__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__add__", other) # type: ignore + + def __sub__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__sub__", other) # type: ignore diff --git a/dlt/destinations/dataset/relation.py b/dlt/destinations/dataset/relation.py new file mode 100644 index 0000000000..2cdb7640df --- /dev/null +++ b/dlt/destinations/dataset/relation.py @@ -0,0 +1,207 @@ +from typing import 
Any, Generator, Sequence, Union, TYPE_CHECKING + +from contextlib import contextmanager + + +from dlt.common.destination.reference import ( + SupportsReadableRelation, +) + +from dlt.destinations.dataset.exceptions import ( + ReadableRelationHasQueryException, + ReadableRelationUnknownColumnException, +) + +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.destinations.sql_client import SqlClientBase +from dlt.common.schema import Schema + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + + +class BaseReadableDBAPIRelation(SupportsReadableRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + self._dataset = readable_dataset + + # wire protocol functions + self.df = self._wrap_func("df") # type: ignore + self.arrow = self._wrap_func("arrow") # type: ignore + self.fetchall = self._wrap_func("fetchall") # type: ignore + self.fetchmany = self._wrap_func("fetchmany") # type: ignore + self.fetchone = self._wrap_func("fetchone") # type: ignore + + self.iter_df = self._wrap_iter("iter_df") # type: ignore + self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore + self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore + + @property + def sql_client(self) -> SqlClientBase[Any]: + return self._dataset.sql_client + + @property + def schema(self) -> Schema: + return self._dataset.schema + + @property + def query(self) -> Any: + raise NotImplementedError("No query in ReadableDBAPIRelation") + + @contextmanager + def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: + """Gets a DBApiCursor for the current relation""" + with self.sql_client as client: + # this hacky code is needed for mssql to disable autocommit, read iterators + # will not work otherwise. 
in the future we should be able to create a readony + # client which will do this automatically + if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): + self.sql_client._conn.autocommit = False + with client.execute_query(self.query) as cursor: + if columns_schema := self.columns_schema: + cursor.columns_schema = columns_schema + yield cursor + + def _wrap_iter(self, func_name: str) -> Any: + """wrap SupportsReadableRelation generators in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + yield from getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + def _wrap_func(self, func_name: str) -> Any: + """wrap SupportsReadableRelation functions in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + return getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + +class ReadableDBAPIRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + provided_query: Any = None, + table_name: str = None, + limit: int = None, + selected_columns: Sequence[str] = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + # NOTE: we can keep an assertion here, this class will not be created by the user + assert bool(table_name) != bool( + provided_query + ), "Please provide either an sql query OR a table_name" + + super().__init__(readable_dataset=readable_dataset) + + self._provided_query = provided_query + self._table_name = table_name + self._limit = limit + self._selected_columns = selected_columns + + @property + def query(self) -> Any: + """build the query""" + if self._provided_query: + return self._provided_query + + table_name = self.sql_client.make_qualified_table_name( + self.schema.naming.normalize_path(self._table_name) + ) + + maybe_limit_clause_1 = "" + maybe_limit_clause_2 = "" + if self._limit: + maybe_limit_clause_1, 
maybe_limit_clause_2 = self.sql_client._limit_clause_sql( + self._limit + ) + + selector = "*" + if self._selected_columns: + selector = ",".join( + [ + self.sql_client.escape_column_name(self.schema.naming.normalize_tables_path(c)) + for c in self._selected_columns + ] + ) + + return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" + + @property + def columns_schema(self) -> TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + + columns_schema = ( + self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} + ) + + if not columns_schema: + return None + if not self._selected_columns: + return columns_schema + + filtered_columns: TTableSchemaColumns = {} + for sc in self._selected_columns: + sc = self.schema.naming.normalize_path(sc) + if sc not in columns_schema.keys(): + raise ReadableRelationUnknownColumnException(sc) + filtered_columns[sc] = columns_schema[sc] + + return filtered_columns + + def __copy__(self) -> "ReadableDBAPIRelation": + return self.__class__( + readable_dataset=self._dataset, + provided_query=self._provided_query, + table_name=self._table_name, + limit=self._limit, + selected_columns=self._selected_columns, + ) + + def limit(self, limit: int, **kwargs: Any) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("limit") + rel = self.__copy__() + rel._limit = limit + return rel + + def select(self, *columns: str) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("select") + rel = self.__copy__() + rel._selected_columns = columns + # NOTE: the line below will 
ensure that no unknown columns are selected if + # schema is known + rel.compute_columns_schema() + return rel + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": + if isinstance(columns, str): + return self.select(columns) + elif isinstance(columns, Sequence): + return self.select(*columns) + else: + raise TypeError(f"Invalid argument type: {type(columns).__name__}") + + def head(self, limit: int = 5) -> "ReadableDBAPIRelation": + return self.limit(limit) diff --git a/dlt/destinations/dataset/utils.py b/dlt/destinations/dataset/utils.py new file mode 100644 index 0000000000..766fbc13ea --- /dev/null +++ b/dlt/destinations/dataset/utils.py @@ -0,0 +1,95 @@ +from typing import Tuple + +from dlt import version + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + Destination, + JobClientBase, + DestinationClientDwhConfiguration, + DestinationClientStagingConfiguration, + DestinationClientConfiguration, + DestinationClientDwhWithStagingConfiguration, +) + +from dlt.common.schema import Schema + + +# helpers +def get_destination_client_initial_config( + destination: AnyDestination, + default_schema_name: str, + dataset_name: str, + as_staging: bool = False, +) -> DestinationClientConfiguration: + client_spec = destination.spec + + # this client supports many schemas and datasets + if issubclass(client_spec, DestinationClientDwhConfiguration): + if issubclass(client_spec, DestinationClientStagingConfiguration): + spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) + else: + spec = client_spec() + + spec._bind_dataset_name(dataset_name, default_schema_name) + return spec + + return client_spec() + + +def get_destination_clients( + schema: Schema, + destination: AnyDestination = None, + destination_dataset_name: str = None, + destination_initial_config: DestinationClientConfiguration 
= None, + staging: AnyDestination = None, + staging_dataset_name: str = None, + staging_initial_config: DestinationClientConfiguration = None, + # pipeline specific settings + default_schema_name: str = None, +) -> Tuple[JobClientBase, JobClientBase]: + destination = Destination.from_reference(destination) if destination else None + staging = Destination.from_reference(staging) if staging else None + + try: + # resolve staging config in order to pass it to destination client config + staging_client = None + if staging: + if not staging_initial_config: + # this is just initial config - without user configuration injected + staging_initial_config = get_destination_client_initial_config( + staging, + dataset_name=staging_dataset_name, + default_schema_name=default_schema_name, + as_staging=True, + ) + # create the client - that will also resolve the config + staging_client = staging.client(schema, staging_initial_config) + + if not destination_initial_config: + # config is not provided then get it with injected credentials + initial_config = get_destination_client_initial_config( + destination, + dataset_name=destination_dataset_name, + default_schema_name=default_schema_name, + ) + + # attach the staging client config to destination client config - if its type supports it + if ( + staging_client + and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) + and isinstance(staging_client.config, DestinationClientStagingConfiguration) + ): + initial_config.staging_config = staging_client.config + # create instance with initial_config properly set + client = destination.client(schema, initial_config) + return client, staging_client + except ModuleNotFoundError: + client_spec = destination.spec() + raise MissingDependencyException( + f"{client_spec.destination_type} destination", + [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], + "Dependencies for specific destinations are available as extras of dlt", + ) diff --git 
a/dlt/destinations/impl/sqlalchemy/db_api_client.py b/dlt/destinations/impl/sqlalchemy/db_api_client.py index 6f3ff065bf..27c4f2f1f9 100644 --- a/dlt/destinations/impl/sqlalchemy/db_api_client.py +++ b/dlt/destinations/impl/sqlalchemy/db_api_client.py @@ -84,7 +84,7 @@ def __init__(self, curr: sa.engine.CursorResult) -> None: def _get_columns(self) -> List[str]: try: - return list(self.native_cursor.keys()) # type: ignore[attr-defined] + return list(self.native_cursor.keys()) except ResourceClosedError: # this happens if now rows are returned return [] @@ -314,7 +314,7 @@ def execute_sql( self, sql: Union[AnyStr, sa.sql.Executable], *args: Any, **kwargs: Any ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as cursor: - if cursor.returns_rows: # type: ignore[attr-defined] + if cursor.returns_rows: return cursor.fetchall() return None diff --git a/dlt/common/libs/ibis.py b/dlt/helpers/ibis.py similarity index 74% rename from dlt/common/libs/ibis.py rename to dlt/helpers/ibis.py index ba6f363e66..ed4264dac7 100644 --- a/dlt/common/libs/ibis.py +++ b/dlt/helpers/ibis.py @@ -1,12 +1,14 @@ -from typing import cast +from typing import cast, Any from dlt.common.exceptions import MissingDependencyException - from dlt.common.destination.reference import TDestinationReferenceArg, Destination, JobClientBase +from dlt.common.schema import Schema +from dlt.destinations.sql_client import SqlClientBase try: import ibis # type: ignore - from ibis import BaseBackend + import sqlglot + from ibis import BaseBackend, Expr except ModuleNotFoundError: raise MissingDependencyException("dlt ibis Helpers", ["ibis"]) @@ -29,6 +31,22 @@ ] +# Map dlt data types to ibis data types +DATA_TYPE_MAP = { + "text": "string", + "double": "float64", + "bool": "boolean", + "timestamp": "timestamp", + "bigint": "int64", + "binary": "binary", + "json": "string", # Store JSON as string in ibis + "decimal": "decimal", + "wei": "int64", # Wei is a large integer + "date": 
"date", + "time": "time", +} + + def create_ibis_backend( destination: TDestinationReferenceArg, client: JobClientBase ) -> BaseBackend: @@ -119,3 +137,37 @@ def create_ibis_backend( con = ibis.duckdb.from_connection(duck) return con + + +def create_unbound_ibis_table( + sql_client: SqlClientBase[Any], schema: Schema, table_name: str +) -> Expr: + """Create an unbound ibis table from a dlt schema""" + + if table_name not in schema.tables: + raise Exception( + f"Table {table_name} not found in schema. Available tables: {schema.tables.keys()}" + ) + table_schema = schema.tables[table_name] + + # Convert dlt table schema columns to ibis schema + ibis_schema = { + sql_client.capabilities.casefold_identifier(col_name): DATA_TYPE_MAP[ + col_info.get("data_type", "string") + ] + for col_name, col_info in table_schema.get("columns", {}).items() + } + + # normalize table name + table_path = sql_client.make_qualified_table_name_path(table_name, escape=False) + + catalog = None + if len(table_path) == 3: + catalog, database, table = table_path + else: + database, table = table_path + + # create unbound ibis table and return in dlt wrapper + unbound_table = ibis.table(schema=ibis_schema, name=table, database=database, catalog=catalog) + + return unbound_table diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 70d160ea67..9bd2d6911f 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1751,9 +1751,17 @@ def __getstate__(self) -> Any: return {"pipeline_name": self.pipeline_name} def _dataset( - self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "dbapi" + self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "auto" ) -> SupportsReadableDataset: - """Access helper to dataset""" + """Returns a dataset object for querying the destination data. + + Args: + schema: Schema name or Schema object to use. If None, uses the default schema if set. + dataset_type: Type of dataset interface to return. 
Defaults to 'auto' which will select ibis if available + otherwise it will fall back to the standard dbapi interface. + Returns: + A dataset object that supports querying the destination data. + """ if schema is None: schema = self.default_schema if self.default_schema_name else None return dataset( diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 68635383c5..b2e3f03d4d 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -156,6 +156,64 @@ You can combine `select`, `limit`, and other methods. arrow_table = items_relation.select("col1", "col2").limit(50).arrow() ``` +## Modifying queries with ibis expressions + +If you install the amazing [ibis](https://ibis-project.org/) library, you can use ibis expressions to modify your queries. + +```sh +pip install ibis-framework +``` + +dlt will then wrap an `ibis.UnboundTable` with a `ReadableIbisRelation` object under the hood that will allow you to modify the query of a relation using ibis expressions: + +```py +# now that ibis is installed, we can get a dataset with ibis relations +dataset = pipeline._dataset() + +# get two relations +items_relation = dataset["items"] +order_relation = dataset["orders"] + +# join them using an ibis expression +joined_relation = items_relation.join(order_relation, items_relation.id == order_relation.item_id) + +# now we can use the ibis expression to filter the data +filtered_relation = joined_relation.filter(order_relation.status == "completed") + +# we can inspect the query that will be used to read the data +print(filtered_relation.query) + +# and finally fetch the data as a pandas dataframe, the same way we would do with a normal relation +df = filtered_relation.df() + +# a few more examples + +# filter for rows where the id is in the list of ids +items_relation.filter(items_relation.id.isin([1, 2, 3])).df() + +#
limit and offset +items_relation.limit(10, offset=5).arrow() + +# mutate columns by adding a new column that is always 10 times the value of the id column +items_relation.mutate(new_id=items_relation.id * 10).df() + +# sort asc and desc +import ibis +items_relation.order_by(ibis.desc("id"), ibis.asc("price")).limit(10) + +# group by and aggregate +items_relation.group_by("item_group").having(items_relation.count() >= 1000).aggregate(sum_id=items_relation.id.sum()).df() + +# subqueries +items_relation.filter(items_relation.category.isin(beverage_categories.name)).df() +``` + +You can learn more about the available expressions on the [ibis for SQL users](https://ibis-project.org/tutorials/ibis-for-sql-users) page. + +:::note +Keep in mind that you can only use methods that modify the executed query and none of the methods ibis provides for fetching data. Fetching is done with the same methods defined on the regular relations explained above. If you need full native ibis integration, please read the ibis section in the advanced part further down. Additionally, not all ibis expressions may be supported by all destinations and SQL dialects. +::: + ## Supported destinations All SQL and filesystem destinations supported by `dlt` can utilize this data access interface. For filesystem destinations, `dlt` [uses **DuckDB** under the hood](./sql-client.md#the-filesystem-sql-client) to create views from Parquet or JSONL files dynamically. This allows you to query data stored in files using the same interface as you would with SQL databases. If you plan on accessing data in buckets or the filesystem a lot this way, it is advised to load data as Parquet instead of JSONL, as **DuckDB** is able to only load the parts of the data actually needed for the query to work. diff --git a/poetry.lock b/poetry.lock index 6232b383c8..749979439d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -776,7 +776,7 @@ files = [ name = "atpublic" version = "5.0" description = "Keep all y'all's __all__'s in sync" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "atpublic-5.0-py3-none-any.whl", hash = "sha256:b651dcd886666b1042d1e38158a22a4f2c267748f4e97fde94bc492a4a28a3f3"}, @@ -1755,7 +1755,7 @@ PyYAML = ">=3.11" name = "clickhouse-connect" version = "0.7.8" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" -optional = true +optional = false python-versions = "~=3.8" files = [ {file = "clickhouse-connect-0.7.8.tar.gz", hash = "sha256:dad10ba90eabfe215dfb1fef59f2821a95c752988e66f1093ca8590a51539b8f"}, @@ -2242,7 +2242,7 @@ urllib3 = ">=1.0" name = "db-dtypes" version = "1.3.0" description = "Pandas Data Types for SQL systems (BigQuery, Spanner)" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "db_dtypes-1.3.0-py2.py3-none-any.whl", hash = "sha256:7e65c59f849ccbe6f7bc4d0253edcc212a7907662906921caba3e4aadd0bc277"}, @@ -3526,7 +3526,7 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-bigquery-storage" version = "2.27.0" description = "Google Cloud Bigquery Storage API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google_cloud_bigquery_storage-2.27.0-py2.py3-none-any.whl", hash = "sha256:3bfa8f74a61ceaffd3bfe90be5bbef440ad81c1c19ac9075188cccab34bffc2b"}, @@ -4504,63 +4504,64 @@ files = [ [[package]] name = "ibis-framework" -version = "10.0.0.dev256" +version = "9.5.0" description = "The portable Python dataframe library" -optional = true +optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "ibis_framework-10.0.0.dev256-py3-none-any.whl", hash = "sha256:d6f21278e6fd78920bbe986df2c871921142635cc4f7d5d2048cae26e307a3df"}, - {file = "ibis_framework-10.0.0.dev256.tar.gz", hash = 
"sha256:e9f97d8177fd88f4a3578be20519c1da79a6a7ffac678b46b790bfde67405930"}, + {file = "ibis_framework-9.5.0-py3-none-any.whl", hash = "sha256:145fe30d94f111cff332580c275ce77725c5ff7086eede93af0b371649d009c0"}, + {file = "ibis_framework-9.5.0.tar.gz", hash = "sha256:1c8a29277e63ee0dfc289bc8f550164b5e3bdaec1b76b62436c37d331bb4ef84"}, ] [package.dependencies] atpublic = ">=2.3,<6" clickhouse-connect = {version = ">=0.5.23,<1", extras = ["arrow", "numpy", "pandas"], optional = true, markers = "extra == \"clickhouse\""} db-dtypes = {version = ">=0.3,<2", optional = true, markers = "extra == \"bigquery\""} -duckdb = {version = ">=0.10,<1.2", optional = true, markers = "extra == \"duckdb\""} +duckdb = {version = ">=0.8.1,<1.2", optional = true, markers = "extra == \"duckdb\""} google-cloud-bigquery = {version = ">=3,<4", optional = true, markers = "extra == \"bigquery\""} google-cloud-bigquery-storage = {version = ">=2,<3", optional = true, markers = "extra == \"bigquery\""} -numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"duckdb\" or extra == \"oracle\" or extra == \"polars\" or extra == \"pyspark\""} -pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == 
\"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"dask\" or extra == \"duckdb\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"pyspark\""} +pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} parsy = ">=2,<3" psycopg2 = {version = ">=2.8.4,<3", optional = true, markers = "extra == \"postgres\" or extra == \"risingwave\""} -pyarrow = {version = ">=10.0.1,<19", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == 
\"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow = {version = ">=10.0.1,<18", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} pydata-google-auth = {version = ">=1.4.0,<2", optional = true, markers = "extra == \"bigquery\""} pyodbc = {version = ">=4.0.39,<6", optional = true, markers = "extra == \"mssql\""} python-dateutil = ">=2.8.2,<3" pytz = ">=2022.7" -rich = {version = ">=12.4.4,<14", optional = true, 
markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} snowflake-connector-python = {version = ">=3.0.2,<3.3.0b1 || >3.3.0b1,<4", optional = true, markers = "extra == \"snowflake\""} -sqlglot = ">=23.4,<25.30" -toolz = ">=0.11,<2" +sqlglot = ">=23.4,<25.21" +toolz = ">=0.11,<1" typing-extensions = ">=4.3.0,<5" [package.extras] -bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] -clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -databricks = ["databricks-sql-connector-core (>=4,<5)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -datafusion = ["datafusion (>=0.6,<43)", "numpy (>=1.23.2,<3)", "pandas 
(>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] +clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +dask = ["dask[array,dataframe] (>=2022.9.1,<2024.3.0)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +datafusion = ["datafusion (>=0.6,<41)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] decompiler = ["black (>=22.1.0,<25)"] deltalake = ["deltalake (>=0.9.0,<1)"] -druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] -duckdb = ["duckdb (>=0.10,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] +duckdb = ["duckdb (>=0.8.1,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] examples = ["pins[gcs] (>=0.8.3,<1)"] -exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] -flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow 
(>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] +flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] geospatial = ["geoarrow-types (>=0.2,<1)", "geopandas (>=0.6,<2)", "pyproj (>=3.3.0,<4)", "shapely (>=2,<3)"] -impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] -mysql = ["mysqlclient (>=2.2.4,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] -risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", 
"snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] -sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] -trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "trino (>=0.321,<1)"] +impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] +mysql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pymysql (>=1,<2)", "rich (>=12.4.4,<14)"] +oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pandas = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] +risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix 
(>=0.4,<1)", "rich (>=12.4.4,<14)", "snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] +sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "trino (>=0.321,<1)"] visualization = ["graphviz (>=0.16,<1)"] [[package]] @@ -5212,7 +5213,7 @@ source = ["Cython (>=0.29.35)"] name = "lz4" version = "4.3.3" description = "LZ4 Bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, @@ -6645,7 +6646,7 @@ future = "*" name = "parsy" version = "2.1" description = "Easy-to-use parser combinators, for parsing in pure Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "parsy-2.1-py3-none-any.whl", hash = "sha256:8f18e7b11985e7802e7e3ecbd8291c6ca243d29820b1186e4c84605db4efffa0"}, @@ -7080,7 +7081,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "psycopg2" version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, @@ -7243,7 +7244,7 @@ test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] name = "pyarrow-hotfix" version = "0.6" description = "" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, @@ -7456,7 +7457,7 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" name = "pydata-google-auth" version = "1.9.0" description = "PyData helpers for 
authenticating to Google APIs" -optional = true +optional = false python-versions = ">=3.9" files = [ {file = "pydata-google-auth-1.9.0.tar.gz", hash = "sha256:2f546e88f007dfdb050087556eb46d6008e351386a7b368096797fae5df374f2"}, @@ -9265,13 +9266,13 @@ typing-extensions = "*" [[package]] name = "sqlglot" -version = "25.24.5" +version = "25.20.2" description = "An easily customizable SQL parser and transpiler" optional = false python-versions = ">=3.7" files = [ - {file = "sqlglot-25.24.5-py3-none-any.whl", hash = "sha256:f8a8870d1f5cdd2e2dc5c39a5030a0c7b0a91264fb8972caead3dac8e8438873"}, - {file = "sqlglot-25.24.5.tar.gz", hash = "sha256:6d3d604034301ca3b614d6b4148646b4033317b7a93d1801e9661495eb4b4fcf"}, + {file = "sqlglot-25.20.2-py3-none-any.whl", hash = "sha256:cdbfd7ce3f2f39f32bd7b4c23fd9e0fd261636a6b14285b914e8def25fd0a567"}, + {file = "sqlglot-25.20.2.tar.gz", hash = "sha256:169fe8308dd70d7bd40117b2221b62bdc7c4e2ea8eb07394b2a6146cdedf05ab"}, ] [package.extras] @@ -9648,13 +9649,13 @@ files = [ [[package]] name = "toolz" -version = "1.0.0" +version = "0.12.1" description = "List processing tools and functional utilities" -optional = true -python-versions = ">=3.8" +optional = false +python-versions = ">=3.7" files = [ - {file = "toolz-1.0.0-py3-none-any.whl", hash = "sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236"}, - {file = "toolz-1.0.0.tar.gz", hash = "sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02"}, + {file = "toolz-0.12.1-py3-none-any.whl", hash = "sha256:d22731364c07d72eea0a0ad45bafb2c2937ab6fd38a3507bf55eae8744aa7d85"}, + {file = "toolz-0.12.1.tar.gz", hash = "sha256:ecca342664893f177a13dac0e6b41cbd8ac25a358e5f215316d43e2100224f4d"}, ] [[package]] @@ -10529,7 +10530,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p name = "zstandard" version = "0.22.0" description = "Zstandard bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ 
{file = "zstandard-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:275df437ab03f8c033b8a2c181e51716c32d831082d93ce48002a5227ec93019"}, @@ -10618,4 +10619,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" +content-hash = "a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" diff --git a/pyproject.toml b/pyproject.toml index f736fc65ad..0fb7f94e36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,6 @@ pytest-mock = "^3.14.0" types-regex = "^2024.5.15.20240519" flake8-print = "^5.0.0" mimesis = "^7.0.0" -ibis-framework = { version = ">=9.0.0", markers = "python_version >= '3.10'", optional = true, extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} shapely = ">=2.0.6" [tool.poetry.group.sources] @@ -205,6 +204,12 @@ optional = true [tool.poetry.group.airflow.dependencies] apache-airflow = {version = "^2.8.0", markers = "python_version < '3.12'"} +[tool.poetry.group.ibis] +optional = true + +[tool.poetry.group.ibis.dependencies] +ibis-framework = { version = ">=9.0.0,<10.0.0", markers = "python_version >= '3.10'", extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} + [tool.poetry.group.providers] optional = true diff --git a/tests/destinations/test_readable_dbapi_dataset.py b/tests/destinations/test_readable_dbapi_dataset.py index 4745735371..bc58a18fa0 100644 --- a/tests/destinations/test_readable_dbapi_dataset.py +++ b/tests/destinations/test_readable_dbapi_dataset.py @@ -2,7 +2,7 @@ import dlt import pytest -from dlt.destinations.dataset import ( +from dlt.destinations.dataset.exceptions import ( ReadableRelationHasQueryException, ReadableRelationUnknownColumnException, ) @@ -12,44 +12,44 @@ def test_query_builder() -> None: dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() # default query for a table - 
assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # type: ignore[attr-defined] + assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # head query assert ( - dataset.my_table.head().query.strip() # type: ignore[attr-defined] + dataset.my_table.head().query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 5' ) # limit query assert ( - dataset.my_table.limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.limit(24).query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 24' ) # select columns assert ( - dataset.my_table.select("col1", "col2").query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # also indexer notation assert ( - dataset.my_table[["col1", "col2"]].query.strip() # type: ignore[attr-defined] + dataset.my_table[["col1", "col2"]].query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # identifiers are normalized assert ( - dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() == 'SELECT "co_l1","c_ol2" FROM "pipeline_dataset"."my_table"' ) assert ( - dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() == 'SELECT "co__l1","c_ol2" FROM "pipeline_dataset"."my__table"' ) # limit and select chained assert ( - dataset.my_table.select("col1", "col2").limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").limit(24).query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table" LIMIT 24' ) @@ -65,18 +65,18 @@ def test_copy_and_chaining() -> None: relation2 = relation.__copy__() assert relation != relation2 - assert relation._limit == relation2._limit # type: ignore[attr-defined] - assert 
relation._table_name == relation2._table_name # type: ignore[attr-defined] - assert relation._provided_query == relation2._provided_query # type: ignore[attr-defined] - assert relation._selected_columns == relation2._selected_columns # type: ignore[attr-defined] + assert relation._limit == relation2._limit + assert relation._table_name == relation2._table_name + assert relation._provided_query == relation2._provided_query + assert relation._selected_columns == relation2._selected_columns # test copy while chaining limit relation3 = relation2.limit(22) assert relation2 != relation3 - assert relation2._limit != relation3._limit # type: ignore[attr-defined] + assert relation2._limit != relation3._limit # test last setting prevails chaining - assert relation.limit(23).limit(67).limit(11)._limit == 11 # type: ignore[attr-defined] + assert relation.limit(23).limit(67).limit(11)._limit == 11 def test_computed_schema_columns() -> None: diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index 98642bb263..a7aa4d36e4 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ -283,8 +283,8 @@ def test_duckdb_credentials_separation( print(p1_dataset.p1_data.fetchall()) print(p2_dataset.p2_data.fetchall()) - assert "p1" in p1_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] - assert "p2" in p2_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] + assert "p1" in p1_dataset.sql_client.credentials._conn_str() + assert "p2" in p2_dataset.sql_client.credentials._conn_str() - assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 # type: ignore[attr-defined] - assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 # type: ignore[attr-defined] + assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 + assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py 
index 1a9c8a383b..d2f5f7951e 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -1,5 +1,5 @@ -from typing import Any, cast - +from typing import Any, cast, Tuple, List +import re import pytest import dlt import os @@ -20,8 +20,10 @@ ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT, clean_test_storage -from dlt.common.destination.reference import TDestinationReferenceArg -from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +from dlt.destinations.dataset.exceptions import ( + ReadableRelationUnknownColumnException, +) from tests.load.utils import drop_pipeline_data EXPECTED_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] @@ -58,6 +60,7 @@ def autouse_test_storage() -> FileStorage: @pytest.fixture(scope="session") def populated_pipeline(request, autouse_test_storage) -> Any: """fixture that returns a pipeline object populated with the example data""" + destination_config = cast(DestinationTestConfiguration, request.param) if ( @@ -104,6 +107,7 @@ def items(): columns={ "id": {"data_type": "bigint"}, "double_id": {"data_type": "bigint"}, + "di_decimal": {"data_type": "decimal", "precision": 7, "scale": 3}, }, ) def double_items(): @@ -111,6 +115,7 @@ def double_items(): { "id": i, "double_id": i * 2, + "di_decimal": Decimal("10.433"), } for i in range(total_records) ] @@ -151,6 +156,24 @@ def double_items(): ) +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_explicit_dataset_type_selection(populated_pipeline: Pipeline): + from dlt.destinations.dataset.dataset import ReadableDBAPIRelation + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + assert isinstance( + populated_pipeline._dataset(dataset_type="default").items, 
ReadableDBAPIRelation + ) + assert isinstance(populated_pipeline._dataset(dataset_type="ibis").items, ReadableIbisRelation) + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -258,71 +281,6 @@ def test_db_cursor_access(populated_pipeline: Pipeline) -> None: assert set(ids) == set(range(total_records)) -@pytest.mark.no_load -@pytest.mark.essential -@pytest.mark.parametrize( - "populated_pipeline", - configs, - indirect=True, - ids=lambda x: x.name, -) -def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: - # NOTE: we could generalize this with a context for certain deps - import subprocess - - subprocess.check_call( - ["pip", "install", "ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]"] - ) - - from dlt.common.libs.ibis import SUPPORTED_DESTINATIONS - - # check correct error if not supported - if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: - with pytest.raises(NotImplementedError): - populated_pipeline._dataset().ibis() - return - - total_records = _total_records(populated_pipeline) - ibis_connection = populated_pipeline._dataset().ibis() - - map_i = lambda x: x - if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": - map_i = lambda x: x.upper() - - dataset_name = map_i(populated_pipeline.dataset_name) - table_like_statement = None - table_name_prefix = "" - addtional_tables = [] - - # clickhouse has no datasets, but table prefixes and a sentinel table - if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": - table_like_statement = dataset_name + "." 
- table_name_prefix = dataset_name + "___" - dataset_name = None - addtional_tables = ["dlt_sentinel_table"] - - add_table_prefix = lambda x: table_name_prefix + x - - # just do a basic check to see wether ibis can connect - assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { - add_table_prefix(map_i(x)) - for x in ( - [ - "_dlt_loads", - "_dlt_pipeline_state", - "_dlt_version", - "double_items", - "items", - "items__children", - ] - + addtional_tables - ) - } - - items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) - assert items_table.count().to_pandas() == total_records - - @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -332,7 +290,8 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_hint_preservation(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + # NOTE: for now hints are only preserved for the default dataset + table_relationship = populated_pipeline._dataset(dataset_type="default").items # check that hints are carried over to arrow table expected_decimal_precision = 10 expected_decimal_precision_2 = 12 @@ -425,8 +384,7 @@ def test_limit_and_head(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_column_selection(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items - + table_relationship = populated_pipeline._dataset(dataset_type="default").items columns = ["_dlt_load_id", "other_decimal"] data_frame = table_relationship.select(*columns).head().df() assert [v.lower() for v in data_frame.columns.values] == columns @@ -479,6 +437,266 @@ def test_schema_arg(populated_pipeline: Pipeline) -> None: assert "items" in dataset.schema.tables +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) 
+def test_ibis_expression_relation(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + import ibis # type: ignore + + # now we should get the more powerful ibis relation + dataset = populated_pipeline._dataset() + total_records = _total_records(populated_pipeline) + + items_table = dataset["items"] + double_items_table = dataset["double_items"] + + # check full table access + df = items_table.df() + assert len(df.index) == total_records + + df = double_items_table.df() + assert len(df.index) == total_records + + # check limit + df = items_table.limit(5).df() + assert len(df.index) == 5 + + # check chained expression with join, column selection, order by and limit + joined_table = ( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + .order_by("id") + .limit(20) + ) + table = joined_table.fetchall() + assert len(table) == 20 + assert list(table[0]) == [0, 0] + assert list(table[5]) == [5, 10] + assert list(table[10]) == [10, 20] + + # check aggregate of first 20 items + agg_table = items_table.order_by("id").limit(20).aggregate(sum_id=items_table.id.sum()) + assert agg_table.fetchone()[0] == reduce(lambda a, b: a + b, range(20)) + + # check filtering + filtered_table = items_table.filter(items_table.id < 10) + assert len(filtered_table.fetchall()) == 10 + + if populated_pipeline.destination.destination_type != "dlt.destinations.duckdb": + return + + # we check a bunch of expressions without executing them to see that they produce correct sql + # also we return the keys of the disovered schema columns + def sql_from_expr(expr: Any) -> Tuple[str, List[str]]: + query = str(expr.query).replace(populated_pipeline.dataset_name, "dataset") + columns = list(expr.columns_schema.keys()) if expr.columns_schema else None + return re.sub(r"\s+", " ", query), columns + + # test all functions discussed here: https://ibis-project.org/tutorials/ibis-for-sql-users + 
ALL_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] + + # selecting two columns + assert sql_from_expr(items_table.select("id", "decimal")) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # selecting all columns + assert sql_from_expr(items_table) == ('SELECT * FROM "dataset"."items"', ALL_COLUMNS) + + # selecting two other columns via item getter + assert sql_from_expr(items_table["id", "decimal"]) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # adding a new columns + new_col = (items_table.id * 2).name("new_col") + assert sql_from_expr(items_table.select("id", "decimal", new_col)) == ( + ( + 'SELECT "t0"."id", "t0"."decimal", "t0"."id" * 2 AS "new_col" FROM' + ' "dataset"."items" AS "t0"' + ), + None, + ) + + # mutating table (add a new column computed from existing columns) + assert sql_from_expr( + items_table.mutate(double_id=items_table.id * 2).select("id", "double_id") + ) == ( + 'SELECT "t0"."id", "t0"."id" * 2 AS "double_id" FROM "dataset"."items" AS "t0"', + None, + ) + + # mutating table add new static column + assert sql_from_expr( + items_table.mutate(new_col=ibis.literal("static_value")).select("id", "new_col") + ) == ('SELECT "t0"."id", \'static_value\' AS "new_col" FROM "dataset"."items" AS "t0"', None) + + # check filtering (preserves all columns) + assert sql_from_expr(items_table.filter(items_table.id < 10)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ALL_COLUMNS, + ) + + # filtering and selecting a single column + assert sql_from_expr(items_table.filter(items_table.id < 10).select("id")) == ( + 'SELECT "t0"."id" FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ["id"], + ) + + # check filter "and" condition + assert sql_from_expr(items_table.filter(items_table.id < 10).filter(items_table.id > 5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10 AND "t0"."id" 
> 5', + ALL_COLUMNS, + ) + + # check filter "or" condition + assert sql_from_expr(items_table.filter((items_table.id < 10) | (items_table.id > 5))) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE ( "t0"."id" < 10 ) OR ( "t0"."id" > 5 )', + ALL_COLUMNS, + ) + + # check group by and aggregate + assert sql_from_expr( + items_table.group_by("id") + .having(items_table.count() >= 1000) + .aggregate(sum_id=items_table.id.sum()) + ) == ( + ( + 'SELECT "t1"."id", "t1"."sum_id" FROM ( SELECT "t0"."id", SUM("t0"."id") AS "sum_id",' + ' COUNT(*) AS "CountStar(items)" FROM "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1"' + ' WHERE "t1"."CountStar(items)" >= 1000' + ), + None, + ) + + # sorting and ordering + assert sql_from_expr(items_table.order_by("id", "decimal").limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # sort desc and asc + assert sql_from_expr(items_table.order_by(ibis.desc("id"), ibis.asc("decimal")).limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" DESC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # offset and limit + assert sql_from_expr(items_table.order_by("id").limit(10, offset=5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC LIMIT 10 OFFSET 5', + ALL_COLUMNS, + ) + + # join + assert sql_from_expr( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + ) == ( + ( + 'SELECT "t2"."id", "t3"."double_id" FROM "dataset"."items" AS "t2" INNER JOIN' + ' "dataset"."double_items" AS "t3" ON "t2"."id" = "t3"."id"' + ), + None, + ) + + # subqueries + assert sql_from_expr( + items_table.filter(items_table.decimal.isin(double_items_table.di_decimal)) + ) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."decimal" IN ( SELECT' + ' "t1"."di_decimal" FROM "dataset"."double_items" AS "t1" )' + ), + ALL_COLUMNS, + ) + + # topk + 
assert sql_from_expr(items_table.decimal.topk(10)) == ( + ( + 'SELECT * FROM ( SELECT "t0"."decimal", COUNT(*) AS "CountStar(items)" FROM' + ' "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1" ORDER BY "t1"."CountStar(items)" DESC' + " LIMIT 10" + ), + None, + ) + + +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + + from dlt.helpers.ibis import SUPPORTED_DESTINATIONS + + # check correct error if not supported + if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: + with pytest.raises(NotImplementedError): + populated_pipeline._dataset().ibis() + return + + total_records = _total_records(populated_pipeline) + ibis_connection = populated_pipeline._dataset().ibis() + + map_i = lambda x: x + if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": + map_i = lambda x: x.upper() + + dataset_name = map_i(populated_pipeline.dataset_name) + table_like_statement = None + table_name_prefix = "" + addtional_tables = [] + + # clickhouse has no datasets, but table prefixes and a sentinel table + if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": + table_like_statement = dataset_name + "." 
+ table_name_prefix = dataset_name + "___" + dataset_name = None + addtional_tables = ["dlt_sentinel_table"] + + add_table_prefix = lambda x: table_name_prefix + x + + # just do a basic check to see wether ibis can connect + assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { + add_table_prefix(map_i(x)) + for x in ( + [ + "_dlt_loads", + "_dlt_pipeline_state", + "_dlt_version", + "double_items", + "items", + "items__children", + ] + + addtional_tables + ) + } + + items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) + assert items_table.count().to_pandas() == total_records + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -546,6 +764,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: assert dataset.schema.name == "unknown_dataset" assert "items" not in dataset.schema.tables + # NOTE: this breaks the following test, it will need to be fixed somehow # create a newer schema with different name and see wether this is loaded from dlt.common.schema import Schema from dlt.common.schema import utils From 4e5a2405e23c7dfae89903327569ae31fb535d4b Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:35:59 +0400 Subject: [PATCH 05/12] `iceberg` table format support for `filesystem` destination (#2067) * add pyiceberg dependency and upgrade mypy - mypy upgrade needed to solve this issue: https://github.com/apache/iceberg-python/issues/768 - uses <1.13.0 requirement on mypy because 1.13.0 gives error - new lint errors arising due to version upgrade are simply ignored * extend pyiceberg dependencies * remove redundant delta annotation * add basic local filesystem iceberg support * add active table format setting * disable merge tests for iceberg table format * restore non-redundant extra info * refactor to in-memory iceberg catalog * add s3 support for iceberg table format * add 
schema evolution support for iceberg table format * extract _register_table function * add partition support for iceberg table format * update docstring * enable child table test for iceberg table format * enable empty source test for iceberg table format * make iceberg catalog namespace configurable and default to dataset name * add optional typing * fix typo * improve typing * extract logic into dedicated function * add iceberg read support to filesystem sql client * remove unused import * add todo * extract logic into separate functions * add azure support for iceberg table format * generalize delta table format tests * enable get tables function test for iceberg table format * remove ignores * undo table directory management change * enable test_read_interfaces tests for iceberg * fix active table format filter * use mixin for object store rs credentials * generalize catalog typing * extract pyiceberg scheme mapping into separate function * generalize credentials mixin test setup * remove unused import * add centralized fallback to append when merge is not supported * Revert "add centralized fallback to append when merge is not supported" This reverts commit 54cd0bcebffad15d522e734da321c602f4bd7461. * fall back to append if merge is not supported on filesystem * fix test for s3-compatible storage * remove obsolete code path * exclude gcs read interface tests for iceberg * add gcs support for iceberg table format * switch to UnsupportedAuthenticationMethodException * add iceberg table format docs * use shorter pipeline name to prevent too long sql identifiers * add iceberg catalog note to docs * black format * use shorter pipeline name to prevent too long sql identifiers * correct max id length for sqlalchemy mysql dialect * Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit 6cce03b77111825b0714597e6d494df97145f0f2. 
* Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit ef29aa7c2fdba79441573850c7d15b83526c011a. * replace show with execute to prevent useless print output * add abfss scheme to test * remove az support for iceberg table format * remove iceberg bucket test exclusion * add note to docs on azure scheme support for iceberg table format * exclude iceberg from duckdb s3-compatibility test * disable pyiceberg info logs for tests * extend table format docs and move into own page * upgrade adlfs to enable account_host attribute * Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996-iceberg-filesystem * fix lint errors * re-add pyiceberg dependency * enabled iceberg in dbt-duckdb * upgrade pyiceberg version * remove pyiceberg mypy errors across python version * does not install airflow group for dev * fixes gcp oauth iceberg credentials handling * fixes ca cert bundle duckdb azure on ci * allow for airflow dep to be present during type check --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destinations.yml | 9 +- .github/workflows/test_local_destinations.yml | 5 +- Makefile | 2 +- dlt/cli/source_detection.py | 3 +- .../configuration/specs/aws_credentials.py | 15 +- .../configuration/specs/azure_credentials.py | 22 +- .../configuration/specs/base_configuration.py | 2 +- .../specs/config_providers_context.py | 7 +- dlt/common/configuration/specs/exceptions.py | 4 + .../configuration/specs/gcp_credentials.py | 36 +- dlt/common/configuration/specs/mixins.py | 24 ++ dlt/common/data_writers/buffered.py | 2 +- dlt/common/destination/utils.py | 2 +- dlt/common/libs/deltalake.py | 6 +- dlt/common/libs/pyiceberg.py | 192 +++++++++ dlt/common/logger.py | 2 +- dlt/common/metrics.py | 2 +- dlt/common/reflection/utils.py | 14 +- dlt/common/schema/schema.py | 2 +- dlt/common/typing.py | 2 +- dlt/destinations/impl/filesystem/factory.py | 4 +- .../impl/filesystem/filesystem.py | 86 +++- 
.../impl/filesystem/sql_client.py | 27 +- dlt/destinations/impl/sqlalchemy/factory.py | 3 + dlt/extract/incremental/lag.py | 2 +- dlt/helpers/airflow_helper.py | 4 +- dlt/helpers/dbt/profiles.yml | 1 + .../destinations/delta-iceberg.md | 168 ++++++++ .../dlt-ecosystem/destinations/filesystem.md | 113 +---- .../dlt-ecosystem/table-formats/iceberg.md | 2 +- .../dataset-access/ibis-backend.md | 3 +- docs/website/sidebars.js | 1 + mypy.ini | 6 + poetry.lock | 154 +++++-- pyproject.toml | 11 +- tests/conftest.py | 3 + tests/libs/test_csv_writer.py | 4 +- ...dentials.py => test_credentials_mixins.py} | 169 +++++--- tests/load/filesystem/test_sql_client.py | 18 +- .../load/pipeline/test_filesystem_pipeline.py | 393 +++++++++++------- .../sql_database/test_sql_database_source.py | 5 +- tests/load/utils.py | 33 +- tests/pipeline/utils.py | 13 + .../helpers/rest_client/test_client.py | 2 +- tests/utils.py | 7 + 45 files changed, 1163 insertions(+), 422 deletions(-) create mode 100644 dlt/common/configuration/specs/mixins.py create mode 100644 dlt/common/libs/pyiceberg.py create mode 100644 docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md rename tests/load/filesystem/{test_object_store_rs_credentials.py => test_credentials_mixins.py} (50%) diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index cfd0a3bd56..84a8f95d71 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -77,8 +77,13 @@ jobs: # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift - name: Install dependencies - # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E 
s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: enable certificates for azure and duckdb + run: sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 6f44e5fd5a..706bae1b0c 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,10 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/Makefile b/Makefile index 2a7f6dac0a..0ca8a2e0c3 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow + poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk lint: ./tools/check-package.sh diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index 
7067f8b896..0769605d01 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -29,8 +29,7 @@ def find_call_arguments_to_replace( if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): raise CliCommandInnerException( "init", - f"The pipeline script {init_script_name} must pass the {t_arg_name} as" - f" string to '{arg_name}' function in line {dn_node.lineno}", + f"The pipeline script {init_script_name} must pass the {t_arg_name} as string to '{arg_name}' function in line {dn_node.lineno}", # type: ignore[attr-defined] ) else: transformed_nodes.append((dn_node, ast.Constant(value=t_value, kind=None))) diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index 5f69be6a33..a75cd85225 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.configuration.specs.exceptions import ( InvalidBoto3Session, ObjectStoreRsCredentialsException, @@ -16,7 +17,9 @@ @configspec -class AwsCredentialsWithoutDefaults(CredentialsConfiguration): +class AwsCredentialsWithoutDefaults( + CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig +): # credentials without boto implementation aws_access_key_id: str = None aws_secret_access_key: TSecretStrValue = None @@ -77,6 +80,16 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: return creds + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "s3.access-key-id": self.aws_access_key_id, + "s3.secret-access-key": self.aws_secret_access_key, + "s3.session-token": self.aws_session_token, + "s3.region": self.region_name, + "s3.endpoint": self.endpoint_url, + "s3.connect-timeout": 300, + } + @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, 
CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index cf6ec493de..aabd0b471a 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt import version from dlt.common.utils import without_none @@ -15,7 +16,7 @@ @configspec -class AzureCredentialsBase(CredentialsConfiguration): +class AzureCredentialsBase(CredentialsConfiguration, WithObjectStoreRsCredentials): azure_storage_account_name: str = None azure_account_host: Optional[str] = None """Alternative host when accessing blob storage endpoint ie. my_account.dfs.core.windows.net""" @@ -32,7 +33,7 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: @configspec -class AzureCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): """Credentials for Azure Blob Storage, compatible with adlfs""" azure_storage_account_key: Optional[TSecretStrValue] = None @@ -49,6 +50,13 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: account_host=self.azure_account_host, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.account-key": self.azure_storage_account_key, + "adlfs.sas-token": self.azure_storage_sas_token, + } + def create_sas_token(self) -> None: try: from azure.storage.blob import generate_account_sas, ResourceTypes @@ -72,7 +80,7 @@ def on_partial(self) -> None: @configspec -class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): azure_tenant_id: str = None azure_client_id: str = None azure_client_secret: 
TSecretStrValue = None @@ -86,6 +94,14 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: client_secret=self.azure_client_secret, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.tenant-id": self.azure_tenant_id, + "adlfs.client-id": self.azure_client_id, + "adlfs.client-secret": self.azure_client_secret, + } + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 8d913d0542..41d1d7a0ca 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. Dunders should not be resolved and are not returned""" return { - f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] + f.name: eval(f.type) if isinstance(f.type, str) else f.type for f in cls._get_resolvable_dataclass_fields() } diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py index 5d1a5b7f26..a244ab571f 100644 --- a/dlt/common/configuration/specs/config_providers_context.py +++ b/dlt/common/configuration/specs/config_providers_context.py @@ -1,5 +1,4 @@ import contextlib -import dataclasses import io from typing import ClassVar, List @@ -8,10 +7,6 @@ ConfigProvider, ContextProvider, ) -from dlt.common.configuration.specs.base_configuration import ( - ContainerInjectableContext, - NotResolved, -) from dlt.common.configuration.specs import ( GcpServiceAccountCredentials, BaseConfiguration, @@ -137,7 +132,7 @@ def _airflow_providers() -> List[ConfigProvider]: # check if we are in task context and provide more 
info from airflow.operators.python import get_current_context # noqa - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] # log outside of stderr/out redirect if secrets_toml_var is None: diff --git a/dlt/common/configuration/specs/exceptions.py b/dlt/common/configuration/specs/exceptions.py index 928e46a8a0..fe87ef24d7 100644 --- a/dlt/common/configuration/specs/exceptions.py +++ b/dlt/common/configuration/specs/exceptions.py @@ -72,3 +72,7 @@ def __init__(self, spec: Type[Any], native_value: Any): class ObjectStoreRsCredentialsException(ConfigurationException): pass + + +class UnsupportedAuthenticationMethodException(ConfigurationException): + pass diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 60ab1d4b56..17519b032a 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -11,7 +11,9 @@ InvalidGoogleServicesJson, NativeValueError, OAuth2ScopesRequired, + UnsupportedAuthenticationMethodException, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny, TSecretStrValue, StrAny from dlt.common.configuration.specs.base_configuration import ( @@ -23,7 +25,7 @@ @configspec -class GcpCredentials(CredentialsConfiguration): +class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig): token_uri: Final[str] = dataclasses.field( default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False ) @@ -126,6 +128,12 @@ def to_native_credentials(self) -> Any: else: return ServiceAccountCredentials.from_service_account_info(self) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + 
"Service Account authentication not supported with `iceberg` table format. Use OAuth" + " authentication instead." + ) + def __str__(self) -> str: return f"{self.client_email}@{self.project_id}" @@ -176,11 +184,19 @@ def to_native_representation(self) -> str: return json.dumps(self._info_dict()) def to_object_store_rs_credentials(self) -> Dict[str, str]: - raise NotImplementedError( - "`object_store` Rust crate does not support OAuth for GCP credentials. Reference:" - " https://docs.rs/object_store/latest/object_store/gcp." + raise UnsupportedAuthenticationMethodException( + "OAuth authentication not supported with `delta` table format. Use Service Account or" + " Application Default Credentials authentication instead." ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + self.auth() + return { + "gcs.project-id": self.project_id, + "gcs.oauth2.token": self.token, + "gcs.oauth2.token-expires-at": (pendulum.now().timestamp() + 60) * 1000, + } + def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None: if not self.refresh_token: self.add_scopes(scopes) @@ -313,6 +329,12 @@ def to_native_credentials(self) -> Any: else: return super().to_native_credentials() + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Application Default Credentials authentication not supported with `iceberg` table" + " format. Use OAuth authentication instead." 
+ ) + @configspec class GcpServiceAccountCredentials( @@ -334,3 +356,9 @@ def parse_native_representation(self, native_value: Any) -> None: except NativeValueError: pass GcpOAuthCredentialsWithoutDefaults.parse_native_representation(self, native_value) + + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + if self.has_default_credentials(): + return GcpDefaultCredentials.to_pyiceberg_fileio_config(self) + else: + return GcpOAuthCredentialsWithoutDefaults.to_pyiceberg_fileio_config(self) diff --git a/dlt/common/configuration/specs/mixins.py b/dlt/common/configuration/specs/mixins.py new file mode 100644 index 0000000000..2f843aee5b --- /dev/null +++ b/dlt/common/configuration/specs/mixins.py @@ -0,0 +1,24 @@ +from typing import Dict, Any +from abc import abstractmethod, ABC + + +class WithObjectStoreRsCredentials(ABC): + @abstractmethod + def to_object_store_rs_credentials(self) -> Dict[str, Any]: + """Returns credentials dictionary for object_store Rust crate. + + Can be used for libraries that build on top of the object_store crate, such as `deltalake`. + + https://docs.rs/object_store/latest/object_store/ + """ + pass + + +class WithPyicebergConfig(ABC): + @abstractmethod + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + """Returns `pyiceberg` FileIO configuration dictionary. 
+ + https://py.iceberg.apache.org/configuration/#fileio + """ + pass diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index e2b6c9a442..6ef431a4d0 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index 0bad5b152e..c98344b687 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) ) table_name_lookup: DictStrStr = {} # name collision explanation diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index 4047bc3a1a..0f938e7102 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -10,6 +10,7 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: @@ -191,10 +192,9 @@ def get_delta_tables( def 
_deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str]: """Returns dict that can be passed as `storage_options` in `deltalake` library.""" - creds = {} # type: ignore + creds = {} extra_options = {} - # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery - if hasattr(config.credentials, "to_object_store_rs_credentials"): + if isinstance(config.credentials, WithObjectStoreRsCredentials): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py new file mode 100644 index 0000000000..19ce9abbf2 --- /dev/null +++ b/dlt/common/libs/pyiceberg.py @@ -0,0 +1,192 @@ +from typing import Dict, Any, List, Optional + +from dlt import version, Pipeline +from dlt.common.libs.pyarrow import cast_arrow_schema_types +from dlt.common.schema.typing import TWriteDisposition +from dlt.common.utils import assert_min_pkg_version +from dlt.common.exceptions import MissingDependencyException +from dlt.common.storages.configuration import FileSystemCredentials +from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs.mixins import WithPyicebergConfig +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + +try: + from pyiceberg.table import Table as IcebergTable + from pyiceberg.catalog import MetastoreCatalog + import pyarrow as pa +except ModuleNotFoundError: + raise MissingDependencyException( + "dlt pyiceberg helpers", + [f"{version.DLT_PKG_NAME}[pyiceberg]"], + "Install `pyiceberg` so dlt can create Iceberg tables in the `filesystem` destination.", + ) + + +def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema: + ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = { + pa.types.is_time: pa.string(), + pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to 
decimal128 + } + return cast_arrow_schema_types(schema, ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP) + + +def ensure_iceberg_compatible_arrow_data(data: pa.Table) -> pa.Table: + schema = ensure_iceberg_compatible_arrow_schema(data.schema) + return data.cast(schema) + + +def write_iceberg_table( + table: IcebergTable, + data: pa.Table, + write_disposition: TWriteDisposition, +) -> None: + if write_disposition == "append": + table.append(ensure_iceberg_compatible_arrow_data(data)) + elif write_disposition == "replace": + table.overwrite(ensure_iceberg_compatible_arrow_data(data)) + + +def get_sql_catalog(credentials: FileSystemCredentials) -> "SqlCatalog": # type: ignore[name-defined] # noqa: F821 + assert_min_pkg_version( + pkg_name="sqlalchemy", + version="2.0.18", + msg=( + "`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination." + ), + ) + + from pyiceberg.catalog.sql import SqlCatalog + + return SqlCatalog( + "default", + uri="sqlite:///:memory:", + **_get_fileio_config(credentials), + ) + + +def create_or_evolve_table( + catalog: MetastoreCatalog, + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + # add table to catalog + table_id = f"{namespace_name}.{table_name}" + table_path = f"{client.dataset_path}/{table_name}" + metadata_path = f"{table_path}/metadata" + if client.fs_client.exists(metadata_path): + # found metadata; register existing table + table = _register_table(table_id, metadata_path, catalog, client) + + # evolve schema + if schema is not None: + with table.update_schema() as update: + update.union_by_name(ensure_iceberg_compatible_arrow_schema(schema)) + else: + # found no metadata; create new table + assert schema is not None + with catalog.create_table_transaction( + table_id, + schema=ensure_iceberg_compatible_arrow_schema(schema), + location=_make_path(table_path, 
client), + ) as txn: + # add partitioning + with txn.update_spec() as update_spec: + for col in partition_columns: + update_spec.add_identity(col) + + return catalog + + +def get_catalog( + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + """Returns single-table, ephemeral, in-memory Iceberg catalog.""" + + # create in-memory catalog + catalog: MetastoreCatalog = get_sql_catalog(client.config.credentials) + + # create namespace + if namespace_name is None: + namespace_name = client.dataset_name + catalog.create_namespace(namespace_name) + + # add table to catalog + catalog = create_or_evolve_table( + catalog=catalog, + client=client, + table_name=table_name, + namespace_name=namespace_name, + schema=schema, + partition_columns=partition_columns, + ) + + return catalog + + +def get_iceberg_tables( + pipeline: Pipeline, *tables: str, schema_name: Optional[str] = None +) -> Dict[str, IcebergTable]: + from dlt.common.schema.utils import get_table_format + + with pipeline.destination_client(schema_name=schema_name) as client: + assert isinstance( + client, FilesystemClient + ), "The `get_iceberg_tables` function requires a `filesystem` destination." 
+ + schema_iceberg_tables = [ + t["name"] + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "iceberg" + ] + if len(tables) > 0: + invalid_tables = set(tables) - set(schema_iceberg_tables) + if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" + raise ValueError( + f"Schema {client.schema.name} does not contain Iceberg tables with these names:" + f" {', '.join(invalid_tables)}.{available_schemas}" + ) + schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables] + + return { + name: get_catalog(client, name).load_table(f"{pipeline.dataset_name}.{name}") + for name in schema_iceberg_tables + } + + +def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]: + if isinstance(credentials, WithPyicebergConfig): + return credentials.to_pyiceberg_fileio_config() + return {} + + +def _get_last_metadata_file(metadata_path: str, client: FilesystemClient) -> str: + # TODO: implement faster way to obtain `last_metadata_file` (listing is slow) + metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] + return _make_path(sorted(metadata_files)[-1], client) + + +def _register_table( + identifier: str, + metadata_path: str, + catalog: MetastoreCatalog, + client: FilesystemClient, +) -> IcebergTable: + last_metadata_file = _get_last_metadata_file(metadata_path, client) + return catalog.register_table(identifier, last_metadata_file) + + +def _make_path(path: str, client: FilesystemClient) -> str: + # don't use file protocol for local files because duckdb does not support it + # https://github.com/duckdb/duckdb/issues/13669 + return path if client.is_local_filesystem else client.config.make_url(path) diff --git a/dlt/common/logger.py b/dlt/common/logger.py index b163c15672..634e305805 100644 --- a/dlt/common/logger.py +++ b/dlt/common/logger.py @@ -47,7 +47,7 @@ 
def is_logging() -> bool: def log_level() -> str: if not LOGGER: raise RuntimeError("Logger not initialized") - return logging.getLevelName(LOGGER.level) # type: ignore + return logging.getLevelName(LOGGER.level) def is_json_logging(log_format: str) -> bool: diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py index d6acf19d0d..2f9f574dd0 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple): created: float last_modified: float - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override] if isinstance(other, DataWriterMetrics): return DataWriterMetrics( self.file_path if self.file_path == other.file_path else "", diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index c612c5a4f1..27c7bd8758 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -90,24 +90,24 @@ def rewrite_python_script( last_line = -1 last_offset = -1 # sort transformed nodes by line and offset - for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): + for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined] # do we have a line changed - if last_line != node.lineno - 1: + if last_line != node.lineno - 1: # type: ignore[attr-defined] # add remainder from the previous line if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) + script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined] # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) + 
script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined] elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined] # replace node value script_lines.append(ast_unparse(t_value).strip()) - last_line = node.end_lineno - 1 - last_offset = node.end_col_offset + last_line = node.end_lineno - 1 # type: ignore[attr-defined] + last_offset = node.end_col_offset # type: ignore[attr-defined] # add all that was missing if last_offset >= 0: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index d6031a08fa..276bbe9c09 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -525,7 +525,7 @@ def get_new_table_columns( Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. 
Optionally includes incomplete columns (without data type)""" - casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } diff --git a/dlt/common/typing.py b/dlt/common/typing.py index a3364d1b07..8986d753f3 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -446,7 +446,7 @@ def get_generic_type_argument_from_instance( if cls_: orig_param_type = get_args(cls_)[0] if orig_param_type in (Any, CallableAny) and sample_value is not None: - orig_param_type = type(sample_value) + orig_param_type = type(sample_value) # type: ignore[assignment] return orig_param_type # type: ignore diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index 2463da58fa..906bd157e4 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -19,7 +19,7 @@ def filesystem_loader_file_format_selector( *, table_schema: TTableSchema, ) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]: - if table_schema.get("table_format") == "delta": + if table_schema.get("table_format") in ("delta", "iceberg"): return ("parquet", ["parquet"]) return (preferred_loader_file_format, supported_loader_file_formats) @@ -43,7 +43,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext.generic_capabilities( preferred_loader_file_format="jsonl", loader_file_format_selector=filesystem_loader_file_format_selector, - supported_table_formats=["delta"], + supported_table_formats=["delta", "iceberg"], supported_merge_strategies=["upsert"], merge_strategies_selector=filesystem_merge_strategies_selector, ) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 
1739c87fb3..ccf764811b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -119,16 +119,27 @@ def metrics(self) -> Optional[LoadJobMetrics]: return m._replace(remote_url=self.make_remote_url()) -class DeltaLoadFilesystemJob(FilesystemLoadJob): +class TableFormatLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: super().__init__(file_path=file_path) self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) def make_remote_path(self) -> str: - # remote path is table dir - delta will create its file structure inside it return self._job_client.get_table_dir(self.load_table_name) + @property + def arrow_dataset(self) -> Any: + from dlt.common.libs.pyarrow import pyarrow + + return pyarrow.dataset.dataset(self.file_paths) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + +class DeltaLoadFilesystemJob(TableFormatLoadFilesystemJob): def run(self) -> None: # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa @@ -138,7 +149,7 @@ def run(self) -> None: f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()} [arrow" f" buffer: {pa.total_allocated_bytes()}]" ) - source_ds = pa.dataset.dataset(self.file_paths) + source_ds = self.arrow_dataset delta_table = self._delta_table() # explicitly check if there is data @@ -148,9 +159,6 @@ def run(self) -> None: else: with source_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader if self._load_table["write_disposition"] == "merge" and delta_table is not None: - self._load_table["x-merge-strategy"] = resolve_merge_strategy( # type: ignore[typeddict-unknown-key] - self._schema.tables, self._load_table, self._job_client.capabilities - ) merge_delta_table( table=delta_table, data=arrow_rbr, @@ -188,10 +196,6 @@ def _delta_table(self) -> Optional["DeltaTable"]: # type: 
ignore[name-defined] else: return None - @property - def _partition_columns(self) -> List[str]: - return get_columns_names_with_prop(self._load_table, "partition") - def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "DeltaTable") -> "DeltaTable": # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import ( DeltaTable, @@ -211,13 +215,36 @@ def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "Delta return _evolve_delta_table_schema(delta_table, arrow_ds.schema) +class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob): + def run(self) -> None: + from dlt.common.libs.pyiceberg import write_iceberg_table + + write_iceberg_table( + table=self._iceberg_table(), + data=self.arrow_dataset.to_table(), + write_disposition=self._load_table["write_disposition"], + ) + + def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyiceberg import get_catalog + + catalog = get_catalog( + client=self._job_client, + table_name=self.load_table_name, + schema=self.arrow_dataset.schema, + partition_columns=self._partition_columns, + ) + return catalog.load_table(self.table_identifier) + + @property + def table_identifier(self) -> str: + return f"{self._job_client.dataset_name}.{self.load_table_name}" + + class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) - if self._load_table.get("table_format") == "delta": - # delta table jobs only require table chain followup jobs - pass - elif final_state == "completed": + if final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], @@ -394,6 +421,13 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema: if 
table["write_disposition"] == "merge": table["write_disposition"] = "append" table.pop("table_format", None) + merge_strategy = resolve_merge_strategy(self.schema.tables, table, self.capabilities) + if table["write_disposition"] == "merge": + if merge_strategy is None: + # no supported merge strategies, fall back to append + table["write_disposition"] = "append" + else: + table["x-merge-strategy"] = merge_strategy # type: ignore[typeddict-unknown-key] return table def get_table_dir(self, table_name: str, remote: bool = False) -> str: @@ -458,12 +492,20 @@ def create_load_job( # where we want to load the state the regular way if table["name"] == self.schema.state_table_name and not self.config.as_staging_destination: return FinalizedLoadJob(file_path) - if table.get("table_format") == "delta": - import dlt.common.libs.deltalake # assert dependencies are installed + table_format = table.get("table_format") + if table_format in ("delta", "iceberg"): # a reference job for a delta table indicates a table chain followup job if ReferenceFollowupJobRequest.is_reference_job(file_path): - return DeltaLoadFilesystemJob(file_path) + if table_format == "delta": + import dlt.common.libs.deltalake + + return DeltaLoadFilesystemJob(file_path) + elif table_format == "iceberg": + import dlt.common.libs.pyiceberg + + return IcebergLoadFilesystemJob(file_path) + # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -494,10 +536,10 @@ def should_load_data_to_staging_dataset(self, table_name: str) -> bool: def should_truncate_table_before_load(self, table_name: str) -> bool: table = self.prepare_load_table(table_name) - return ( - table["write_disposition"] == "replace" - and not table.get("table_format") == "delta" # Delta can do a logical replace - ) + return table["write_disposition"] == "replace" and not table.get("table_format") in ( + "delta", + "iceberg", + ) # Delta/Iceberg can do a logical replace # # state stuff @@ -718,7 +760,7 @@ def 
create_table_chain_completed_followup_jobs( jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs ) - if table_chain[0].get("table_format") == "delta": + if table_chain[0].get("table_format") in ("delta", "iceberg"): for table in table_chain: table_job_paths = [ job.file_path diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index d03a00b418..d39f4c3431 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -13,6 +13,7 @@ from dlt.common.destination.reference import DBApiCursor +from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS from dlt.destinations.sql_client import raise_database_error from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient @@ -169,8 +170,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non # native google storage implementation is not supported.. elif self.fs_client.config.protocol in ["gs", "gcs"]: logger.warn( - "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer. Falling" - " back to fsspec." + "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer if" + " possible (not supported when using `iceberg` table format). Falling back to" + " fsspec." 
) self._conn.register_filesystem(self.fs_client.fs_client) @@ -192,7 +194,7 @@ def open_connection(self) -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux # see duckdb docs - if self.fs_client.config.protocol in ["az", "abfss"]: + if self.fs_client.config.protocol in AZURE_BLOB_STORAGE_PROTOCOLS: self._conn.sql("SET azure_transport_option_type = 'curl';") return self._conn @@ -258,6 +260,13 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: from_statement = "" if schema_table.get("table_format") == "delta": from_statement = f"delta_scan('{resolved_folder}')" + elif schema_table.get("table_format") == "iceberg": + from dlt.common.libs.pyiceberg import _get_last_metadata_file + + self._setup_iceberg(self._conn) + metadata_path = f"{resolved_folder}/metadata" + last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client) + from_statement = f"iceberg_scan('{last_metadata_file}')" elif first_file_type == "parquet": from_statement = f"read_parquet([{resolved_files_string}])" elif first_file_type == "jsonl": @@ -267,7 +276,7 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: else: raise NotImplementedError( f"Unknown filetype {first_file_type} for table {table_name}. Currently only" - " jsonl and parquet files as well as delta tables are supported." + " jsonl and parquet files as well as delta and iceberg tables are supported." 
) # create table @@ -299,6 +308,16 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB with super().execute_query(query, *args, **kwargs) as cursor: yield cursor + @staticmethod + def _setup_iceberg(conn: duckdb.DuckDBPyConnection) -> None: + # needed to make persistent secrets work in new connection + # https://github.com/duckdb/duckdb_iceberg/issues/83 + conn.execute("FROM duckdb_secrets();") + + # `duckdb_iceberg` extension does not support autoloading + # https://github.com/duckdb/duckdb_iceberg/issues/71 + conn.execute("INSTALL iceberg; LOAD iceberg;") + def __del__(self) -> None: if self.memory_db: self.memory_db.close() diff --git a/dlt/destinations/impl/sqlalchemy/factory.py b/dlt/destinations/impl/sqlalchemy/factory.py index edd827ed00..e61ac1fb6a 100644 --- a/dlt/destinations/impl/sqlalchemy/factory.py +++ b/dlt/destinations/impl/sqlalchemy/factory.py @@ -81,6 +81,9 @@ def adjust_capabilities( caps.max_column_identifier_length = dialect.max_identifier_length caps.supports_native_boolean = dialect.supports_native_boolean if dialect.name == "mysql": + # correct max identifier length + # dialect uses 255 (max length for aliases) instead of 64 (max length of identifiers) + caps.max_identifier_length = 64 caps.format_datetime_literal = _format_mysql_datetime_literal return caps diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py index ee102a9961..dfafa2cd11 100644 --- a/dlt/extract/incremental/lag.py +++ b/dlt/extract/incremental/lag.py @@ -20,7 +20,7 @@ def _apply_lag_to_value( parsed_value = ensure_pendulum_datetime(value) if is_str else value if isinstance(parsed_value, (datetime, date)): - parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) + parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment] # go back to string or pass exact type value = parsed_value.strftime(value_format) if value_format else 
parsed_value # type: ignore[assignment] diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 99458a3949..aaa19ea97d 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -18,7 +18,7 @@ from airflow.configuration import conf from airflow.models import TaskInstance from airflow.utils.task_group import TaskGroup - from airflow.operators.dummy import DummyOperator # type: ignore + from airflow.operators.dummy import DummyOperator from airflow.operators.python import PythonOperator, get_current_context except ModuleNotFoundError: raise MissingDependencyException("Airflow", ["apache-airflow>=2.5"]) @@ -255,7 +255,7 @@ def _run( # use task logger if self.use_task_logger: - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] logger.LOGGER = ti.log # set global number of buffered items diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index a2a0014e4e..fd114478fb 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -83,6 +83,7 @@ duckdb: extensions: - httpfs - parquet + - iceberg # TODO: emit the config of duck db motherduck: diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md new file mode 100644 index 0000000000..7a056d6b40 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md @@ -0,0 +1,168 @@ +--- +title: Delta / Iceberg +description: Delta / Iceberg `dlt` destination +keywords: [delta, iceberg, destination, data warehouse] +--- + +# Delta and Iceberg table formats +`dlt` supports writing [Delta](https://delta.io/) and [Iceberg](https://iceberg.apache.org/) tables when using the [filesystem](./filesystem.md) destination. 
+ +## How it works +`dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`. + +## Iceberg single-user ephemeral catalog +`dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalogs](https://iceberg.apache.org/concepts/catalog/). These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog. + +:::caution +While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations: +- concurrent writes are not handled and may lead to corrupt table state +- we cannot guarantee that reads concurrent with writes are clean +- the latest metadata file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores +::: + +## Delta dependencies + +You need the `deltalake` package to use this format: + +```sh +pip install "dlt[deltalake]" +``` + +You also need `pyarrow>=17.0.0`: + +```sh +pip install 'pyarrow>=17.0.0' +``` + +## Iceberg dependencies + +You need Python version 3.9 or higher and the `pyiceberg` package to use this format: + +```sh +pip install "dlt[pyiceberg]" +``` + +You also need `sqlalchemy>=2.0.18`: + +```sh +pip install 'sqlalchemy>=2.0.18' +``` + +## Set table format + +Set the `table_format` argument to `delta` or `iceberg` when defining your resource: + +```py +@dlt.resource(table_format="delta") +def my_delta_resource(): + ...
+``` + +or when calling `run` on your pipeline: + +```py +pipeline.run(my_resource, table_format="delta") +``` + +:::note +`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded. +::: + + +## Table format partitioning +Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: + +```py +@dlt.resource( + table_format="delta", + columns={"foo": {"partition": True}} +) +def my_delta_resource(): + ... +``` + +:::note +Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partitioning](https://iceberg.apache.org/docs/latest/partitioning/). +::: + +:::caution +Partition evolution (changing partition columns after a table has been created) is not supported. +::: + +## Table access helper functions +You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to access native table objects. For `delta` these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects, for `iceberg` these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects. + +```py +from dlt.common.libs.deltalake import get_delta_tables +# from dlt.common.libs.pyiceberg import get_iceberg_tables + +... + +# get dictionary of DeltaTable objects +delta_tables = get_delta_tables(pipeline) + +# execute operations on DeltaTable objects +delta_tables["my_delta_table"].optimize.compact() +delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) +# delta_tables["my_delta_table"].vacuum() +# etc.
+``` + +## Table format Google Cloud Storage authentication + +Note that not all authentication methods are supported when using table formats on Google Cloud Storage: + +| Authentication method | `delta` | `iceberg` | +| -- | -- | -- | +| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ | +| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | +| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | + +:::note +The [S3-compatible](./filesystem.md#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`. +::: + +## Iceberg Azure scheme +The `az` [scheme](./filesystem.md#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` uses under the hood, currently does not support `az`. + +## Table format `merge` support (**experimental**) +The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. + +:::caution +The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**. +::: + +```py +@dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="my_primary_key", + table_format="delta" +) +def my_upsert_resource(): + ... +... +``` + +### Known limitations +- `hard_delete` hint not supported +- Deleting records from nested tables not supported + - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table.
+ +## Delta table format storage options +You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: + +```toml +[destination.filesystem] +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' +``` + +`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. + +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. + +>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. + +## Delta table format memory usage +:::caution +Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. +::: \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 9b243b9429..de3d12e8e1 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -108,7 +108,8 @@ You need to create an S3 bucket and a user who can access that bucket. 
dlt does #### Using S3 compatible storage -To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: +To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/), [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/) or [Google +Cloud Storage](https://cloud.google.com/storage/docs/interoperability), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: ```toml [destination.filesystem] @@ -166,6 +167,8 @@ Run `pip install "dlt[az]"` which will install the `adlfs` package to interface Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default; replace them with your Azure credentials. +#### Supported schemes + `dlt` supports both forms of the blob storage urls: ```toml [destination.filesystem] @@ -404,29 +407,6 @@ The filesystem destination handles the write dispositions as follows: - `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. - `merge` - falls back to `append` -### Merge with Delta table format (experimental) -The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [Delta table format](#delta-table-format). - -:::caution -The `upsert` merge strategy for the filesystem destination with Delta table format is experimental. -::: - -```py -@dlt.resource( - write_disposition={"disposition": "merge", "strategy": "upsert"}, - primary_key="my_primary_key", - table_format="delta" -) -def my_upsert_resource(): - ... -... -``` - -#### Known limitations -- `hard_delete` hint not supported -- Deleting records from nested tables not supported - - This means updates to JSON columns that involve element removals are not propagated. 
For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. - ## File compression The filesystem destination in the dlt library uses `gzip` compression by default for efficiency, which may result in the files being stored in a compressed format. This format may not be easily readable as plain text or JSON Lines (`jsonl`) files. If you encounter files that seem unreadable, they may be compressed. @@ -645,88 +625,9 @@ You can choose the following file formats: ## Supported table formats -You can choose the following table formats: -* [Delta table](../table-formats/delta.md) is supported - -### Delta table format - -You need the `deltalake` package to use this format: - -```sh -pip install "dlt[deltalake]" -``` - -You also need `pyarrow>=17.0.0`: - -```sh -pip install 'pyarrow>=17.0.0' -``` - -Set the `table_format` argument to `delta` when defining your resource: - -```py -@dlt.resource(table_format="delta") -def my_delta_resource(): - ... -``` - -:::note -`dlt` always uses Parquet as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. -::: - -:::caution -Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. -::: - -#### Delta table partitioning -A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. 
This example partitions the Delta table by the `foo` column: - -```py -@dlt.resource( - table_format="delta", - columns={"foo": {"partition": True}} -) -def my_delta_resource(): - ... -``` - -:::caution -It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match. -::: - - -#### Storage options -You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: - -```toml -[destination.filesystem] -deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' -``` - -`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. - -You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. - ->❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. - -#### `get_delta_tables` helper -You can use the `get_delta_tables` helper function to get `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects for your Delta tables: - -```py -from dlt.common.libs.deltalake import get_delta_tables - -... - -# get dictionary of DeltaTable objects -delta_tables = get_delta_tables(pipeline) - -# execute operations on DeltaTable objects -delta_tables["my_delta_table"].optimize.compact() -delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) -# delta_tables["my_delta_table"].vacuum() -# etc. 
- -``` +You can choose the following [table formats](./delta-iceberg.md): +* Delta table +* Iceberg ## Syncing of dlt state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured. diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md index 233ae0ce21..edca521e52 100644 --- a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md +++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md @@ -10,5 +10,5 @@ keywords: [iceberg, table formats] ## Supported destinations -Supported by: **Athena** +Supported by: **Athena**, **filesystem** diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md index 8f4b0fb6b6..9f9b65e9c0 100644 --- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md +++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md @@ -6,7 +6,7 @@ keywords: [data, dataset, ibis] # Ibis -Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). +Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). `dlt` provides an easy way to hand over your loaded dataset to an Ibis backend connection. 
@@ -46,4 +46,3 @@ print(table.limit(10).execute()) # Visit the ibis docs to learn more about the available methods ``` - diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 274f3e82b3..8e8c11fc09 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -167,6 +167,7 @@ const sidebars = { 'dlt-ecosystem/destinations/synapse', 'dlt-ecosystem/destinations/clickhouse', 'dlt-ecosystem/destinations/filesystem', + 'dlt-ecosystem/destinations/delta-iceberg', 'dlt-ecosystem/destinations/postgres', 'dlt-ecosystem/destinations/redshift', 'dlt-ecosystem/destinations/snowflake', diff --git a/mypy.ini b/mypy.ini index 769e84b13a..fdf0ceb1e6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -135,3 +135,9 @@ ignore_missing_imports = True [mypy-time_machine.*] ignore_missing_imports = True + +[mypy-pyiceberg.*] +ignore_missing_imports = True + +[mypy-airflow.*] +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index 749979439d..83090360b0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1543,13 +1543,13 @@ files = [ [[package]] name = "cachetools" -version = "5.3.1" +version = "5.5.0" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" files = [ - {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, - {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, ] [[package]] @@ -5872,44 +5872,49 @@ files = [ [[package]] name = "mypy" -version = "1.10.0" +version = "1.12.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = 
"mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, - {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, - {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, - {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, - {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, - {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, - {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, - {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, - {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, - {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, - {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, - {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, - {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, - {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, - {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, - {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, - {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, - {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, - {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5"}, + {file = "mypy-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1"}, + {file = "mypy-1.12.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627"}, + {file = "mypy-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66"}, + {file = "mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6"}, + {file = "mypy-1.12.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931"}, + {file = "mypy-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0"}, + {file = "mypy-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042"}, + {file = "mypy-1.12.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179"}, + {file = "mypy-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635"}, + {file = "mypy-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81"}, + {file = "mypy-1.12.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4"}, + {file = "mypy-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a"}, + {file = "mypy-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d"}, + {file = "mypy-1.12.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004"}, + {file = "mypy-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e"}, + {file = 
"mypy-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d"}, + {file = "mypy-1.12.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd"}, + {file = "mypy-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810"}, + {file = "mypy-1.12.1-py3-none-any.whl", hash = "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e"}, + {file = "mypy-1.12.1.tar.gz", hash = "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.1.0" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] @@ -7521,6 +7526,74 @@ files = [ [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyiceberg" +version = "0.8.1" +description = "Apache Iceberg is an open table format for huge analytic datasets" +optional = true +python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" +files = [ + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c121d1d3baf64510db94740ad870ae4b6eb9eb59a5ff7ecb4e96f7510666b2f"}, + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a6f14aa588a3883fc7fddc136ca75b75660b4abb0b55b4c541619953f8971e7"}, + {file = "pyiceberg-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c720c2a191ac6faf01fe4c0f4c01c64b94bf064185b0292003d42939049277c"}, + {file = "pyiceberg-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d421d6e51ac1c581cba9fce96aa6b9118cf4a02270066a7fdc9490ab5d57ece9"}, + {file = "pyiceberg-0.8.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:ae11fb0515ea0a046370e09a7f6039a7e86622ab910360eaa732f0106b8f00c7"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9488954c9eb5ce42ca6b816fc61873f219414cfdb9e9928d1c4a302702be1d89"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:44179e0fb844887b440c162279ba526dfe0e0f72d32945236528838518b55af0"}, + {file = "pyiceberg-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e121c6f5505d8ec711a1dd1690e07156cd54fb3d0844d5d991e02f1593f2708"}, + {file = "pyiceberg-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5961a288f2d4bbb2ab300c803da1bf0e70cea837e3f14b14108827cc821af252"}, + {file = "pyiceberg-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbe192324a6fb552c2fd29cab51086e21fa248ea2a0b95fbab921dede49e5a69"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:60430f0d8f6d650ed7d1893d038b847565a8e9ac135a1cc812e57d24f0482f6c"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f0f697977dac672d8b00e125836423585a97ebf59a28b865b1296a2b6ee81c51"}, + {file = "pyiceberg-0.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370de7c230970ff858f713d150164d492ba8450e771e59a0c520520b13ea6226"}, + {file = "pyiceberg-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3036ed226020d50e30648a71f968cf78bde5d6b609294508e60754e100e5ef36"}, + {file = "pyiceberg-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:9ac9555f3bd25a31059229089ae639cf738a8e8286a175cea128561ac1ed9452"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51da3a553d3a881042bf436e66a91cc2b6c4a3fea0e174cd73af2eb6ed255323"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:863f1dce7340e6ed870706a3fa4a73457178dae8529725bb80522ddcd4253afb"}, + {file = "pyiceberg-0.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:4dbf52b39080a6a2cda6a5126a74e3a88d5b206f609c128d001a728b36b81075"}, + {file = "pyiceberg-0.8.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb77d65e8efbb883c163817e4a9c373d907110ab6343c1b816b48f336955d4d7"}, + {file = "pyiceberg-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:1fcd35b7de0eddc3fd8fd0c38b98741217ef6de4eeb0e72b798b4007692aa76c"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6f0f56f8fc61bcd795f6a3d03e8ce6bee09ebaa64425eb08327e975f906d98be"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d7099c6631743ad29c451de2bebd9ed3c96c42bcb1fe5d5d5c93aec895858e3f"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6436f5a782491115f64131882a737d77c9dc0040493e1b7f9b3081ea8cf6a26"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c1d75b40a98a327f7436eb0d6187c51834c44b79adf61c6945b33645f4afbf17"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8de988fa2363e6a51b40b85b5ff1e8261cda5bfc14ac54dd4ebe58391b95acae"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:dd06c5b606011155aa0b76e7b001e30f1c40ab2fb3eeb8a0652b88629259c2bb"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8142f0dbc12dda0e6d7aaf564a3fbb0f17fc934630e7cf866773c8caaebf666"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6126ee3a46ff975f15abf2085f184591d21643bffb96330907e003eea0b63005"}, + {file = "pyiceberg-0.8.1.tar.gz", hash = "sha256:4502f0cfddf6f7cd48b9cd54016bce0ab94052b0ab01efcfa515879074f4c8e3"}, +] + +[package.dependencies] +cachetools = ">=5.5.0,<6.0.0" +click = ">=7.1.1,<9.0.0" +fsspec = ">=2023.1.0" +mmh3 = ">=4.0.0,<6.0.0" +pydantic = ">=2.0,<2.4.0 || >2.4.0,<2.4.1 || >2.4.1,<3.0" +pyparsing = ">=3.1.0,<4.0.0" +requests = ">=2.20.0,<3.0.0" +rich = 
">=10.11.0,<14.0.0" +sortedcontainers = "2.4.0" +strictyaml = ">=1.7.0,<2.0.0" +tenacity = ">=8.2.3,<10.0.0" + +[package.extras] +adlfs = ["adlfs (>=2023.1.0)"] +daft = ["getdaft (>=0.2.12)"] +duckdb = ["duckdb (>=0.5.0,<2.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +dynamodb = ["boto3 (>=1.24.59)"] +gcsfs = ["gcsfs (>=2023.1.0)"] +glue = ["boto3 (>=1.24.59)", "mypy-boto3-glue (>=1.28.18)"] +hive = ["thrift (>=0.13.0,<1.0.0)"] +pandas = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +pyarrow = ["pyarrow (>=14.0.0,<19.0.0)"] +ray = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] +s3fs = ["s3fs (>=2023.1.0)"] +snappy = ["python-snappy (>=0.6.0,<1.0.0)"] +sql-postgres = ["psycopg2-binary (>=2.9.6)", "sqlalchemy (>=2.0.18,<3.0.0)"] +sql-sqlite = ["sqlalchemy (>=2.0.18,<3.0.0)"] +zstandard = ["zstandard (>=0.13.0,<1.0.0)"] + [[package]] name = "pyjwt" version = "2.8.0" @@ -9327,6 +9400,20 @@ files = [ [package.dependencies] pbr = ">=2.0.0,<2.1.0 || >2.1.0" +[[package]] +name = "strictyaml" +version = "1.7.3" +description = "Strict, typed YAML parser" +optional = true +python-versions = ">=3.7.0" +files = [ + {file = "strictyaml-1.7.3-py3-none-any.whl", hash = "sha256:fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7"}, + {file = "strictyaml-1.7.3.tar.gz", hash = "sha256:22f854a5fcab42b5ddba8030a0e4be51ca89af0267961c8d6cfa86395586c407"}, +] + +[package.dependencies] +python-dateutil = ">=2.6.0" + [[package]] name = "sympy" version = "1.12" @@ -10606,6 +10693,7 @@ mssql = ["pyodbc"] parquet = ["pyarrow"] postgis = ["psycopg2-binary", "psycopg2cffi"] postgres = ["psycopg2-binary", "psycopg2cffi"] +pyiceberg = ["pyarrow", "pyiceberg", "sqlalchemy"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] @@ -10619,4 +10707,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = 
"a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" +content-hash = "84e8b8eccd9b8ee104a2dc08f5b83987aeb06540d61330390ce849cc1ad6acb4" diff --git a/pyproject.toml b/pyproject.toml index 0fb7f94e36..bfa830cd06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -adlfs = {version = ">=2022.4.0", optional = true} +adlfs = {version = ">=2024.7.0", optional = true} pyodbc = {version = ">=4.0.39", optional = true} qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} databricks-sql-connector = {version = ">=2.9.3", optional = true} @@ -89,6 +89,12 @@ alembic = {version = ">1.10.0", optional = true} paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } +# `sql-sqlite` extra leads to dependency conflict with `apache-airflow` because `apache-airflow` +# requires `sqlalchemy<2.0.0` while the extra requires `sqlalchemy>=2.0.18` +# https://github.com/apache/airflow/issues/28723 +# pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } +# we will rely on manual installation of `sqlalchemy>=2.0.18` instead +pyiceberg = { version = ">=0.8.1", python = ">=3.9", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] @@ -118,6 +124,7 @@ lancedb = ["lancedb", "pyarrow", "tantivy"] deltalake = ["deltalake", "pyarrow"] sql_database = ["sqlalchemy"] sqlalchemy = ["sqlalchemy", "alembic"] +pyiceberg = ["pyiceberg", "pyarrow", "sqlalchemy"] postgis = ["psycopg2-binary", "psycopg2cffi"] [tool.poetry.scripts] @@ -134,7 +141,7 @@ sqlfluff = "^2.3.2" types-deprecated = "^1.2.9.2" pytest-console-scripts = "^1.4.1" pytest = "^7.0.0" -mypy = "^1.10.0" +mypy = 
">=1.11.0,<1.13.0" flake8 = "^5.0.0" bandit = "^1.7.0" black = "^23.7.0" diff --git a/tests/conftest.py b/tests/conftest.py index 6088fa976c..a5a349f8d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,6 +120,9 @@ def _create_pipeline_instance_id(self) -> str: # disable googleapiclient logging logging.getLogger("googleapiclient.discovery_cache").setLevel("WARNING") + # disable pyiceberg logging + logging.getLogger("pyiceberg").setLevel("WARNING") + # reset and init airflow db import warnings diff --git a/tests/libs/test_csv_writer.py b/tests/libs/test_csv_writer.py index 3c30123e1c..a120cd048e 100644 --- a/tests/libs/test_csv_writer.py +++ b/tests/libs/test_csv_writer.py @@ -178,7 +178,7 @@ def test_non_utf8_binary(item_type: TestDataItemFormat) -> None: table = pq.read_table(f) else: table = data - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with pytest.raises(InvalidDataItem) as inv_ex: with get_writer(writer_type, disable_compression=True) as writer: @@ -195,7 +195,7 @@ def test_arrow_struct() -> None: @pytest.mark.parametrize("item_type", ["object", "arrow-table"]) def test_csv_writer_empty(item_type: TestDataItemFormat) -> None: - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with get_writer(writer_type, disable_compression=True) as writer: writer.write_empty_file(TABLE_UPDATE_COLUMNS_SCHEMA) diff --git a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_credentials_mixins.py similarity index 50% rename from tests/load/filesystem/test_object_store_rs_credentials.py rename to tests/load/filesystem/test_credentials_mixins.py index f23187a269..c1fb02c152 100644 --- 
a/tests/load/filesystem/test_object_store_rs_credentials.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -1,12 +1,8 @@ -"""Tests translation of `dlt` credentials into `object_store` Rust crate credentials.""" - -from typing import Any, Dict +from typing import Any, Dict, Union, Type, get_args, cast import os import json # noqa: I251 import pytest -from deltalake import DeltaTable -from deltalake.exceptions import TableNotFoundError import dlt from dlt.common.configuration import resolve_configuration @@ -23,10 +19,15 @@ from dlt.common.utils import custom_environ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs.gcp_credentials import GcpDefaultCredentials -from dlt.common.configuration.specs.exceptions import ObjectStoreRsCredentialsException +from dlt.common.configuration.specs.exceptions import ( + ObjectStoreRsCredentialsException, + UnsupportedAuthenticationMethodException, +) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from tests.load.utils import ( AZ_BUCKET, + ABFS_BUCKET, AWS_BUCKET, GCS_BUCKET, R2_BUCKET_CONFIG, @@ -34,6 +35,9 @@ ) +TCredentialsMixin = Union[WithObjectStoreRsCredentials, WithPyicebergConfig] +ALL_CREDENTIALS_MIXINS = get_args(TCredentialsMixin) + pytestmark = pytest.mark.essential if all(driver not in ALL_FILESYSTEM_DRIVERS for driver in ("az", "s3", "gs", "r2")): @@ -53,11 +57,27 @@ def fs_creds() -> Dict[str, Any]: return creds -def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool: - """Returns True if client can connect to object store, False otherwise. 
+def can_connect(bucket_url: str, credentials: TCredentialsMixin, mixin: Type[TCredentialsMixin]) -> bool: # type: ignore[return] + """Returns True if client can connect to object store, False otherwise.""" + if mixin == WithObjectStoreRsCredentials: + credentials = cast(WithObjectStoreRsCredentials, credentials) + return can_connect_object_store_rs_credentials( + bucket_url, credentials.to_object_store_rs_credentials() + ) + elif mixin == WithPyicebergConfig: + credentials = cast(WithPyicebergConfig, credentials) + return can_connect_pyiceberg_fileio_config( + bucket_url, credentials.to_pyiceberg_fileio_config() + ) + + +def can_connect_object_store_rs_credentials( + bucket_url: str, object_store_rs_credentials: Dict[str, str] +) -> bool: + # uses `deltatable` library as Python interface to `object_store` Rust crate + from deltalake import DeltaTable + from deltalake.exceptions import TableNotFoundError - Uses `deltatable` library as Python interface to `object_store` Rust crate. - """ try: DeltaTable( bucket_url, @@ -70,16 +90,40 @@ def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> return False +def can_connect_pyiceberg_fileio_config( + bucket_url: str, pyiceberg_fileio_config: Dict[str, str] +) -> bool: + from pyiceberg.table import StaticTable + + try: + StaticTable.from_metadata( + f"{bucket_url}/non_existing_metadata_file.json", + properties=pyiceberg_fileio_config, + ) + except FileNotFoundError: + # this error implies the connection was successful + # there is no Iceberg metadata file at the specified path + return True + return False + + @pytest.mark.parametrize( - "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az")] + "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az", "abfss")] ) -def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_azure_credentials_mixins( + driver: str, 
fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: + if mixin == WithPyicebergConfig and driver == "az": + pytest.skip("`pyiceberg` does not support `az` scheme") + + buckets = {"az": AZ_BUCKET, "abfss": ABFS_BUCKET} creds: AnyAzureCredentials creds = AzureServicePrincipalCredentialsWithoutDefaults( **dlt.secrets.get("destination.fsazureprincipal.credentials") ) - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # without SAS token creds = AzureCredentialsWithoutDefaults( @@ -87,18 +131,21 @@ def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any] azure_storage_account_key=fs_creds["azure_storage_account_key"], ) assert creds.azure_storage_sas_token is None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # with SAS token creds = resolve_configuration(creds) assert creds.azure_storage_sas_token is not None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("s3", "r2")] ) -def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_aws_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: AwsCredentialsWithoutDefaults if driver == "r2": @@ -112,9 +159,11 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) + if mixin == WithObjectStoreRsCredentials: 
+ assert ( + "aws_session_token" not in creds.to_object_store_rs_credentials() + ) # no auto-generated token + assert can_connect(AWS_BUCKET, creds, mixin) # AwsCredentials: no user-provided session token creds = AwsCredentials( @@ -124,24 +173,27 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) - - # exception should be raised if both `endpoint_url` and `region_name` are - # not provided - with pytest.raises(ObjectStoreRsCredentialsException): - AwsCredentials( - aws_access_key_id=fs_creds["aws_access_key_id"], - aws_secret_access_key=fs_creds["aws_secret_access_key"], - ).to_object_store_rs_credentials() - - if "endpoint_url" in object_store_rs_creds: - # TODO: make sure this case is tested on GitHub CI, e.g. by adding - # a local MinIO bucket to the set of tested buckets - if object_store_rs_creds["endpoint_url"].startswith("http://"): + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert "aws_session_token" not in object_store_rs_creds # no auto-generated token + + # exception should be raised if both `endpoint_url` and `region_name` are + # not provided + with pytest.raises(ObjectStoreRsCredentialsException): + AwsCredentials( + aws_access_key_id=fs_creds["aws_access_key_id"], + aws_secret_access_key=fs_creds["aws_secret_access_key"], + ).to_object_store_rs_credentials() + + if "endpoint_url" in object_store_rs_creds and object_store_rs_creds[ + "endpoint_url" + ].startswith("http://"): + # TODO: make sure this case is tested on GitHub CI, e.g. 
by adding + # a local MinIO bucket to the set of tested buckets assert object_store_rs_creds["aws_allow_http"] == "true" + if creds.endpoint_url is not None: # remainder of tests use session tokens # we don't run them on S3 compatible storage because session tokens # may not be available @@ -158,9 +210,10 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None # AwsCredentialsWithoutDefaults: user-provided session token creds = AwsCredentialsWithoutDefaults( @@ -170,15 +223,19 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("gs")] ) -def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_gcp_credentials_mixins( + driver, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: GcpCredentials # GcpServiceAccountCredentialsWithoutDefaults @@ 
-189,7 +246,11 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No private_key_id=fs_creds["private_key_id"], client_email=fs_creds["client_email"], ) - assert can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) # GcpDefaultCredentials @@ -197,7 +258,7 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No GcpDefaultCredentials._LAST_FAILED_DEFAULT = 0 # write service account key to JSON file - service_json = json.loads(creds.to_object_store_rs_credentials()["service_account_key"]) + service_json = json.loads(creds.to_native_representation()) path = "_secrets/service.json" os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w", encoding="utf-8") as f: @@ -206,8 +267,18 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": path}): creds = GcpDefaultCredentials() resolve_configuration(creds) - can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) - - # GcpOAuthCredentialsWithoutDefaults is currently not supported - with pytest.raises(NotImplementedError): - GcpOAuthCredentialsWithoutDefaults().to_object_store_rs_credentials() + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) + + # GcpOAuthCredentialsWithoutDefaults + creds = resolve_configuration( + GcpOAuthCredentialsWithoutDefaults(), sections=("destination", "fsgcpoauth") + ) + if mixin == WithPyicebergConfig: + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + with 
pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index ac2ada2551..a73b0f7e31 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -1,17 +1,17 @@ """Test the duckdb supported sql client for special internal features""" -from typing import Any +from typing import Optional import pytest import dlt import os import shutil -import logging from dlt import Pipeline from dlt.common.utils import uniq_id +from dlt.common.schema.typing import TTableFormat from tests.load.utils import ( destinations_configs, @@ -19,7 +19,6 @@ GCS_BUCKET, SFTP_BUCKET, MEMORY_BUCKET, - AWS_BUCKET, ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT @@ -37,7 +36,7 @@ def _run_dataset_checks( pipeline: Pipeline, destination_config: DestinationTestConfiguration, secret_directory: str, - table_format: Any = None, + table_format: Optional[TTableFormat] = None, alternate_access_pipeline: Pipeline = None, ) -> None: total_records = 200 @@ -144,6 +143,8 @@ def _external_duckdb_connection() -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux, see duckdb docs external_db.sql("SET azure_transport_option_type = 'curl';") external_db.sql(f"SET secret_directory = '{secret_directory}';") + if table_format == "iceberg": + FilesystemSqlClient._setup_iceberg(external_db) return external_db def _fs_sql_client_for_external_db( @@ -283,13 +284,13 @@ def test_read_interfaces_filesystem( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET], # NOTE: delta does not work on memory buckets ), ids=lambda x: x.name, ) -def test_delta_tables( +def test_table_formats( destination_config: 
DestinationTestConfiguration, secret_directory: str ) -> None: os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700" @@ -302,8 +303,9 @@ def test_delta_tables( # in case of gcs we use the s3 compat layer for reading # for writing we still need to use the gc authentication, as delta_rs seems to use # methods on the s3 interface that are not implemented by gcs + # s3 compat layer does not work with `iceberg` table format access_pipeline = pipeline - if destination_config.bucket_url == GCS_BUCKET: + if destination_config.bucket_url == GCS_BUCKET and destination_config.table_format != "iceberg": gcp_bucket = filesystem( GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp" ) @@ -315,7 +317,7 @@ def test_delta_tables( pipeline, destination_config, secret_directory=secret_directory, - table_format="delta", + table_format=destination_config.table_format, alternate_access_pipeline=access_pipeline, ) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 8d890642ee..c70fa5ab5d 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -2,7 +2,7 @@ import os import posixpath from pathlib import Path -from typing import Any, Callable, List, Dict, cast +from typing import Any, Callable, List, Dict, cast, Tuple from importlib.metadata import version as pkg_version from packaging.version import Version @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableFormat from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @@ -223,6 +223,48 @@ def some_source(): 
assert table.column("value").to_pylist() == [1, 2, 3, 4, 5] +# here start the `table_format` tests + + +def get_expected_actual( + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + arrow_table: "pyarrow.Table", # type: ignore[name-defined] # noqa: F821 +) -> Tuple["pyarrow.Table", "pyarrow.Table"]: # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyarrow import pyarrow, cast_arrow_schema_types + + if table_format == "delta": + from dlt.common.libs.deltalake import ( + get_delta_tables, + ensure_delta_compatible_arrow_data, + ) + + dt = get_delta_tables(pipeline, table_name)[table_name] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import ( + get_iceberg_tables, + ensure_iceberg_compatible_arrow_data, + ) + + it = get_iceberg_tables(pipeline, table_name)[table_name] + expected = ensure_iceberg_compatible_arrow_data(arrow_table) + actual = it.scan().to_arrow() + + # work around pyiceberg bug https://github.com/apache/iceberg-python/issues/1128 + schema = cast_arrow_schema_types( + actual.schema, + { + pyarrow.types.is_large_string: pyarrow.string(), + pyarrow.types.is_large_binary: pyarrow.binary(), + }, + ) + actual = actual.cast(schema) + return (expected, actual) + + @pytest.mark.skip( reason="pyarrow version check not needed anymore, since we have 17 as a dependency" ) @@ -258,44 +300,44 @@ def foo(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=(MEMORY_BUCKET, SFTP_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_core( +def test_table_format_core( destination_config: DestinationTestConfiguration, ) -> None: - """Tests core functionality for `delta` table format. + """Tests core functionality for `delta` and `iceberg` table formats. Tests all data types, all filesystems. 
Tests `append` and `replace` write dispositions (`merge` is tested elsewhere). """ - - from dlt.common.libs.deltalake import get_delta_tables + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables # create resource that yields rows with all data types column_schemas, row = table_update_and_row() - @dlt.resource(columns=column_schemas, table_format="delta") + @dlt.resource(columns=column_schemas, table_format=destination_config.table_format) def data_types(): nonlocal row yield [row] * 10 pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - # run pipeline, this should create Delta table + # run pipeline, this should create table info = pipeline.run(data_types()) assert_load_info(info) - # `delta` table format should use `parquet` file format + # table formats should use `parquet` file format completed_jobs = info.load_packages[0].jobs["completed_jobs"] data_types_jobs = [ job for job in completed_jobs if job.job_file_info.table_name == "data_types" ] assert all([job.file_path.endswith((".parquet", ".reference")) for job in data_types_jobs]) - # 10 rows should be loaded to the Delta table and the content of the first + # 10 rows should be loaded to the table and the content of the first # row should match expected values rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -322,7 +364,8 @@ def data_types(): # should do logical replace, increasing the table version info = pipeline.run(data_types(), write_disposition="replace") assert_load_info(info) - assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 + if destination_config.table_format == "delta": + assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -331,15 +374,16 @@ def data_types(): "destination_config", 
destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_does_not_contain_job_files( +def test_table_format_does_not_contain_job_files( destination_config: DestinationTestConfiguration, ) -> None: - """Asserts Parquet job files do not end up in Delta table.""" + """Asserts Parquet job files do not end up in table.""" pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -376,17 +420,18 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_multiple_files( +def test_table_format_multiple_files( destination_config: DestinationTestConfiguration, ) -> None: - """Tests loading multiple files into a Delta table. + """Tests loading multiple files into a table. - Files should be loaded into the Delta table in a single commit. + Files should be loaded into the table in a single commit. 
""" from dlt.common.libs.deltalake import get_delta_tables @@ -422,17 +467,17 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_child_tables( +def test_table_format_child_tables( destination_config: DestinationTestConfiguration, ) -> None: - """Tests child table handling for `delta` table format.""" + """Tests child table handling for `delta` and `iceberg` table formats.""" - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def nested_table(): yield [ { @@ -494,49 +539,63 @@ def nested_table(): assert len(rows_dict["nested_table__child"]) == 3 assert len(rows_dict["nested_table__child__grandchild"]) == 5 - # now drop children and grandchildren, use merge write disposition to create and pass full table chain - # also for tables that do not have jobs - info = pipeline.run( - [{"foo": 3}] * 10000, - table_name="nested_table", - primary_key="foo", - write_disposition="merge", - ) - assert_load_info(info) + if destination_config.supports_merge: + # now drop children and grandchildren, use merge write disposition to create and pass full table chain + # also for tables that do not have jobs + info = pipeline.run( + [{"foo": 3}] * 10000, + table_name="nested_table", + primary_key="foo", + write_disposition="merge", + ) + assert_load_info(info) @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_partitioning( +def test_table_format_partitioning( destination_config: DestinationTestConfiguration, ) -> None: - """Tests partitioning for `delta` table format.""" + """Tests partitioning for `delta` and `iceberg` table formats.""" - 
from dlt.common.libs.deltalake import get_delta_tables from tests.pipeline.utils import users_materialize_table_schema + def assert_partition_columns( + table_name: str, table_format: TTableFormat, expected_partition_columns: List[str] + ) -> None: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + actual_partition_columns = dt.metadata().partition_columns + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + actual_partition_columns = [f.name for f in it.metadata.specs_struct().fields] + assert actual_partition_columns == expected_partition_columns + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) # zero partition columns - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def zero_part(): yield {"foo": 1, "bar": 1} info = pipeline.run(zero_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) assert load_table_counts(pipeline, "zero_part")["zero_part"] == 1 # one partition column - @dlt.resource(table_format="delta", columns={"c1": {"partition": True}}) + @dlt.resource(table_format=destination_config.table_format, columns={"c1": {"partition": True}}) def one_part(): yield [ {"c1": "foo", "c2": 1}, @@ -547,13 +606,13 @@ def one_part(): info = pipeline.run(one_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "one_part")["one_part"] - assert dt.metadata().partition_columns == ["c1"] + assert_partition_columns("one_part", destination_config.table_format, ["c1"]) assert load_table_counts(pipeline, "one_part")["one_part"] == 4 # two partition columns @dlt.resource( - table_format="delta", columns={"c1": {"partition": True}, 
"c2": {"partition": True}} + table_format=destination_config.table_format, + columns={"c1": {"partition": True}, "c2": {"partition": True}}, ) def two_part(): yield [ @@ -565,29 +624,31 @@ def two_part(): info = pipeline.run(two_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "two_part")["two_part"] - assert dt.metadata().partition_columns == ["c1", "c2"] + assert_partition_columns("two_part", destination_config.table_format, ["c1", "c2"]) assert load_table_counts(pipeline, "two_part")["two_part"] == 4 # test partitioning with empty source users_materialize_table_schema.apply_hints( - table_format="delta", + table_format=destination_config.table_format, columns={"id": {"partition": True}}, ) info = pipeline.run(users_materialize_table_schema()) assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.metadata().partition_columns == ["id"] + assert_partition_columns("users", destination_config.table_format, ["id"]) assert load_table_counts(pipeline, "users")["users"] == 0 # changing partitioning after initial table creation is not supported zero_part.apply_hints(columns={"foo": {"partition": True}}) - with pytest.raises(PipelineStepFailed) as pip_ex: + if destination_config.table_format == "delta": + # Delta raises error when trying to change partitioning + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(zero_part()) + assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) + assert "partitioning" in pip_ex.value.__context__.retry_message + elif destination_config.table_format == "iceberg": + # while Iceberg supports partition evolution, we don't apply it pipeline.run(zero_part()) - assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) - assert "partitioning" in pip_ex.value.__context__.retry_message - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) 
@pytest.mark.parametrize( @@ -646,7 +707,7 @@ def test_delta_table_partitioning_arrow_load_id( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, @@ -659,20 +720,25 @@ def test_delta_table_partitioning_arrow_load_id( pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), ), ) -def test_delta_table_schema_evolution( +def test_table_format_schema_evolution( destination_config: DestinationTestConfiguration, write_disposition: TWriteDisposition, ) -> None: - """Tests schema evolution (adding new columns) for `delta` table format.""" - from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + """Tests schema evolution (adding new columns) for `delta` and `iceberg` table formats.""" + if destination_config.table_format == "iceberg" and write_disposition == { + "disposition": "merge", + "strategy": "upsert", + }: + pytest.skip("`upsert` currently not implemented for `iceberg`") + from dlt.common.libs.pyarrow import pyarrow @dlt.resource( write_disposition=write_disposition, primary_key="pk", - table_format="delta", + table_format=destination_config.table_format, ) - def delta_table(data): + def evolving_table(data): yield data pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -684,11 +750,11 @@ def delta_table(data): assert arrow_table.shape == (1, 1) # initial load - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - expected = ensure_delta_compatible_arrow_data(arrow_table) - actual = dt.to_pyarrow_table() + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) assert actual.equals(expected) # create Arrow table with many columns, two rows @@ 
-703,11 +769,11 @@ def delta_table(data): arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) # second load — this should evolve the schema (i.e. add the new columns) - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected = ensure_delta_compatible_arrow_data(arrow_table) + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) if write_disposition == "append": # just check shape and schema for `append`, because table comparison is # more involved than with the other dispositions @@ -724,13 +790,21 @@ def delta_table(data): empty_arrow_table = arrow_table.schema.empty_table() # load 3 — this should evolve the schema without changing data - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(evolving_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema - assert actual.schema.equals(expected_schema) - expected_num_rows = 3 if write_disposition == "append" else 2 + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) + assert actual.schema.equals(expected.schema) + if write_disposition == "append": + expected_num_rows = 3 + elif write_disposition == "replace": + expected_num_rows = 0 + if destination_config.table_format == "delta": + # TODO: fix https://github.com/dlt-hub/dlt/issues/2092 and remove this if-clause + expected_num_rows = 2 + elif write_disposition == {"disposition": "merge", "strategy": "upsert"}: + expected_num_rows = 2 assert actual.num_rows == expected_num_rows # new column should have NULLs only assert ( @@ -743,23 +817,38 @@ def delta_table(data): 
"destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_empty_source( +def test_table_format_empty_source( destination_config: DestinationTestConfiguration, ) -> None: - """Tests empty source handling for `delta` table format. + """Tests empty source handling for `delta` and `iceberg` table formats. Tests both empty Arrow table and `dlt.mark.materialize_table_schema()`. """ - from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_data, get_delta_tables from tests.pipeline.utils import users_materialize_table_schema - @dlt.resource(table_format="delta") - def delta_table(data): + def get_table_version( # type: ignore[return] + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + ) -> int: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + return dt.version() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + return it.last_sequence_number - 1 # subtract 1 to match `delta` + + @dlt.resource(table_format=destination_config.table_format) + def a_table(data): yield data # create empty Arrow table with schema @@ -779,61 +868,62 @@ def delta_table(data): # run 1: empty Arrow table with schema # this should create empty Delta table with same schema as Arrow table - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (0, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - 
ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 0 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (0, expected.num_columns) + assert actual.schema.equals(expected.schema) # run 2: non-empty Arrow table with same schema as run 1 # this should load records into Delta table - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(a_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 1 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (2, expected.num_columns) + assert actual.schema.equals(expected.schema) # now run the empty frame again - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - # use materialized list - # NOTE: this will create an empty parquet file with a schema takes from dlt schema. - # the original parquet file had a nested (struct) type in `json` field that is now - # in the delta table schema. the empty parquet file lost this information and had - # string type (converted from dlt `json`) - info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="delta_table") - assert_load_info(info) + if destination_config.table_format == "delta": + # use materialized list + # NOTE: this will create an empty parquet file with a schema takes from dlt schema. 
+ # the original parquet file had a nested (struct) type in `json` field that is now + # in the delta table schema. the empty parquet file lost this information and had + # string type (converted from dlt `json`) + info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="a_table") + assert_load_info(info) # test `dlt.mark.materialize_table_schema()` - users_materialize_table_schema.apply_hints(table_format="delta") + users_materialize_table_schema.apply_hints(table_format=destination_config.table_format) info = pipeline.run(users_materialize_table_schema(), loader_file_format="parquet") assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.num_rows == 0 - assert "id", "name" == dt_arrow_table.schema.names[:2] + assert get_table_version(pipeline, "users", destination_config.table_format) == 0 + _, actual = get_expected_actual( + pipeline, "users", destination_config.table_format, empty_arrow_table + ) + assert actual.num_rows == 0 + assert "id", "name" == actual.schema.names[:2] @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_mixed_source( +def test_table_format_mixed_source( destination_config: DestinationTestConfiguration, ) -> None: """Tests file format handling in mixed source. 
@@ -877,12 +967,13 @@ def s(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_dynamic_dispatch( +def test_table_format_dynamic_dispatch( destination_config: DestinationTestConfiguration, ) -> None: @dlt.resource(primary_key="id", table_name=lambda i: i["type"], table_format="delta") @@ -905,80 +996,96 @@ def github_events(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_get_delta_tables_helper( +def test_table_format_get_tables_helper( destination_config: DestinationTestConfiguration, ) -> None: - """Tests `get_delta_tables` helper function.""" - from dlt.common.libs.deltalake import DeltaTable, get_delta_tables + """Tests `get_delta_tables` / `get_iceberg_tables` helper functions.""" + get_tables: Any + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import DeltaTable, get_delta_tables - @dlt.resource(table_format="delta") - def foo_delta(): + get_tables = get_delta_tables + get_num_rows = lambda table: table.to_pyarrow_table().num_rows + elif destination_config.table_format == "iceberg": + from dlt.common.libs.pyiceberg import IcebergTable, get_iceberg_tables + + get_tables = get_iceberg_tables + get_num_rows = lambda table: table.scan().to_arrow().num_rows + + @dlt.resource(table_format=destination_config.table_format) + def foo_table_format(): yield [{"foo": 1}, {"foo": 2}] - @dlt.resource(table_format="delta") - def bar_delta(): + @dlt.resource(table_format=destination_config.table_format) + def bar_table_format(): yield [{"bar": 1}] @dlt.resource - def baz_not_delta(): + def baz_not_table_format(): yield [{"baz": 1}] 
pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - info = pipeline.run(foo_delta()) + info = pipeline.run(foo_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta"} - assert isinstance(delta_tables["foo_delta"], DeltaTable) - assert delta_tables["foo_delta"].to_pyarrow_table().num_rows == 2 - - info = pipeline.run([foo_delta(), bar_delta(), baz_not_delta()]) + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format"} + if destination_config.table_format == "delta": + assert isinstance(tables["foo_table_format"], DeltaTable) + elif destination_config.table_format == "iceberg": + assert isinstance(tables["foo_table_format"], IcebergTable) + assert get_num_rows(tables["foo_table_format"]) == 2 + + info = pipeline.run([foo_table_format(), bar_table_format(), baz_not_table_format()]) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta", "bar_delta"} - assert delta_tables["bar_delta"].to_pyarrow_table().num_rows == 1 - assert get_delta_tables(pipeline, "foo_delta").keys() == {"foo_delta"} - assert get_delta_tables(pipeline, "bar_delta").keys() == {"bar_delta"} - assert get_delta_tables(pipeline, "foo_delta", "bar_delta").keys() == {"foo_delta", "bar_delta"} + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format", "bar_table_format"} + assert get_num_rows(tables["bar_table_format"]) == 1 + assert get_tables(pipeline, "foo_table_format").keys() == {"foo_table_format"} + assert get_tables(pipeline, "bar_table_format").keys() == {"bar_table_format"} + assert get_tables(pipeline, "foo_table_format", "bar_table_format").keys() == { + "foo_table_format", + "bar_table_format", + } # test with child table - @dlt.resource(table_format="delta") - def parent_delta(): + @dlt.resource(table_format=destination_config.table_format) + def parent_table_format(): yield [{"foo": 1, "child": [1, 2, 
3]}] - info = pipeline.run(parent_delta()) + info = pipeline.run(parent_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert "parent_delta__child" in delta_tables.keys() - assert delta_tables["parent_delta__child"].to_pyarrow_table().num_rows == 3 + tables = get_tables(pipeline) + assert "parent_table_format__child" in tables.keys() + assert get_num_rows(tables["parent_table_format__child"]) == 3 # test invalid input with pytest.raises(ValueError): - get_delta_tables(pipeline, "baz_not_delta") + get_tables(pipeline, "baz_not_table_format") with pytest.raises(ValueError): - get_delta_tables(pipeline, "non_existing_table") + get_tables(pipeline, "non_existing_table") # test unknown schema with pytest.raises(FileNotFoundError): - get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + get_tables(pipeline, "non_existing_table", schema_name="aux_2") # load to a new schema and under new name aux_schema = dlt.Schema("aux_2") # NOTE: you cannot have a file with name - info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + info = pipeline.run(parent_table_format().with_name("aux_table"), schema=aux_schema) # also state in seprate package assert_load_info(info, expected_load_packages=2) - delta_tables = get_delta_tables(pipeline, schema_name="aux_2") - assert "aux_delta__child" in delta_tables.keys() - get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + tables = get_tables(pipeline, schema_name="aux_2") + assert "aux_table__child" in tables.keys() + get_tables(pipeline, "aux_table", schema_name="aux_2") with pytest.raises(ValueError): - get_delta_tables(pipeline, "aux_delta") + get_tables(pipeline, "aux_table") @pytest.mark.parametrize( diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index 00257471e0..2de923fe38 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ 
b/tests/load/sources/sql_database/test_sql_database_source.py @@ -1286,10 +1286,7 @@ def assert_no_precision_columns( ) -> None: actual = list(columns.values()) # we always infer and emit nullability - expected = cast( - List[TColumnSchema], - deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS), - ) + expected = deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS) if backend == "pyarrow": expected = cast( List[TColumnSchema], diff --git a/tests/load/utils.py b/tests/load/utils.py index 5c24b2d1dc..5660202ec3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -26,7 +26,10 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs import ( + CredentialsConfiguration, + GcpOAuthCredentialsWithoutDefaults, +) from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, JobClientBase, @@ -57,6 +60,7 @@ from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import ( ACTIVE_DESTINATIONS, + ACTIVE_TABLE_FORMATS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS, @@ -171,7 +175,9 @@ def destination_factory(self, **kwargs) -> Destination[Any, Any]: dest_type = kwargs.pop("destination", self.destination_type) dest_name = kwargs.pop("destination_name", self.destination_name) self.setup() - return Destination.from_reference(dest_type, destination_name=dest_name, **kwargs) + return Destination.from_reference( + dest_type, self.credentials, destination_name=dest_name, **kwargs + ) def raw_capabilities(self) -> DestinationCapabilitiesContext: dest = Destination.from_reference(self.destination_type) @@ -604,7 +610,7 @@ def destinations_configs( DestinationTestConfiguration( 
destination_type="filesystem", bucket_url=bucket, - extra_info=bucket + "-delta", + extra_info=bucket, table_format="delta", supports_merge=True, file_format="parquet", @@ -619,12 +625,33 @@ def destinations_configs( ), ) ] + if bucket == AZ_BUCKET: + # `pyiceberg` does not support `az` scheme + continue + destination_configs += [ + DestinationTestConfiguration( + destination_type="filesystem", + bucket_url=bucket, + extra_info=bucket, + table_format="iceberg", + supports_merge=False, + file_format="parquet", + destination_name="fsgcpoauth" if bucket == GCS_BUCKET else None, + ) + ] # filter out non active destinations destination_configs = [ conf for conf in destination_configs if conf.destination_type in ACTIVE_DESTINATIONS ] + # filter out non active table formats + destination_configs = [ + conf + for conf in destination_configs + if conf.table_format is None or conf.table_format in ACTIVE_TABLE_FORMATS + ] + # filter out destinations not in subset if subset: destination_configs = [ diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 0ae734f72e..e72a27c827 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -197,10 +197,23 @@ def _load_tables_to_dicts_fs( delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) + iceberg_table_names = [ + table_name + for table_name in table_names + if get_table_format(client.schema.tables, table_name) == "iceberg" + ] + if len(iceberg_table_names) > 0: + from dlt.common.libs.pyiceberg import get_iceberg_tables + + iceberg_tables = get_iceberg_tables(p, *table_names, schema_name=schema_name) + for table_name in table_names: if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() + elif table_name in client.schema.data_table_names() and table_name in iceberg_table_names: + it = iceberg_tables[table_name] + result[table_name] = it.scan().to_arrow().to_pylist() 
else: table_files = client.list_table_files(table_name) for file in table_files: diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 36fe009b93..e67ff9c70a 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -401,7 +401,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response, data): + def update_state(self, response, data): # type: ignore[override] self._next_reference = response.json().get("next_page") def update_request(self, request): diff --git a/tests/utils.py b/tests/utils.py index 1aafa4bfe4..82d742ac65 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,6 +32,7 @@ from dlt.common.runtime.run_context import DOT_DLT, RunContext from dlt.common.runtime.telemetry import start_telemetry, stop_telemetry from dlt.common.schema import Schema +from dlt.common.schema.typing import TTableFormat from dlt.common.storages import FileStorage from dlt.common.storages.versioned_storage import VersionedStorage from dlt.common.typing import DictStrAny, StrAny, TDataItem @@ -88,6 +89,12 @@ ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) +# filter out active table formats for current tests +IMPLEMENTED_TABLE_FORMATS = set(get_args(TTableFormat)) +ACTIVE_TABLE_FORMATS = set( + dlt.config.get("ACTIVE_TABLE_FORMATS", list) or IMPLEMENTED_TABLE_FORMATS +) + # sanity checks assert len(ACTIVE_DESTINATIONS) >= 0, "No active destinations selected" From 3a8dfa7a00298be9b45fa3cf01bd2881a846cdbf Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Thu, 12 Dec 2024 15:49:53 +0100 Subject: [PATCH 06/12] Fix validation error in for custom auth classes (#2129) --- 
dlt/common/typing.py | 15 +++++++++++++++ dlt/sources/rest_api/config_setup.py | 8 +++++++- tests/common/test_typing.py | 17 +++++++++++++++++ .../configurations/test_custom_auth_config.py | 17 ++++++++++++++++- .../test_custom_paginator_config.py | 12 +++++++++++- 5 files changed, 66 insertions(+), 3 deletions(-) diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 8986d753f3..a0322fe01e 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -484,3 +484,18 @@ def decorator( return func return decorator + + +def add_value_to_literal(literal: Any, value: Any) -> None: + """Extends a Literal at runtime with a new value. + + Args: + literal (Type[Any]): Literal to extend + value (Any): Value to add + + """ + type_args = get_args(literal) + + if value not in type_args: + type_args += (value,) + literal.__args__ = type_args diff --git a/dlt/sources/rest_api/config_setup.py b/dlt/sources/rest_api/config_setup.py index d03a4fd59b..bf62c6c4f7 100644 --- a/dlt/sources/rest_api/config_setup.py +++ b/dlt/sources/rest_api/config_setup.py @@ -20,6 +20,7 @@ from dlt.common.configuration import resolve_configuration from dlt.common.schema.utils import merge_columns from dlt.common.utils import update_dict_nested, exclude_keys +from dlt.common.typing import add_value_to_literal from dlt.common import jsonpath from dlt.extract.incremental import Incremental @@ -64,6 +65,8 @@ ResponseActionDict, Endpoint, EndpointResource, + AuthType, + PaginatorType, ) @@ -103,6 +106,7 @@ def register_paginator( "Your custom paginator has to be a subclass of BasePaginator" ) PAGINATOR_MAP[paginator_name] = paginator_class + add_value_to_literal(PaginatorType, paginator_name) def get_paginator_class(paginator_name: str) -> Type[BasePaginator]: @@ -153,6 +157,8 @@ def register_auth( ) AUTH_MAP[auth_name] = auth_class + add_value_to_literal(AuthType, auth_name) + def get_auth_class(auth_type: str) -> Type[AuthConfigBase]: try: @@ -285,7 +291,7 @@ def build_resource_dependency_graph( 
resolved_param_map[resource_name] = None break assert isinstance(endpoint_resource["endpoint"], dict) - # connect transformers to resources via resolved params + # find resolved parameters to connect dependent resources resolved_params = _find_resolved_params(endpoint_resource["endpoint"]) # set of resources in resolved params diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index 2749e3ebb1..e81c3e7fa2 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -43,6 +43,7 @@ is_union_type, is_annotated, is_callable_type, + add_value_to_literal, ) @@ -293,3 +294,19 @@ def test_secret_type() -> None: assert TSecretStrValue("x_str") == "x_str" assert TSecretStrValue({}) == "{}" + + +def test_add_value_to_literal() -> None: + TestLiteral = Literal["red", "blue"] + + add_value_to_literal(TestLiteral, "green") + + assert get_args(TestLiteral) == ("red", "blue", "green") + + add_value_to_literal(TestLiteral, "red") + assert get_args(TestLiteral) == ("red", "blue", "green") + + TestSingleLiteral = Literal["red"] + add_value_to_literal(TestSingleLiteral, "green") + add_value_to_literal(TestSingleLiteral, "blue") + assert get_args(TestSingleLiteral) == ("red", "green", "blue") diff --git a/tests/sources/rest_api/configurations/test_custom_auth_config.py b/tests/sources/rest_api/configurations/test_custom_auth_config.py index 1a5a2e58a3..52cdb95735 100644 --- a/tests/sources/rest_api/configurations/test_custom_auth_config.py +++ b/tests/sources/rest_api/configurations/test_custom_auth_config.py @@ -5,7 +5,7 @@ from dlt.sources import rest_api from dlt.sources.helpers.rest_client.auth import APIKeyAuth, OAuth2ClientCredentials -from dlt.sources.rest_api.typing import ApiKeyAuthConfig, AuthConfig +from dlt.sources.rest_api.typing import ApiKeyAuthConfig, AuthConfig, RESTAPIConfig class CustomOAuth2(OAuth2ClientCredentials): @@ -77,3 +77,18 @@ class NotAuthConfigBase: "not_an_auth_config_base", NotAuthConfigBase # type: ignore ) assert 
e.match("Invalid auth: NotAuthConfigBase.") + + def test_valid_config_raises_no_error(self, custom_auth_config: AuthConfig) -> None: + rest_api.config_setup.register_auth("custom_oauth_2", CustomOAuth2) + + valid_config: RESTAPIConfig = { + "client": { + "base_url": "https://example.com", + "auth": custom_auth_config, + }, + "resources": ["test"], + } + + rest_api.rest_api_source(valid_config) + + del rest_api.config_setup.AUTH_MAP["custom_oauth_2"] diff --git a/tests/sources/rest_api/configurations/test_custom_paginator_config.py b/tests/sources/rest_api/configurations/test_custom_paginator_config.py index f8ac060218..975ab10176 100644 --- a/tests/sources/rest_api/configurations/test_custom_paginator_config.py +++ b/tests/sources/rest_api/configurations/test_custom_paginator_config.py @@ -4,7 +4,7 @@ from dlt.sources import rest_api from dlt.sources.helpers.rest_client.paginators import JSONLinkPaginator -from dlt.sources.rest_api.typing import PaginatorConfig +from dlt.sources.rest_api.typing import PaginatorConfig, RESTAPIConfig class CustomPaginator(JSONLinkPaginator): @@ -67,3 +67,13 @@ class NotAPaginator: with pytest.raises(ValueError) as e: rest_api.config_setup.register_paginator("not_a_paginator", NotAPaginator) # type: ignore[arg-type] assert e.match("Invalid paginator: NotAPaginator.") + + def test_test_valid_config_raises_no_error(self, custom_paginator_config) -> None: + rest_api.config_setup.register_paginator("custom_paginator", CustomPaginator) + + valid_config: RESTAPIConfig = { + "client": {"base_url": "https://example.com", "paginator": custom_paginator_config}, + "resources": ["test"], + } + + rest_api.rest_api_source(valid_config) From 80ca47417fe694c5537cedb6eb594bcd3ce2491d Mon Sep 17 00:00:00 2001 From: HulmaNaseer <42720638+HulmaNaseer@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:47:40 +0100 Subject: [PATCH 07/12] explicitly adding docs for destination item size control (#2118) * explicitly adding docs for destination item size 
control * alena's feedback * revised for explicit note * Update docs/website/docs/reference/performance.md --------- Co-authored-by: hulmanaseer00 <163604758+hulmanaseer00@users.noreply.github.com> Co-authored-by: Alena Astrakhantseva --- docs/website/docs/reference/performance.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index ab171ac069..1e58080200 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -48,9 +48,7 @@ Some file formats (e.g., Parquet) do not support schema changes when writing a s Below, we set files to rotate after 100,000 items written or when the filesize exceeds 1MiB. - - - + ### Disabling and enabling file compression Several [text file formats](../dlt-ecosystem/file-formats/) have `gzip` compression enabled by default. If you wish that your load packages have uncompressed files (e.g., to debug the content easily), change `data_writer.disable_compression` in config.toml. The entry below will disable the compression of the files processed in the `normalize` stage. @@ -148,7 +146,10 @@ As before, **if you have just a single table with millions of records, you shoul -Since the normalize stage uses a process pool to create load packages concurrently, adjusting the `file_max_items` and `file_max_bytes` settings can significantly impact load behavior. By setting a lower value for `file_max_items`, you reduce the size of each data chunk sent to the destination database, which can be particularly useful for managing memory constraints on the database server. Without explicit configuration of `file_max_items`, `dlt` writes all data rows into one large intermediary file, attempting to insert all data from this single file. Configuring `file_max_items` ensures data is inserted in manageable chunks, enhancing performance and preventing potential memory issues. 
+The **normalize** stage in `dlt` uses a process pool to create load packages concurrently, and the settings for `file_max_items` and `file_max_bytes` play a crucial role in determining the size of data chunks. Lower values for these settings reduce the size of each chunk sent to the destination database, which is particularly helpful for managing memory constraints on the database server. By default, `dlt` writes all data rows into one large intermediary file, attempting to load all data at once. Configuring these settings enables file rotation, splitting the data into smaller, more manageable chunks. This not only improves performance but also minimizes memory-related issues when working with large tables containing millions of records. + +#### Controlling destination items size +The intermediary files generated during the **normalize** stage are also used in the **load** stage. Therefore, adjusting `file_max_items` and `file_max_bytes` in the **normalize** stage directly impacts the size and number of data chunks sent to the destination, influencing loading behavior and performance. ### Parallel pipeline config example The example below simulates the loading of a large database table with 1,000,000 records. 
The **config.toml** below sets the parallelization as follows: From beb8465c5be5eb55dfd532c405a2e22bf2027bb6 Mon Sep 17 00:00:00 2001 From: Anton Burnashev Date: Fri, 13 Dec 2024 14:12:57 +0100 Subject: [PATCH 08/12] Update primary key for pokemon resource from id to name in REST API tutorial (#2147) --- docs/website/docs/tutorial/rest-api.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/tutorial/rest-api.md b/docs/website/docs/tutorial/rest-api.md index 56051e80de..70c7f7e964 100644 --- a/docs/website/docs/tutorial/rest-api.md +++ b/docs/website/docs/tutorial/rest-api.md @@ -246,7 +246,7 @@ pokemon_source = rest_api_source( # the primary key and write disposition { "name": "pokemon", - "primary_key": "id", + "primary_key": "name", "write_disposition": "merge", }, # The `berry` and `location` resources will use the default @@ -257,7 +257,7 @@ pokemon_source = rest_api_source( ) ``` -Run the pipeline with `python rest_api_pipeline.py`, the data for the `pokemon` resource will be merged with the existing data in the destination table based on the `id` field. +Run the pipeline with `python rest_api_pipeline.py`, the data for the `pokemon` resource will be merged with the existing data in the destination table based on the `name` field. 
## Loading data incrementally From 39c0a01f58058f0f6df0ed3d4e16079ec84196c9 Mon Sep 17 00:00:00 2001 From: Julian Alves <28436330+donotpush@users.noreply.github.com> Date: Fri, 13 Dec 2024 15:26:05 +0100 Subject: [PATCH 09/12] add databricks oauth authentication (#2138) * add databricks oauth authentication * improve auth databricks test * force token-based auth for azure external location tests --- .../impl/databricks/configuration.py | 12 ++++ .../impl/databricks/sql_client.py | 20 ++++++- .../dlt-ecosystem/destinations/databricks.md | 25 ++++++++- poetry.lock | 26 ++++++++- pyproject.toml | 3 +- .../test_databricks_configuration.py | 10 ++++ .../load/pipeline/test_databricks_pipeline.py | 56 +++++++++++++++++++ 7 files changed, 145 insertions(+), 7 deletions(-) diff --git a/dlt/destinations/impl/databricks/configuration.py b/dlt/destinations/impl/databricks/configuration.py index c95b6eba4c..21338bd310 100644 --- a/dlt/destinations/impl/databricks/configuration.py +++ b/dlt/destinations/impl/databricks/configuration.py @@ -4,6 +4,7 @@ from dlt.common.typing import TSecretStrValue from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration +from dlt.common.configuration.exceptions import ConfigurationValueError DATABRICKS_APPLICATION_ID = "dltHub_dlt" @@ -15,6 +16,8 @@ class DatabricksCredentials(CredentialsConfiguration): server_hostname: str = None http_path: str = None access_token: Optional[TSecretStrValue] = None + client_id: Optional[TSecretStrValue] = None + client_secret: Optional[TSecretStrValue] = None http_headers: Optional[Dict[str, str]] = None session_configuration: Optional[Dict[str, Any]] = None """Dict of session parameters that will be passed to `databricks.sql.connect`""" @@ -27,9 +30,18 @@ class DatabricksCredentials(CredentialsConfiguration): "server_hostname", "http_path", "catalog", + "client_id", + 
"client_secret", "access_token", ] + def on_resolved(self) -> None: + if not ((self.client_id and self.client_secret) or self.access_token): + raise ConfigurationValueError( + "No valid authentication method detected. Provide either 'client_id' and" + " 'client_secret' for OAuth, or 'access_token' for token-based authentication." + ) + def to_connector_params(self) -> Dict[str, Any]: conn_params = dict( catalog=self.catalog, diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 8bff4e0d73..16e1e73d93 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -11,10 +11,12 @@ Tuple, Union, Dict, + cast, + Callable, ) - -from databricks import sql as databricks_lib +from databricks.sdk.core import Config, oauth_service_principal +from databricks import sql as databricks_lib # type: ignore[attr-defined] from databricks.sql.client import ( Connection as DatabricksSqlConnection, Cursor as DatabricksSqlCursor, @@ -73,8 +75,22 @@ def __init__( self._conn: DatabricksSqlConnection = None self.credentials = credentials + def _get_oauth_credentials(self) -> Optional[Callable[[], Dict[str, str]]]: + config = Config( + host=f"https://{self.credentials.server_hostname}", + client_id=self.credentials.client_id, + client_secret=self.credentials.client_secret, + ) + return cast(Callable[[], Dict[str, str]], oauth_service_principal(config)) + def open_connection(self) -> DatabricksSqlConnection: conn_params = self.credentials.to_connector_params() + + if self.credentials.client_id and self.credentials.client_secret: + conn_params["credentials_provider"] = self._get_oauth_credentials + else: + conn_params["access_token"] = self.credentials.access_token + self._conn = databricks_lib.connect( **conn_params, schema=self.dataset_name, use_inline_params="silent" ) diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md 
b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index 513a3b792f..dd046ce28a 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -90,6 +90,29 @@ If you already have your Databricks workspace set up, you can skip to the [Loade Click your email in the top right corner and go to "User Settings". Go to "Developer" -> "Access Tokens". Generate a new token and save it. You will use it in your `dlt` configuration. +## OAuth M2M (Machine-to-Machine) Authentication + +You can authenticate to Databricks using a service principal via OAuth M2M. This method allows for secure, programmatic access to Databricks resources without requiring a user-managed personal access token. + +### Create a Service Principal in Databricks +Follow the instructions in the Databricks documentation to create a service principal and retrieve the client_id and client_secret: + +[Authenticate access to Databricks using OAuth M2M](https://docs.databricks.com/en/dev-tools/auth/oauth-m2m.html) + +Once you have the service principal credentials, update your secrets.toml as shown below. + +### Configuration + +Add the following fields to your `.dlt/secrets.toml` file: +```toml +[destination.databricks.credentials] +server_hostname = "MY_DATABRICKS.azuredatabricks.net" +http_path = "/sql/1.0/warehouses/12345" +catalog = "my_catalog" +client_id = "XXX" +client_secret = "XXX" +``` + ## Loader setup guide **1. 
Initialize a project with a pipeline that loads to Databricks by running** @@ -118,7 +141,7 @@ Example: [destination.databricks.credentials] server_hostname = "MY_DATABRICKS.azuredatabricks.net" http_path = "/sql/1.0/warehouses/12345" -access_token = "MY_ACCESS_TOKEN" +access_token = "MY_ACCESS_TOKEN" # Replace for client_id and client_secret when using OAuth catalog = "my_catalog" ``` diff --git a/poetry.lock b/poetry.lock index 83090360b0..82d9bf90f8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "about-time" @@ -2208,6 +2208,26 @@ nr-date = ">=2.0.0,<3.0.0" typeapi = ">=2.0.1,<3.0.0" typing-extensions = ">=3.10.0" +[[package]] +name = "databricks-sdk" +version = "0.39.0" +description = "Databricks SDK for Python (Beta)" +optional = true +python-versions = ">=3.7" +files = [ + {file = "databricks_sdk-0.39.0-py3-none-any.whl", hash = "sha256:915fbf12b249264f74ddae2ca739530e3c4a9c5a454617ac403115d6466c2f99"}, + {file = "databricks_sdk-0.39.0.tar.gz", hash = "sha256:2e04edbb9e050f4362da804fb5dad07637c5adecfcffb4d0ca8abb5aefa36d06"}, +] + +[package.dependencies] +google-auth = ">=2.0,<3.0" +requests = ">=2.28.1,<3" + +[package.extras] +dev = ["autoflake", "databricks-connect", "httpx", "ipython", "ipywidgets", "isort", "langchain-openai", "openai", "pycodestyle", "pyfakefs", "pytest", "pytest-cov", "pytest-mock", "pytest-rerunfailures", "pytest-xdist", "requests-mock", "wheel", "yapf"] +notebook = ["ipython (>=8,<9)", "ipywidgets (>=8,<9)"] +openai = ["httpx", "langchain-openai", "openai"] + [[package]] name = "databricks-sql-connector" version = "2.9.6" @@ -10680,7 +10700,7 @@ az = ["adlfs"] bigquery = ["db-dtypes", "gcsfs", "google-cloud-bigquery", "grpcio", "pyarrow"] cli = ["cron-descriptor", "pipdeptree"] clickhouse = ["adlfs", "clickhouse-connect", 
"clickhouse-driver", "gcsfs", "pyarrow", "s3fs"] -databricks = ["databricks-sql-connector"] +databricks = ["databricks-sdk", "databricks-sql-connector"] deltalake = ["deltalake", "pyarrow"] dremio = ["pyarrow"] duckdb = ["duckdb"] @@ -10707,4 +10727,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "84e8b8eccd9b8ee104a2dc08f5b83987aeb06540d61330390ce849cc1ad6acb4" +content-hash = "5513aca05ae04d7941f2a890d0fefa86a08371508a2d319c1e558c29ff8a45f3" diff --git a/pyproject.toml b/pyproject.toml index bfa830cd06..d12073601d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,6 +95,7 @@ db-dtypes = { version = ">=1.2.0", optional = true } # pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } # we will rely on manual installation of `sqlalchemy>=2.0.18` instead pyiceberg = { version = ">=0.8.1", python = ">=3.9", optional = true } +databricks-sdk = {version = ">=0.38.0", optional = true} [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] @@ -117,7 +118,7 @@ weaviate = ["weaviate-client"] mssql = ["pyodbc"] synapse = ["pyodbc", "adlfs", "pyarrow"] qdrant = ["qdrant-client"] -databricks = ["databricks-sql-connector"] +databricks = ["databricks-sql-connector", "databricks-sdk"] clickhouse = ["clickhouse-driver", "clickhouse-connect", "s3fs", "gcsfs", "adlfs", "pyarrow"] dremio = ["pyarrow"] lancedb = ["lancedb", "pyarrow", "tantivy"] diff --git a/tests/load/databricks/test_databricks_configuration.py b/tests/load/databricks/test_databricks_configuration.py index e27da4db2a..8b3beed2b3 100644 --- a/tests/load/databricks/test_databricks_configuration.py +++ b/tests/load/databricks/test_databricks_configuration.py @@ -4,6 +4,7 @@ pytest.importorskip("databricks") from dlt.common.exceptions import TerminalValueError +from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations.impl.databricks.databricks import 
DatabricksLoadJob from dlt.common.configuration import resolve_configuration @@ -86,3 +87,12 @@ def test_databricks_abfss_converter() -> None: abfss_url == "abfss://dlt-ci-test-bucket@my_account.dfs.core.windows.net/path/to/file.parquet" ) + + +def test_databricks_auth_invalid() -> None: + with pytest.raises(ConfigurationValueError, match="No valid authentication method detected.*"): + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = "" + bricks = databricks() + bricks.configuration(None, accept_partial=True) diff --git a/tests/load/pipeline/test_databricks_pipeline.py b/tests/load/pipeline/test_databricks_pipeline.py index e802cde693..078dce3a7f 100644 --- a/tests/load/pipeline/test_databricks_pipeline.py +++ b/tests/load/pipeline/test_databricks_pipeline.py @@ -2,6 +2,7 @@ import os from dlt.common.utils import uniq_id +from dlt.destinations import databricks from tests.load.utils import ( GCS_BUCKET, DestinationTestConfiguration, @@ -23,6 +24,10 @@ ids=lambda x: x.name, ) def test_databricks_external_location(destination_config: DestinationTestConfiguration) -> None: + # force token-based authentication + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + # do not interfere with state os.environ["RESTORE_FROM_DESTINATION"] = "False" # let the package complete even with failed jobs @@ -145,3 +150,54 @@ def test_databricks_gcs_external_location(destination_config: DestinationTestCon assert ( "credential_x" in pipeline.list_failed_jobs_in_package(info.loads_ids[0])[0].failed_message ) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=("databricks",)), + ids=lambda x: x.name, +) +def test_databricks_auth_oauth(destination_config: DestinationTestConfiguration) 
-> None: + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__ACCESS_TOKEN"] = "" + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.credentials.client_id and config.credentials.client_secret + assert not config.credentials.access_token + + dataset_name = "test_databricks_oauth" + uniq_id() + pipeline = destination_config.setup_pipeline( + "test_databricks_oauth", dataset_name=dataset_name, destination=bricks + ) + + info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) + assert info.has_failed_jobs is False + + with pipeline.sql_client() as client: + rows = client.execute_sql(f"select * from {dataset_name}.digits") + assert len(rows) == 3 + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=("databricks",)), + ids=lambda x: x.name, +) +def test_databricks_auth_token(destination_config: DestinationTestConfiguration) -> None: + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_ID"] = "" + os.environ["DESTINATION__DATABRICKS__CREDENTIALS__CLIENT_SECRET"] = "" + bricks = databricks() + config = bricks.configuration(None, accept_partial=True) + assert config.credentials.access_token + assert not (config.credentials.client_secret and config.credentials.client_id) + + dataset_name = "test_databricks_token" + uniq_id() + pipeline = destination_config.setup_pipeline( + "test_databricks_token", dataset_name=dataset_name, destination=bricks + ) + + info = pipeline.run([1, 2, 3], table_name="digits", **destination_config.run_kwargs) + assert info.has_failed_jobs is False + + with pipeline.sql_client() as client: + rows = client.execute_sql(f"select * from {dataset_name}.digits") + assert len(rows) == 3 From fd5ba0bcefc098d6b245bbf68e1bbb4d0ee9fb77 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Sun, 15 Dec 2024 15:17:28 +0400 Subject: [PATCH 10/12] make duckdb handle Iceberg 
table with nested types (#2141) * make duckdb handle iceberg table with nested types * replace duckdb views for iceberg tables * remove unnecessary context closing and opening * replace duckdb views for abfss protocol * restore original destination for write path * use dev_mode to work around leftover data from previous tests leftover data caused by https://github.com/dlt-hub/dlt/issues/2148 --- .../impl/filesystem/sql_client.py | 25 +++++++---- tests/load/filesystem/test_sql_client.py | 42 +++++++++++++++---- 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index d39f4c3431..e6b84343bb 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -214,14 +214,17 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: # unknown views will not be created continue - # only create view if it does not exist in the current schema yet - existing_tables = [tname[0] for tname in self._conn.execute("SHOW TABLES").fetchall()] - if view_name in existing_tables: - continue - # NOTE: if this is staging configuration then `prepare_load_table` will remove some info # from table schema, if we ever extend this to handle staging destination, this needs to change schema_table = self.fs_client.prepare_load_table(table_name) + table_format = schema_table.get("table_format") + + # skip if view already exists and does not need to be replaced each time + existing_tables = [tname[0] for tname in self._conn.execute("SHOW TABLES").fetchall()] + needs_replace = table_format == "iceberg" or self.fs_client.config.protocol == "abfss" + if view_name in existing_tables and not needs_replace: + continue + # discover file type folder = self.fs_client.get_table_dir(table_name) files = self.fs_client.list_table_files(table_name) @@ -258,15 +261,17 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: # create 
from statement from_statement = "" - if schema_table.get("table_format") == "delta": + if table_format == "delta": from_statement = f"delta_scan('{resolved_folder}')" - elif schema_table.get("table_format") == "iceberg": + elif table_format == "iceberg": from dlt.common.libs.pyiceberg import _get_last_metadata_file self._setup_iceberg(self._conn) metadata_path = f"{resolved_folder}/metadata" last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client) - from_statement = f"iceberg_scan('{last_metadata_file}')" + # skip schema inference to make nested data types work + # https://github.com/duckdb/duckdb_iceberg/issues/47 + from_statement = f"iceberg_scan('{last_metadata_file}', skip_schema_inference=True)" elif first_file_type == "parquet": from_statement = f"read_parquet([{resolved_files_string}])" elif first_file_type == "jsonl": @@ -281,7 +286,9 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: # create table view_name = self.make_qualified_table_name(view_name) - create_table_sql_base = f"CREATE VIEW {view_name} AS SELECT * FROM {from_statement}" + create_table_sql_base = ( + f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM {from_statement}" + ) self._conn.execute(create_table_sql_base) @contextmanager diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index a73b0f7e31..4f537d129c 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -22,6 +22,7 @@ ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT +from tests.cases import arrow_table_all_data_types from dlt.destinations.exceptions import DatabaseUndefinedRelation @@ -81,12 +82,17 @@ def double_items(): for i in range(total_records) ] - return [items, double_items] + @dlt.resource(table_format=table_format) + def arrow_all_types(): + yield arrow_table_all_data_types("arrow-table", num_rows=total_records)[0] + + return [items, double_items, 
arrow_all_types] # run source pipeline.run(source(), loader_file_format=destination_config.file_format) if alternate_access_pipeline: + orig_dest = pipeline.destination pipeline.destination = alternate_access_pipeline.destination import duckdb @@ -96,8 +102,11 @@ def double_items(): DuckDbCredentials, ) - # check we can create new tables from the views with pipeline.sql_client() as c: + # check if all data types are handled properly + c.execute_sql("SELECT * FROM arrow_all_types;") + + # check we can create new tables from the views c.execute_sql( "CREATE TABLE items_joined AS (SELECT i.id, di.double_id FROM items as i JOIN" " double_items as di ON (i.id = di.id));" @@ -109,16 +118,14 @@ def double_items(): assert list(joined_table[5]) == [5, 10] assert list(joined_table[10]) == [10, 20] - # inserting values into a view should fail gracefully - with pipeline.sql_client() as c: + # inserting values into a view should fail gracefully try: c.execute_sql("INSERT INTO double_items VALUES (1, 2)") except Exception as exc: assert "double_items is not an table" in str(exc) - # check that no automated views are created for a schema different than - # the known one - with pipeline.sql_client() as c: + # check that no automated views are created for a schema different than + # the known one c.execute_sql("CREATE SCHEMA other_schema;") with pytest.raises(DatabaseUndefinedRelation): with c.execute_query("SELECT * FROM other_schema.items ORDER BY id ASC;") as cursor: @@ -172,6 +179,24 @@ def _fs_sql_client_for_external_db( # views exist assert len(external_db.sql("SELECT * FROM second.referenced_items").fetchall()) == total_records assert len(external_db.sql("SELECT * FROM first.items").fetchall()) == 3 + + # test if view reflects source table accurately after it has changed + # conretely, this tests if an existing view is replaced with formats that need it, such as + # `iceberg` table format + with fs_sql_client as sql_client: + 
sql_client.create_views_for_tables({"arrow_all_types": "arrow_all_types"}) + assert external_db.sql("FROM second.arrow_all_types;").arrow().num_rows == total_records + if alternate_access_pipeline: + # switch back for the write path + pipeline.destination = orig_dest + pipeline.run( # run pipeline again to add rows to source table + source().with_resources("arrow_all_types"), + loader_file_format=destination_config.file_format, + ) + with fs_sql_client as sql_client: + sql_client.create_views_for_tables({"arrow_all_types": "arrow_all_types"}) + assert external_db.sql("FROM second.arrow_all_types;").arrow().num_rows == (2 * total_records) + external_db.close() # in case we are not connecting to a bucket that needs secrets, views should still be here after connection reopen @@ -298,6 +323,7 @@ def test_table_formats( pipeline = destination_config.setup_pipeline( "read_pipeline", dataset_name="read_test", + dev_mode=True, ) # in case of gcs we use the s3 compat layer for reading @@ -310,7 +336,7 @@ def test_table_formats( GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp" ) access_pipeline = destination_config.setup_pipeline( - "read_pipeline", dataset_name="read_test", destination=gcp_bucket + "read_pipeline", dataset_name="read_test", dev_mode=True, destination=gcp_bucket ) _run_dataset_checks( From 95d606396157a968e35bf1170b1e43989dfc97f5 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Sun, 15 Dec 2024 16:49:14 +0100 Subject: [PATCH 11/12] Fix/refresh standalone resources (#2140) * drops tables from schema and relational * documents custom sections for sql_database and source rename * clones schema without data tables when resources without source are extracted, adds tests * skips airflow tests if not installed * adds doc on setting up FUSE on bucket * adds doc on setting up FUSE on bucket * adds row key propagation for table when its nested table requires it * fixes tests --- Makefile | 3 +- dlt/common/normalizers/json/__init__.py | 4 + 
dlt/common/normalizers/json/relational.py | 54 ++++++++++--- dlt/common/schema/schema.py | 2 + dlt/extract/extract.py | 7 +- docs/tools/check_embedded_snippets.py | 9 ++- .../verified-sources/sql_database/advanced.md | 21 +++++ docs/website/docs/general-usage/source.md | 17 +++- docs/website/docs/reference/performance.md | 26 ++++++ .../normalizers/test_json_relational.py | 33 ++++++++ .../airflow_tests/test_airflow_provider.py | 4 + .../airflow_tests/test_airflow_wrapper.py | 2 + .../test_join_airflow_scheduler.py | 3 + tests/helpers/airflow_tests/utils.py | 8 +- tests/load/pipeline/test_drop.py | 55 ++++++++++--- tests/load/pipeline/test_refresh_modes.py | 79 ++++++++++++++----- tests/pipeline/test_pipeline.py | 24 ++++++ 17 files changed, 304 insertions(+), 47 deletions(-) diff --git a/Makefile b/Makefile index 0ca8a2e0c3..975a8a42da 100644 --- a/Makefile +++ b/Makefile @@ -63,7 +63,6 @@ format: lint-snippets: cd docs/tools && poetry run python check_embedded_snippets.py full - lint-and-test-snippets: lint-snippets poetry run mypy --config-file mypy.ini docs/website docs/tools --exclude docs/tools/lint_setup --exclude docs/website/docs_processed poetry run flake8 --max-line-length=200 docs/website docs/tools --exclude docs/website/.dlt-repo @@ -82,7 +81,7 @@ lint-security: poetry run bandit -r dlt/ -n 3 -l test: - (set -a && . 
tests/.env && poetry run pytest tests) + poetry run pytest tests test-load-local: DESTINATION__POSTGRES__CREDENTIALS=postgresql://loader:loader@localhost:5432/dlt_data DESTINATION__DUCKDB__CREDENTIALS=duckdb:///_storage/test_quack.duckdb poetry run pytest tests -k '(postgres or duckdb)' diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index 725f6a8355..ae5e06fe2e 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -36,6 +36,10 @@ def extend_schema(self) -> None: def extend_table(self, table_name: str) -> None: pass + @abc.abstractmethod + def remove_table(self, table_name: str) -> None: + pass + @classmethod @abc.abstractmethod def update_normalizer_config(cls, schema: Schema, config: TNormalizerConfig) -> None: diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index e365017125..36845b2e14 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -1,4 +1,16 @@ -from typing import Dict, List, Mapping, Optional, Sequence, Tuple, cast, TypedDict, Any +from typing import ( + ClassVar, + Dict, + List, + Mapping, + Optional, + Sequence, + Tuple, + Type, + cast, + TypedDict, + Any, +) from dlt.common.normalizers.exceptions import InvalidJsonNormalizer from dlt.common.normalizers.typing import TJSONNormalizer @@ -14,6 +26,9 @@ from dlt.common.schema.utils import ( column_name_validator, is_nested_table, + get_nested_tables, + has_column_with_prop, + get_first_column_name_with_prop, ) from dlt.common.utils import update_dict_nested from dlt.common.normalizers.json import ( @@ -48,6 +63,7 @@ class DataItemNormalizer(DataItemNormalizerBase[RelationalNormalizerConfig]): # other constants EMPTY_KEY_IDENTIFIER = "_empty" # replace empty keys with this + RELATIONAL_CONFIG_TYPE: ClassVar[Type[RelationalNormalizerConfig]] = RelationalNormalizerConfig normalizer_config: 
RelationalNormalizerConfig propagation_config: RelationalNormalizerConfigPropagation @@ -310,20 +326,38 @@ def extend_table(self, table_name: str) -> None: Table name should be normalized. """ table = self.schema.tables.get(table_name) - if not is_nested_table(table) and table.get("write_disposition") == "merge": - DataItemNormalizer.update_normalizer_config( + # add root key prop when merge disposition is used or any of nested tables needs row_key + if not is_nested_table(table) and ( + table.get("write_disposition") == "merge" + or any( + has_column_with_prop(t, "root_key", include_incomplete=True) + for t in get_nested_tables(self.schema.tables, table_name) + ) + ): + # get row id column from table, assume that we propagate it into c_dlt_root_id always + c_dlt_id = get_first_column_name_with_prop(table, "row_key", include_incomplete=True) + self.update_normalizer_config( self.schema, { "propagation": { "tables": { table_name: { - TColumnName(self.c_dlt_id): TColumnName(self.c_dlt_root_id) + TColumnName(c_dlt_id or self.c_dlt_id): TColumnName( + self.c_dlt_root_id + ) } } } }, ) + def remove_table(self, table_name: str) -> None: + """Called by the Schema when table is removed from it.""" + config = self.get_normalizer_config(self.schema) + if propagation := config.get("propagation"): + if tables := propagation.get("tables"): + tables.pop(table_name, None) + def normalize_data_item( self, item: TDataItem, load_id: str, table_name: str ) -> TNormalizedRowIterator: @@ -352,8 +386,8 @@ def normalize_data_item( def ensure_this_normalizer(cls, norm_config: TJSONNormalizer) -> None: # make sure schema has right normalizer present_normalizer = norm_config["module"] - if present_normalizer != __name__: - raise InvalidJsonNormalizer(__name__, present_normalizer) + if present_normalizer != cls.__module__: + raise InvalidJsonNormalizer(cls.__module__, present_normalizer) @classmethod def update_normalizer_config(cls, schema: Schema, config: RelationalNormalizerConfig) -> 
None: @@ -371,8 +405,10 @@ def get_normalizer_config(cls, schema: Schema) -> RelationalNormalizerConfig: cls.ensure_this_normalizer(norm_config) return cast(RelationalNormalizerConfig, norm_config.get("config", {})) - @staticmethod - def _validate_normalizer_config(schema: Schema, config: RelationalNormalizerConfig) -> None: + @classmethod + def _validate_normalizer_config( + cls, schema: Schema, config: RelationalNormalizerConfig + ) -> None: """Normalizes all known column identifiers according to the schema and then validates the configuration""" def _normalize_prop( @@ -397,7 +433,7 @@ def _normalize_prop( ) validate_dict( - RelationalNormalizerConfig, + cls.RELATIONAL_CONFIG_TYPE, config, "./normalizers/json/config", validator_f=column_name_validator(schema.naming), diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 276bbe9c09..f2d75638fe 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -451,10 +451,12 @@ def drop_tables( ) -> List[TTableSchema]: """Drops tables from the schema and returns the dropped tables""" result = [] + # TODO: make sure all nested tables to table_names are also dropped for table_name in table_names: table = self.get_table(table_name) if table and (not seen_data_only or utils.has_table_seen_data(table)): result.append(self._schema_tables.pop(table_name)) + self.data_item_normalizer.remove_table(table_name) return result def filter_row_with_hint( diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 25c3a0dbae..c062a74920 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -87,7 +87,12 @@ def choose_schema() -> Schema: schema_ = schema # take pipeline schema to make newest version visible to the resources elif pipeline.default_schema_name: - schema_ = pipeline.schemas[pipeline.default_schema_name].clone() + # clones with name which will drop previous hashes + schema_ = pipeline.schemas[pipeline.default_schema_name].clone( + 
with_name=pipeline.default_schema_name + ) + # delete data tables + schema_.drop_tables(schema_.data_table_names(include_incomplete=True)) else: schema_ = pipeline._make_schema_with_default_name() return schema_ diff --git a/docs/tools/check_embedded_snippets.py b/docs/tools/check_embedded_snippets.py index e8399fce6e..b917cafee1 100644 --- a/docs/tools/check_embedded_snippets.py +++ b/docs/tools/check_embedded_snippets.py @@ -21,7 +21,7 @@ SNIPPET_MARKER = "```" -ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql"] +ALLOWED_LANGUAGES = ["py", "toml", "json", "yaml", "text", "sh", "bat", "sql", "hcl"] LINT_TEMPLATE = "./lint_setup/template.py" LINT_FILE = "./lint_setup/lint_me.py" @@ -163,8 +163,11 @@ def parse_snippets(snippets: List[Snippet], verbose: bool) -> None: json.loads(snippet.code) elif snippet.language == "yaml": yaml.safe_load(snippet.code) - # ignore text and sh scripts - elif snippet.language in ["text", "sh", "bat", "sql"]: + elif snippet.language == "hcl": + # TODO: implement hcl parsers + pass + # ignore all other scripts + elif snippet.language in ALLOWED_LANGUAGES: pass else: raise ValueError(f"Unknown language {snippet.language}") diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md index c532f6d357..954c1fb493 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md @@ -256,3 +256,24 @@ SOURCES__SQL_DATABASE__CHUNK_SIZE=1000 SOURCES__SQL_DATABASE__CHAT_MESSAGE__INCREMENTAL__CURSOR_PATH=updated_at ``` +### Configure many sources side by side with custom sections +`dlt` allows you to rename any source to place the source configuration into custom section or to have many instances +of the source created side by side. 
For example: +```py +from dlt.sources.sql_database import sql_database + +my_db = sql_database.with_args(name="my_db", section="my_db")(table_names=["chat_message"]) +print(my_db.name) +``` +Here we create a renamed version of the `sql_database` and then instantiate it. Such a source will read +credentials from: +```toml +[sources.my_db] +credentials="mssql+pyodbc://loader.database.windows.net/dlt_data?trusted_connection=yes&driver=ODBC+Driver+17+for+SQL+Server" +schema="data" +backend="pandas" +chunk_size=1000 + +[sources.my_db.chat_message.incremental] +cursor_path="updated_at" +``` diff --git a/docs/website/docs/general-usage/source.md b/docs/website/docs/general-usage/source.md index a5f1f04dee..87c07a3e44 100644 --- a/docs/website/docs/general-usage/source.md +++ b/docs/website/docs/general-usage/source.md @@ -52,7 +52,6 @@ Do not extract data in the source function. Leave that task to your resources if If this is impractical (for example, you want to reflect a database to create resources for tables), make sure you do not call the source function too often. [See this note if you plan to deploy on Airflow](../walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer.md#2-modify-dag-file) - ## Customize sources ### Access and select resources to load @@ -114,6 +113,22 @@ Note that `add_limit` **does not limit the number of records** but rather the "n Find more on sampling data [here](resource.md#sample-from-large-data). +### Rename the source +`dlt` allows you to rename the source, i.e. to place the source configuration into a custom section or to have many instances +of the source created side by side. For example: +```py +from dlt.sources.sql_database import sql_database + +my_db = sql_database.with_args(name="my_db", section="my_db")(table_names=["table_1"]) +print(my_db.name) +``` +Here we create a renamed version of the `sql_database` and then instantiate it. Such a source will read +credentials from: +```toml +[sources.my_db.my_db.credentials] +password="..." 
+``` + ### Add more resources to existing source You can add a custom resource to a source after it was created. Imagine that you want to score all the deals with a keras model that will tell you if the deal is a fraud or not. In order to do that, you declare a new [transformer that takes the data from](resource.md#feeding-data-from-one-resource-into-another) `deals` resource and add it to the source. diff --git a/docs/website/docs/reference/performance.md b/docs/website/docs/reference/performance.md index 1e58080200..0f536fa786 100644 --- a/docs/website/docs/reference/performance.md +++ b/docs/website/docs/reference/performance.md @@ -265,3 +265,29 @@ DLT_USE_JSON=simplejson Instead of using Python Requests directly, you can use the built-in [requests wrapper](../general-usage/http/requests) or [`RESTClient`](../general-usage/http/rest-client) for API calls. This will make your pipeline more resilient to intermittent network errors and other random glitches. + +## Keep pipeline working folder in a bucket on constrained environments. +`dlt` stores extracted data in load packages in order to load them atomically. In case you extract a lot of data at once (i.e. backfill) or +your runtime env has constrained local storage (i.e. cloud functions) you can keep your data on a bucket by using [FUSE](https://github.com/libfuse/libfuse) or +any other option which your cloud provider supplies. + +`dlt` uses rename when saving files and "committing" packages (folder rename). Those may not be supported on bucket filesystems. Often +`rename` is translated into `copy` automatically. In other cases `dlt` will fall back to copying itself. 
+ +In case of cloud function and gs bucket mounts, increasing the rename limit for folders is possible: +```hcl +volume_mounts { + mount_path = "/usr/src/ingestion/pipeline_storage" + name = "pipeline_bucket" + } +volumes { + name = "pipeline_bucket" + gcs { + bucket = google_storage_bucket.dlt_pipeline_data_bucket.name + read_only = false + mount_options = [ + "rename-dir-limit=100000" + ] + } +} +``` diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 35bc80add2..c35ecdef7f 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -880,6 +880,35 @@ def test_propagation_update_on_table_change(norm: RelationalNormalizer): "table_3" ] == {"_dlt_id": "_dlt_root_id", "prop1": "prop2"} + # force propagation when table has nested table that needs root_key + # also use custom name for row_key + table_4 = new_table( + "table_4", write_disposition="replace", columns=[{"name": "primary_key", "row_key": True}] + ) + table_4_nested = new_table( + "table_4__nested", + parent_table_name="table_4", + columns=[{"name": "_dlt_root_id", "root_key": True}], + ) + # must add table_4 first + norm.schema.update_table(table_4) + norm.schema.update_table(table_4_nested) + # row key table_4 not propagated because it was added before nested that needs that + # TODO: maybe fix it + assert ( + "table_4" not in norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"] + ) + norm.schema.update_table(table_4) + # also custom key was used + assert norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "table_4" + ] == {"primary_key": "_dlt_root_id"} + # drop table from schema + norm.schema.drop_tables(["table_4"]) + assert ( + "table_4" not in norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"] + ) + def test_caching_perf(norm: RelationalNormalizer) -> None: from time import time @@ -893,6 +922,10 @@ def 
test_caching_perf(norm: RelationalNormalizer) -> None: print(f"{time() - start}") +def test_extend_table(norm: RelationalNormalizer) -> None: + pass + + def set_max_nesting(norm: RelationalNormalizer, max_nesting: int) -> None: RelationalNormalizer.update_normalizer_config(norm.schema, {"max_nesting": max_nesting}) norm._reset() diff --git a/tests/helpers/airflow_tests/test_airflow_provider.py b/tests/helpers/airflow_tests/test_airflow_provider.py index 43fb23e48a..2a8e46e2c8 100644 --- a/tests/helpers/airflow_tests/test_airflow_provider.py +++ b/tests/helpers/airflow_tests/test_airflow_provider.py @@ -1,3 +1,7 @@ +import pytest + +pytest.importorskip("airflow") + from airflow import DAG from airflow.decorators import task, dag from airflow.operators.python import PythonOperator diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py index 69e48733e3..06603ffcec 100644 --- a/tests/helpers/airflow_tests/test_airflow_wrapper.py +++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py @@ -2,6 +2,8 @@ import pytest from unittest import mock from typing import Iterator, List + +pytest.importorskip("airflow") from airflow import DAG from airflow.decorators import dag from airflow.operators.python import PythonOperator, get_current_context diff --git a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py index d737f254e3..503aa62359 100644 --- a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py +++ b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py @@ -1,5 +1,8 @@ +import pytest import datetime from pendulum.tz import UTC + +pytest.importorskip("airflow") from airflow import DAG from airflow.decorators import dag, task from airflow.models import DagRun diff --git a/tests/helpers/airflow_tests/utils.py b/tests/helpers/airflow_tests/utils.py index a98ad4333a..4c1482a2ef 100644 --- a/tests/helpers/airflow_tests/utils.py +++ 
b/tests/helpers/airflow_tests/utils.py @@ -2,9 +2,6 @@ import os import argparse import pytest -from airflow.cli.commands.db_command import resetdb -from airflow.configuration import conf -from airflow.models.variable import Variable from dlt.common.configuration.container import Container from dlt.common.configuration.specs import PluggableRunContext @@ -19,6 +16,8 @@ @pytest.fixture(scope="function", autouse=True) def initialize_airflow_db(): + from airflow.models.variable import Variable + setup_airflow() # backup context providers providers = Container()[PluggableRunContext].providers @@ -35,6 +34,9 @@ def initialize_airflow_db(): def setup_airflow() -> None: + from airflow.cli.commands.db_command import resetdb + from airflow.configuration import conf + # Disable loading examples try: conf.add_section("core") diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 0e44c754e7..330f2606ff 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -27,13 +27,17 @@ def _attach(pipeline: Pipeline) -> Pipeline: @dlt.source(section="droppable", name="droppable") -def droppable_source() -> List[DltResource]: +def droppable_source(drop_columns: bool = False) -> List[DltResource]: @dlt.resource def droppable_a( - a: dlt.sources.incremental[int] = dlt.sources.incremental("a", 0) + a: dlt.sources.incremental[int] = dlt.sources.incremental("a", 0, range_start="open") ) -> Iterator[Dict[str, Any]]: - yield dict(a=1, b=2, c=3) - yield dict(a=4, b=23, c=24) + if drop_columns: + yield dict(a=1, b=2) + yield dict(a=4, b=23) + else: + yield dict(a=1, b=2, c=3) + yield dict(a=4, b=23, c=24) @dlt.resource def droppable_b( @@ -47,9 +51,17 @@ def droppable_c( qe: dlt.sources.incremental[int] = dlt.sources.incremental("qe"), ) -> Iterator[Dict[str, Any]]: # Grandchild table - yield dict( - asdasd=2424, qe=111, items=[dict(k=2, r=2, labels=[dict(name="abc"), dict(name="www")])] - ) + if drop_columns: + # dropped asdasd, 
items[r], items.labels.value + yield dict(qe=111, items=[dict(k=2, labels=[dict(name="abc"), dict(name="www")])]) + else: + yield dict( + asdasd=2424, + qe=111, + items=[ + dict(k=2, r=2, labels=[dict(name="abc", value=1), dict(name="www", value=2)]) + ], + ) @dlt.resource def droppable_d( @@ -134,11 +146,17 @@ def assert_destination_state_loaded(pipeline: Pipeline) -> None: ), ids=lambda x: x.name, ) -def test_drop_command_resources_and_state(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize("in_source", (True, False)) +def test_drop_command_resources_and_state( + destination_config: DestinationTestConfiguration, in_source: bool +) -> None: """Test the drop command with resource and state path options and verify correct data is deleted from destination and locally""" - source = droppable_source() - pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), dev_mode=True) + source: Any = droppable_source() + if not in_source: + source = list(source.selected_resources.values()) + + pipeline = destination_config.setup_pipeline("droppable", dev_mode=True) info = pipeline.run(source, **destination_config.run_kwargs) assert_load_info(info) assert load_table_counts(pipeline, *pipeline.default_schema.tables.keys()) == { @@ -173,6 +191,9 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon assert_destination_state_loaded(pipeline) # now run the same droppable_source to see if tables are recreated and they contain right number of items + source = droppable_source(drop_columns=True) + if not in_source: + source = list(source.selected_resources.values()) info = pipeline.run(source, **destination_config.run_kwargs) assert_load_info(info) # 2 versions (one dropped and replaced with schema with dropped tables, then we added missing tables) @@ -192,6 +213,20 @@ def test_drop_command_resources_and_state(destination_config: DestinationTestCon "droppable_c__items": 1, "droppable_c__items__labels": 2, } + # 
check if columns got correctly dropped + droppable_a_schema = pipeline.default_schema.get_table("droppable_a") + # this table was not dropped so column still exists + assert "c" in droppable_a_schema["columns"] + # dropped asdasd, items[r], items.labels.value + droppable_c_schema = pipeline.default_schema.get_table("droppable_c") + assert "asdasd" not in droppable_c_schema["columns"] + assert "qe" in droppable_c_schema["columns"] + droppable_c_i_schema = pipeline.default_schema.get_table("droppable_c__items") + assert "r" not in droppable_c_i_schema["columns"] + assert "k" in droppable_c_i_schema["columns"] + droppable_c_l_schema = pipeline.default_schema.get_table("droppable_c__items__labels") + assert "value" not in droppable_c_l_schema["columns"] + assert "name" in droppable_c_l_schema["columns"] @pytest.mark.parametrize( diff --git a/tests/load/pipeline/test_refresh_modes.py b/tests/load/pipeline/test_refresh_modes.py index 86479acd2b..fb88ba915c 100644 --- a/tests/load/pipeline/test_refresh_modes.py +++ b/tests/load/pipeline/test_refresh_modes.py @@ -1,5 +1,5 @@ from typing import Any, List - +import os import pytest import dlt from dlt.common.destination.exceptions import DestinationUndefinedEntity @@ -12,7 +12,7 @@ from dlt.extract.source import DltSource from dlt.pipeline.state_sync import load_pipeline_state_from_destination -from tests.utils import clean_test_storage +from tests.utils import clean_test_storage, TEST_STORAGE_ROOT from tests.pipeline.utils import ( _is_filesystem, assert_load_info, @@ -106,19 +106,40 @@ def some_data_4(): ), ids=lambda x: x.name, ) -def test_refresh_drop_sources(destination_config: DestinationTestConfiguration): - pipeline = destination_config.setup_pipeline("refresh_full_test", refresh="drop_sources") +@pytest.mark.parametrize("in_source", (True, False)) +@pytest.mark.parametrize("with_wipe", (True, False)) +def test_refresh_drop_sources( + destination_config: DestinationTestConfiguration, in_source: bool, with_wipe: bool 
+): + # do not place duckdb in the working dir, because we may wipe it + os.environ["DESTINATION__DUCKDB__CREDENTIALS"] = os.path.join( + TEST_STORAGE_ROOT, "refresh_source_db.duckdb" + ) + + pipeline = destination_config.setup_pipeline("refresh_source") + + data: Any = refresh_source(first_run=True, drop_sources=True) + if not in_source: + data = list(data.selected_resources.values()) # First run pipeline so destination so tables are created - info = pipeline.run( - refresh_source(first_run=True, drop_sources=True), **destination_config.run_kwargs - ) + info = pipeline.run(data, refresh="drop_sources", **destination_config.run_kwargs) assert_load_info(info) + # Second run of pipeline with only selected resources + if with_wipe: + pipeline._wipe_working_folder() + pipeline = destination_config.setup_pipeline("refresh_source") + + data = refresh_source(first_run=False, drop_sources=True).with_resources( + "some_data_1", "some_data_2" + ) + if not in_source: + data = list(data.selected_resources.values()) + info = pipeline.run( - refresh_source(first_run=False, drop_sources=True).with_resources( - "some_data_1", "some_data_2" - ), + data, + refresh="drop_sources", **destination_config.run_kwargs, ) @@ -199,16 +220,37 @@ def test_existing_schema_hash(destination_config: DestinationTestConfiguration): ), ids=lambda x: x.name, ) -def test_refresh_drop_resources(destination_config: DestinationTestConfiguration): +@pytest.mark.parametrize("in_source", (True, False)) +@pytest.mark.parametrize("with_wipe", (True, False)) +def test_refresh_drop_resources( + destination_config: DestinationTestConfiguration, in_source: bool, with_wipe: bool +): + # do not place duckdb in the working dir, because we may wipe it + os.environ["DESTINATION__DUCKDB__CREDENTIALS"] = os.path.join( + TEST_STORAGE_ROOT, "refresh_source_db.duckdb" + ) # First run pipeline with load to destination so tables are created - pipeline = destination_config.setup_pipeline("refresh_full_test", 
refresh="drop_tables") + pipeline = destination_config.setup_pipeline("refresh_source") - info = pipeline.run(refresh_source(first_run=True), **destination_config.run_kwargs) + data: Any = refresh_source(first_run=True) + if not in_source: + data = list(data.selected_resources.values()) + + info = pipeline.run(data, refresh="drop_resources", **destination_config.run_kwargs) assert_load_info(info) # Second run of pipeline with only selected resources + if with_wipe: + pipeline._wipe_working_folder() + pipeline = destination_config.setup_pipeline("refresh_source") + + data = refresh_source(first_run=False).with_resources("some_data_1", "some_data_2") + if not in_source: + data = list(data.selected_resources.values()) + info = pipeline.run( - refresh_source(first_run=False).with_resources("some_data_1", "some_data_2"), + data, + refresh="drop_resources", **destination_config.run_kwargs, ) @@ -309,7 +351,9 @@ def test_refresh_drop_data_only(destination_config: DestinationTestConfiguration @pytest.mark.parametrize( "destination_config", - destinations_configs(default_sql_configs=True, subset=["duckdb"]), + destinations_configs( + default_sql_configs=True, local_filesystem_configs=True, subset=["duckdb", "filesystem"] + ), ids=lambda x: x.name, ) def test_refresh_drop_sources_multiple_sources(destination_config: DestinationTestConfiguration): @@ -364,7 +408,6 @@ def source_2_data_2(): **destination_config.run_kwargs, ) assert_load_info(info, 2) - # breakpoint() info = pipeline.run( refresh_source_2(first_run=False).with_resources("source_2_data_1"), **destination_config.run_kwargs, @@ -388,7 +431,7 @@ def source_2_data_2(): result = sorted([(row["id"], row["name"]) for row in data["some_data_1"]]) assert result == [(1, "John"), (2, "Jane")] - # # First table from source2 exists, with only first column + # First table from source2 exists, with only first column data = load_tables_to_dicts(pipeline, "source_2_data_1", schema_name="refresh_source_2") 
assert_only_table_columns( pipeline, "source_2_data_1", ["product"], schema_name="refresh_source_2" @@ -396,7 +439,7 @@ def source_2_data_2(): result = sorted([row["product"] for row in data["source_2_data_1"]]) assert result == ["orange", "pear"] - # # Second table from source 2 is gone + # Second table from source 2 is gone assert not table_exists(pipeline, "source_2_data_2", schema_name="refresh_source_2") diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index e58db64e5e..b32854b110 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1566,6 +1566,30 @@ def test_drop() -> None: pipeline.run([1, 2, 3], table_name="numbers") +def test_source_schema_in_resource() -> None: + run_count = 0 + + @dlt.resource + def schema_inspector(): + schema = dlt.current.source_schema() + if run_count == 0: + assert "schema_inspector" not in schema.tables + if run_count == 1: + assert "schema_inspector" in schema.tables + assert schema.tables["schema_inspector"]["columns"]["value"]["x-custom"] == "X" # type: ignore[typeddict-item] + + yield [1, 2, 3] + + pipeline = dlt.pipeline(pipeline_name="test_inspector", destination="duckdb") + pipeline.run(schema_inspector()) + + # add custom annotation + pipeline.default_schema.tables["schema_inspector"]["columns"]["value"]["x-custom"] = "X" # type: ignore[typeddict-unknown-key] + + run_count += 1 + pipeline.run(schema_inspector()) + + def test_schema_version_increase_and_source_update() -> None: now = pendulum.now() From b8bac750fa8089079bf73badaf78c6c514aedbbf Mon Sep 17 00:00:00 2001 From: David Scharf Date: Sun, 15 Dec 2024 17:16:39 +0100 Subject: [PATCH 12/12] prepare dataset release & docs updates (#2126) * remove standalone dataset from exports * make pipeline dataset factory public * rework transformation section * fix some linting errors * add row counts feature for readabledataset * add dataset access example to getting started scripts * add notes about row_counts 
special query to datasets docs * fix internal docusaurus links * Update docs/website/docs/intro.md * Update docs/website/docs/tutorial/load-data-from-an-api.md * Update docs/website/docs/tutorial/load-data-from-an-api.md * Update docs/website/docs/tutorial/load-data-from-an-api.md * Update docs/website/docs/general-usage/dataset-access/dataset.md * Update docs/website/docs/general-usage/dataset-access/dataset.md * Update docs/website/docs/dlt-ecosystem/transformations/index.md * Update docs/website/docs/dlt-ecosystem/transformations/index.md * Update docs/website/docs/dlt-ecosystem/transformations/index.md * Update docs/website/docs/dlt-ecosystem/transformations/index.md * Update docs/website/docs/dlt-ecosystem/destinations/duckdb.md * Update docs/website/docs/dlt-ecosystem/transformations/index.md * Update docs/website/docs/dlt-ecosystem/transformations/index.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/python.md * Update docs/website/docs/dlt-ecosystem/transformations/sql.md * Update docs/website/docs/dlt-ecosystem/transformations/sql.md * Update docs/website/docs/dlt-ecosystem/transformations/sql.md * Update docs/website/docs/dlt-ecosystem/transformations/sql.md * Update docs/website/docs/dlt-ecosystem/transformations/sql.md * Update docs/website/docs/general-usage/dataset-access/dataset.md --------- Co-authored-by: Alena Astrakhantseva --- dlt/__init__.py | 2 - dlt/common/destination/reference.py | 4 + dlt/destinations/dataset/dataset.py | 28 +++- dlt/pipeline/pipeline.py | 2 +- 
.../website/docs/build-a-pipeline-tutorial.md | 27 ++-- .../docs/dlt-ecosystem/destinations/duckdb.md | 2 +- .../dlt-ecosystem/transformations/dbt/dbt.md | 8 +- .../dlt-ecosystem/transformations/index.md | 27 ++++ .../dlt-ecosystem/transformations/pandas.md | 42 ------ .../dlt-ecosystem/transformations/python.md | 109 ++++++++++++++ .../docs/dlt-ecosystem/transformations/sql.md | 55 +++++--- .../verified-sources/rest_api/basic.md | 2 +- .../general-usage/dataset-access/dataset.md | 25 +++- .../dataset-access/ibis-backend.md | 2 +- .../website/docs/general-usage/destination.md | 2 +- docs/website/docs/general-usage/state.md | 15 +- docs/website/docs/intro.md | 18 ++- .../docs/tutorial/load-data-from-an-api.md | 20 ++- docs/website/sidebars.js | 11 +- .../test_readable_dbapi_dataset.py | 8 +- tests/extract/test_incremental.py | 8 +- tests/load/duckdb/test_duckdb_client.py | 4 +- tests/load/filesystem/test_sql_client.py | 6 +- tests/load/pipeline/test_bigquery.py | 8 +- tests/load/pipeline/test_duckdb.py | 4 +- tests/load/test_read_interfaces.py | 133 ++++++++++++++---- tests/pipeline/test_dlt_versions.py | 2 +- tests/pipeline/test_pipeline.py | 8 +- tests/pipeline/test_pipeline_extra.py | 4 +- 29 files changed, 432 insertions(+), 154 deletions(-) create mode 100644 docs/website/docs/dlt-ecosystem/transformations/index.md delete mode 100644 docs/website/docs/dlt-ecosystem/transformations/pandas.md create mode 100644 docs/website/docs/dlt-ecosystem/transformations/python.md diff --git a/dlt/__init__.py b/dlt/__init__.py index e8a1b7bf92..328817efd2 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -42,7 +42,6 @@ ) from dlt.pipeline import progress from dlt import destinations -from dlt.destinations.dataset import dataset as _dataset pipeline = _pipeline current = _current @@ -80,7 +79,6 @@ "TCredentials", "sources", "destinations", - "_dataset", ] # verify that no injection context was created diff --git a/dlt/common/destination/reference.py 
b/dlt/common/destination/reference.py index 048fe2186f..827034ddca 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -592,6 +592,10 @@ def __getattr__(self, table: str) -> SupportsReadableRelation: ... def ibis(self) -> IbisBackend: ... + def row_counts( + self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None + ) -> SupportsReadableRelation: ... + class JobClientBase(ABC): def __init__( diff --git a/dlt/destinations/dataset/dataset.py b/dlt/destinations/dataset/dataset.py index e443045e49..fc55393a60 100644 --- a/dlt/destinations/dataset/dataset.py +++ b/dlt/destinations/dataset/dataset.py @@ -1,4 +1,4 @@ -from typing import Any, Union, TYPE_CHECKING +from typing import Any, Union, TYPE_CHECKING, List from dlt.common.json import json @@ -133,6 +133,32 @@ def table(self, table_name: str) -> SupportsReadableRelation: table_name=table_name, ) # type: ignore[abstract] + def row_counts( + self, *, data_tables: bool = True, dlt_tables: bool = False, table_names: List[str] = None + ) -> SupportsReadableRelation: + """Returns a dictionary of table names and their row counts, returns counts of all data tables by default""" + """If table_names is provided, only the tables in the list are returned regardless of the data_tables and dlt_tables flags""" + + selected_tables = table_names or [] + if not selected_tables: + if data_tables: + selected_tables += self.schema.data_table_names(seen_data_only=True) + if dlt_tables: + selected_tables += self.schema.dlt_table_names() + + # Build UNION ALL query to get row counts for all selected tables + queries = [] + for table in selected_tables: + queries.append( + f"SELECT '{table}' as table_name, COUNT(*) as row_count FROM" + f" {self.sql_client.make_qualified_table_name(table)}" + ) + + query = " UNION ALL ".join(queries) + + # Execute query and build result dict + return self(query) + def __getitem__(self, table_name: str) -> SupportsReadableRelation: 
"""access of table via dict notation""" return self.table(table_name) diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 9bd2d6911f..74466a09e4 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1750,7 +1750,7 @@ def __getstate__(self) -> Any: # pickle only the SupportsPipeline protocol fields return {"pipeline_name": self.pipeline_name} - def _dataset( + def dataset( self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "auto" ) -> SupportsReadableDataset: """Returns a dataset object for querying the destination data. diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index f85d2e19ea..36d30a184f 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -262,20 +262,30 @@ In this example, the first pipeline loads the data using `pipedrive_source()`. T #### [Using the `dlt` SQL client](dlt-ecosystem/transformations/sql.md) -Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. Here's an example of inserting a row into the `customers` table using the `dlt` SQL client: +Another option is to leverage the `dlt` SQL client to query the loaded data and perform transformations using SQL statements. You can execute SQL statements that change the database schema or manipulate data within tables. 
Here's an example of creating a new table with aggregated sales data in duckdb: ```py -pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") +pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm") with pipeline.sql_client() as client: client.execute_sql( - "INSERT INTO customers VALUES (%s, %s, %s)", 10, "Fred", "fred@fred.com" - ) + """ CREATE TABLE aggregated_sales AS + SELECT + category, + region, + SUM(amount) AS total_sales, + AVG(amount) AS average_sales + FROM + sales + GROUP BY + category, + region; + """) ``` In this example, the `execute_sql` method of the SQL client allows you to execute SQL statements. The statement inserts a row with values into the `customers` table. -#### [Using Pandas](dlt-ecosystem/transformations/pandas.md) +#### [Using Pandas](dlt-ecosystem/transformations/python.md) You can fetch query results as Pandas data frames and perform transformations using Pandas functionalities. Here's an example of reading data from the `issues` table in DuckDB and counting reaction types using Pandas: @@ -287,11 +297,8 @@ pipeline = dlt.pipeline( dev_mode=True ) -with pipeline.sql_client() as client: - with client.execute_query( - 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' - ) as cursor: - reactions = cursor.df() +# get a dataframe of all reactions from the dataset +reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df() counts = reactions.sum(0).sort_values(0, ascending=False) ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 2b284e991a..a4537195ff 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -118,7 +118,7 @@ to disable tz adjustments. 
## Destination configuration -By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in `read/write` mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. +By default, a DuckDB database will be created in the current working directory with a name `.duckdb` (`chess.duckdb` in the example above). After loading, it is available in **read/write** mode via `with pipeline.sql_client() as con:`, which is a wrapper over `DuckDBPyConnection`. See [duckdb docs](https://duckdb.org/docs/api/python/overview#persistent-storage) for details. If you want to **read** data, use [pipeline.dataset()](../../general-usage/dataset-access/dataset) instead of `sql_client`. The `duckdb` credentials do not require any secret values. [You are free to pass the credentials and configuration explicitly](../../general-usage/destination.md#pass-explicit-credentials). For example: ```py diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md index 449f8b8bde..59eb340ef2 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt.md @@ -1,10 +1,10 @@ --- -title: Transform the data with dbt +title: Transforming data with dbt description: Transforming the data loaded by a dlt pipeline with dbt keywords: [transform, dbt, runner] --- -# Transform the data with dbt +# Transforming data with dbt [dbt](https://github.com/dbt-labs/dbt-core) is a framework that allows for the simple structuring of your transformations into DAGs. 
The benefits of using dbt include: @@ -105,8 +105,8 @@ You can run the example with dbt debug log: `RUNTIME__LOG_LEVEL=DEBUG python dbt ## Other transforming tools -If you want to transform the data before loading, you can use Python. If you want to transform the data after loading, you can use dbt or one of the following: +If you want to transform your data before loading, you can use Python. If you want to transform your data after loading, you can use dbt or one of the following: 1. [`dlt` SQL client.](../sql.md) -2. [Pandas.](../pandas.md) +2. [Python with dataframes or arrow tables.](../python.md) diff --git a/docs/website/docs/dlt-ecosystem/transformations/index.md b/docs/website/docs/dlt-ecosystem/transformations/index.md new file mode 100644 index 0000000000..6c51e8cd8d --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/transformations/index.md @@ -0,0 +1,27 @@ +--- +title: Transforming your data +description: How to transform your data +keywords: [datasets, data, access, transformations] +--- +import DocCardList from '@theme/DocCardList'; + +# Transforming data + +If you'd like to transform your data after a pipeline load, you have 3 options available to you: + +* [Using dbt](./dbt/dbt.md) - dlt provides a convenient dbt wrapper to make integration easier. +* [Using the `dlt` SQL client](./sql.md) - dlt exposes an SQL client to transform data on your destination directly using SQL. +* [Using Python with DataFrames or Arrow tables](./python.md) - you can also transform your data using Arrow tables and DataFrames in Python. 
+ +If you need to preprocess some of your data before it is loaded, you can learn about strategies to: + +* [Rename columns.](../../general-usage/customising-pipelines/renaming_columns) +* [Pseudonymize columns.](../../general-usage/customising-pipelines/pseudonymizing_columns) +* [Remove columns.](../../general-usage/customising-pipelines/removing_columns) + +This is particularly useful if you are trying to remove data related to PII or other sensitive data, you want to remove columns that are not needed for your use case or you are using a destination that does not support certain data types in your source data. + + +# Learn more + + diff --git a/docs/website/docs/dlt-ecosystem/transformations/pandas.md b/docs/website/docs/dlt-ecosystem/transformations/pandas.md deleted file mode 100644 index e431313d1c..0000000000 --- a/docs/website/docs/dlt-ecosystem/transformations/pandas.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Transform the data with Pandas -description: Transform the data loaded by a dlt pipeline with Pandas -keywords: [transform, pandas] ---- - -# Transform the data with Pandas - -You can fetch the results of any SQL query as a dataframe. If the destination supports that -natively (i.e., BigQuery and DuckDB), `dlt` uses the native method. Thanks to this, reading -dataframes can be really fast! The example below reads GitHub reactions data from the `issues` table and -counts the reaction types. 
- -```py -pipeline = dlt.pipeline( - pipeline_name="github_pipeline", - destination="duckdb", - dataset_name="github_reactions", - dev_mode=True -) -with pipeline.sql_client() as client: - with client.execute_query( - 'SELECT "reactions__+1", "reactions__-1", reactions__laugh, reactions__hooray, reactions__rocket FROM issues' - ) as cursor: - # calling `df` on a cursor, returns the data as a pandas data frame - reactions = cursor.df() -counts = reactions.sum(0).sort_values(0, ascending=False) -``` - -The `df` method above returns all the data in the cursor as a data frame. You can also fetch data in -chunks by passing the `chunk_size` argument to the `df` method. - -Once your data is in a Pandas dataframe, you can transform it as needed. - -## Other transforming tools - -If you want to transform the data before loading, you can use Python. If you want to transform the -data after loading, you can use Pandas or one of the following: - -1. [dbt.](dbt/dbt.md) (recommended) -2. [`dlt` SQL client.](sql.md) - diff --git a/docs/website/docs/dlt-ecosystem/transformations/python.md b/docs/website/docs/dlt-ecosystem/transformations/python.md new file mode 100644 index 0000000000..d43f8caaca --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/transformations/python.md @@ -0,0 +1,109 @@ +--- +title: Transforming data in Python with Arrow tables or DataFrames +description: Transforming data loaded by a dlt pipeline with pandas dataframes or arrow tables +keywords: [transform, pandas] +--- + +# Transforming data in Python with Arrow tables or DataFrames + +You can transform your data in Python using Pandas DataFrames or Arrow tables. To get started, please read the [dataset docs](../../general-usage/dataset-access/dataset). 
+ + +## Interactively transforming your data in Python + +Using the methods explained in the [dataset docs](../../general-usage/dataset-access/dataset), you can fetch data from your destination into a DataFrame or Arrow table in your local Python process and work with it interactively. This even works for filesystem destinations: + + +The example below reads GitHub reactions data from the `issues` table and +counts the reaction types. + +```py +pipeline = dlt.pipeline( + pipeline_name="github_pipeline", + destination="duckdb", + dataset_name="github_reactions", + dev_mode=True +) + +# get a dataframe of all reactions from the dataset +reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").df() + +# calculate and print out the sum of all reactions +counts = reactions.sum(0).sort_values(0, ascending=False) +print(counts) + +# alternatively, you can fetch the data as an arrow table +reactions = pipeline.dataset().issues.select("reactions__+1", "reactions__-1", "reactions__laugh", "reactions__hooray", "reactions__rocket").arrow() +# ... do transformations on the arrow table +``` + +## Persisting your transformed data + +Since dlt supports DataFrames and Arrow tables from resources directly, you can use the same pipeline to load the transformed data back into the destination. + + +### A simple example + +A simple example that creates a new table from an existing user table but only with columns that do not contain private information. Note that we use the `iter_arrow()` method on the relation to iterate over the arrow table instead of fetching it all at once. 
+ +```py +pipeline = dlt.pipeline( + pipeline_name="users_pipeline", + destination="duckdb", + dataset_name="users_raw", + dev_mode=True +) + +# get user relation with only a few columns selected, but omitting email and name +users = pipeline.dataset().users.select("age", "amount_spent", "country") + +# load the data into a new table called users_clean in the same dataset +pipeline.run(users.iter_arrow(chunk_size=1000), table_name="users_clean") +``` + +### A more complex example + +The example above could easily be done in SQL. Let's assume you'd like to do some Arrow transformations in Python instead. For this, we will create a resource from which we can yield the modified Arrow tables. The same is possible with DataFrames. + +```py +import pyarrow.compute as pc + +pipeline = dlt.pipeline( + pipeline_name="users_pipeline", + destination="duckdb", + dataset_name="users_raw", + dev_mode=True +) + +# NOTE: this resource will work like a regular resource and support write_disposition, primary_key, etc. +# NOTE: For selecting only users above 18, we could also use the filter method on the relation with ibis expressions +@dlt.resource(table_name="users_clean") +def users_clean(): + users = pipeline.dataset().users + for arrow_table in users.iter_arrow(chunk_size=1000): + + # we want to filter out users under 18 + age_filter = pc.greater_equal(arrow_table["age"], 18) + arrow_table = arrow_table.filter(age_filter) + + # we want to hash the email column + arrow_table = arrow_table.append_column("email_hash", pc.sha256(arrow_table["email"])) + + # we want to remove the email column and name column + arrow_table = arrow_table.drop(["email", "name"]) + + # yield the transformed arrow table + yield arrow_table + + +pipeline.run(users_clean()) +``` + +## Other transforming tools + +If you want to transform your data before loading, you can use Python. If you want to transform the +data after loading, you can use Pandas or one of the following: + +1. 
[dbt.](dbt/dbt.md) (recommended) +2. [`dlt` SQL client.](sql.md) + diff --git a/docs/website/docs/dlt-ecosystem/transformations/sql.md b/docs/website/docs/dlt-ecosystem/transformations/sql.md index ffd348d1a0..60f3e7f7a5 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/sql.md +++ b/docs/website/docs/dlt-ecosystem/transformations/sql.md @@ -1,33 +1,52 @@ --- -title: Transform the data with SQL +title: Transforming data with SQL description: Transforming the data loaded by a dlt pipeline with the dlt SQL client keywords: [transform, sql] --- -# Transform the data using the `dlt` SQL client +# Transforming data using the `dlt` SQL client A simple alternative to dbt is to query the data using the `dlt` SQL client and then perform the -transformations using Python. The `execute_sql` method allows you to execute any SQL statement, +transformations using SQL statements in Python. The `execute_sql` method allows you to execute any SQL statement, including statements that change the database schema or data in the tables. In the example below, we insert a row into the `customers` table. Note that the syntax is the same as for any standard `dbapi` connection. +:::info +* This method will work for all SQL destinations supported by `dlt`, but not for the filesystem destination. +* Read the [SQL client docs](../../general-usage/dataset-access/sql-client) for more information on how to access data with the SQL client. +* If you are simply trying to read data, you should use the powerful [dataset interface](../../general-usage/dataset-access/dataset) instead. +::: + + +Typically you will use this type of transformation if you can create or update tables directly from existing tables +without any need to insert data from your Python environment. 
+ +The example below creates a new table `aggregated_sales` that contains the total and average sales for each category and region + + ```py -pipeline = dlt.pipeline(destination="bigquery", dataset_name="crm") -try: - with pipeline.sql_client() as client: - client.execute_sql( - "INSERT INTO customers VALUES (%s, %s, %s)", - 10, - "Fred", - "fred@fred.com" - ) -except Exception: - ... +pipeline = dlt.pipeline(destination="duckdb", dataset_name="crm") + +# NOTE: this is the duckdb sql dialect, other destinations may use different expressions +with pipeline.sql_client() as client: + client.execute_sql( + """ CREATE OR REPLACE TABLE aggregated_sales AS + SELECT + category, + region, + SUM(amount) AS total_sales, + AVG(amount) AS average_sales + FROM + sales + GROUP BY + category, + region; + """) ``` -In the case of SELECT queries, the data is returned as a list of rows, with the elements of a row -corresponding to selected columns. +You can also use the `execute_sql` method to run select queries. The data is returned as a list of rows, with the elements of a row +corresponding to selected columns. A more convenient way to extract data is to use dlt datasets. ```py try: @@ -44,9 +63,9 @@ except Exception: ## Other transforming tools -If you want to transform the data before loading, you can use Python. If you want to transform the +If you want to transform your data before loading, you can use Python. If you want to transform the data after loading, you can use SQL or one of the following: 1. [dbt](dbt/dbt.md) (recommended). -2. [Pandas](pandas.md). +2. [Python with DataFrames or Arrow tables](python.md). 
diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md index 14d9ecb04b..ea3c9c768b 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md @@ -306,7 +306,7 @@ A resource configuration is used to define a [dlt resource](../../../general-usa - `write_disposition`: The write disposition for the resource. - `primary_key`: The primary key for the resource. - `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details. -- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform the data. +- `processing_steps`: A list of [processing steps](#processing-steps-filter-and-transform-data) to filter and transform your data. - `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource. - `auth`: An optional `AuthConfig` instance. If passed, is used over the one defined in the [client](#client) definition. 
Example: ```py diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index b2e3f03d4d..f9c01603f6 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -19,7 +19,7 @@ Here's a full example of how to retrieve data from a pipeline and load it into a # and you have loaded data to a table named 'items' in the destination # Step 1: Get the readable dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() # Step 2: Access a table as a ReadableRelation items_relation = dataset.items # Or dataset["items"] @@ -39,7 +39,10 @@ Assuming you have a `Pipeline` object (let's call it `pipeline`), you can obtain ```py # Get the readable dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() + +# print the row counts of all tables in the destination as dataframe +print(dataset.row_counts().df()) ``` ### Access tables as `ReadableRelation` @@ -116,6 +119,18 @@ for items_chunk in items_relation.iter_fetch(chunk_size=500): The methods available on the ReadableRelation correspond to the methods available on the cursor returned by the SQL client. Please refer to the [SQL client](./sql-client.md#supported-methods-on-the-cursor) guide for more information. +## Special queries + +You can use the `row_counts` method to get the row counts of all tables in the destination as a DataFrame. + +```py +# print the row counts of all tables in the destination as dataframe +print(dataset.row_counts().df()) + +# or as tuples +print(dataset.row_counts().fetchall()) +``` + ## Modifying queries You can refine your data retrieval by limiting the number of records, selecting specific columns, or chaining these operations. 
@@ -168,7 +183,7 @@ dlt will then wrap an `ibis.UnboundTable` with a `ReadableIbisRelation` object u ```py # now that ibis is installed, we can get a dataset with ibis relations -dataset = pipeline._dataset() +dataset = pipeline.dataset() # get two relations items_relation = dataset["items"] @@ -284,7 +299,9 @@ other_pipeline = dlt.pipeline(pipeline_name="other_pipeline", destination="duckd other_pipeline.run(limited_items_relation.iter_arrow(chunk_size=10_000), table_name="limited_items") ``` -### Using `ibis` to query the data +Learn more about [transforming data in Python with Arrow tables or DataFrames](../../dlt-ecosystem/transformations/python). + +### Using `ibis` to query data Visit the [Native Ibis integration](./ibis-backend.md) guide to learn more. diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md index 9f9b65e9c0..bc8487940e 100644 --- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md +++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md @@ -28,7 +28,7 @@ pip install ibis-framework[duckdb] ```py # get the dataset from the pipeline -dataset = pipeline._dataset() +dataset = pipeline.dataset() dataset_name = pipeline.dataset_name # get the native ibis connection from the dataset diff --git a/docs/website/docs/general-usage/destination.md b/docs/website/docs/general-usage/destination.md index fa133b6257..ba42869957 100644 --- a/docs/website/docs/general-usage/destination.md +++ b/docs/website/docs/general-usage/destination.md @@ -128,7 +128,7 @@ When loading data, `dlt` will access the destination in two cases: 1. At the beginning of the `run` method to sync the pipeline state with the destination (or if you call `pipeline.sync_destination` explicitly). 2. In the `pipeline.load` method - to migrate the schema and load the load package. 
-Obviously, `dlt` will access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). +`dlt` will also access the destination when you instantiate [sql_client](../dlt-ecosystem/transformations/sql.md). :::note `dlt` will not import the destination dependencies or access destination configuration if access is not needed. You can build multi-stage pipelines where steps are executed in separate processes or containers - the `extract` and `normalize` step do not need destination dependencies, configuration, and actual connection. diff --git a/docs/website/docs/general-usage/state.md b/docs/website/docs/general-usage/state.md index 46aa1d63ce..d1fb426452 100644 --- a/docs/website/docs/general-usage/state.md +++ b/docs/website/docs/general-usage/state.md @@ -123,14 +123,13 @@ def comments(user_id: str): # on the first pipeline run, the user_comments table does not yet exist so do not check at all # alternatively, catch DatabaseUndefinedRelation which is raised when an unknown table is selected if not current_pipeline.first_run: - with current_pipeline.sql_client() as client: - # we may get the last user comment or None which we replace with 0 - max_id = ( - client.execute_sql( - "SELECT MAX(_id) FROM user_comments WHERE user_id=?", user_id - )[0][0] - or 0 - ) + # get user comments table from pipeline dataset + user_comments = current_pipeline.dataset().user_comments + # get last user comment id with ibis expression, ibis-extras need to be installed + max_id_df = user_comments.filter(user_comments.user_id == user_id).select(user_comments["_id"].max()).df() + # if there are no comments for the user, max_id will be None, so we replace it with 0 + max_id = max_id_df[0][0] if len(max_id_df.index) else 0 + # use max_id to filter our results (we simulate an API query) yield from [ {"_id": i, "value": letter, "user_id": user_id} diff --git a/docs/website/docs/intro.md b/docs/website/docs/intro.md index b20d41c494..bc227b85ad 100644 --- 
a/docs/website/docs/intro.md +++ b/docs/website/docs/intro.md @@ -70,6 +70,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(source) + +# print load info and posts table as dataframe +print(load_info) +print(pipeline.dataset().posts.df()) ``` Follow the [REST API source tutorial](./tutorial/rest-api) to learn more about the source configuration and pagination methods. @@ -92,6 +96,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(source) + +# print load info and the "family" table as dataframe +print(load_info) +print(pipeline.dataset().family.df()) ``` Follow the [SQL source tutorial](./tutorial/sql-database) to learn more about the source configuration and supported databases. @@ -116,6 +124,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(resource) + +# print load info and the "example" table as dataframe +print(load_info) +print(pipeline.dataset().example.df()) ``` Follow the [filesystem source tutorial](./tutorial/filesystem) to learn more about the source configuration and supported storage services. @@ -128,7 +140,7 @@ dlt is able to load data from Python generators or directly from Python data str ```py import dlt -@dlt.resource +@dlt.resource(table_name="foo_data") def foo(): for i in range(10): yield {"id": i, "name": f"This is item {i}"} @@ -139,6 +151,10 @@ pipeline = dlt.pipeline( ) load_info = pipeline.run(foo) + +# print load info and the "foo_data" table as dataframe +print(load_info) +print(pipeline.dataset().foo_data.df()) ``` Check out the [Python data structures tutorial](./tutorial/load-data-from-an-api) to learn about dlt fundamentals and advanced usage scenarios. 
diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index ddfef2cbe8..73f780ba7a 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -72,7 +72,25 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs `dlt` just created a database schema called **mydata** (the `dataset_name`) with a table **users** in it. -### Explore the data +### Explore data in Python + +You can use dlt [datasets](../general-usage/dataset-access/dataset) to easily query the data in pure Python. + +```py +# get the dataset +dataset = pipeline.dataset("mydata") + +# get the user relation +table = dataset.users + +# query the full table as dataframe +print(table.df()) + +# query the first 10 rows as arrow table +print(table.limit(10).arrow()) +``` + +### Explore data in Streamlit To allow a sneak peek and basic discovery, you can take advantage of [built-in integration with Streamlit](../reference/command-line-interface#show-tables-and-data-in-the-destination): diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 8e8c11fc09..ca75c29392 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -211,13 +211,10 @@ const sidebars = { }, { type: 'category', - label: 'Transform the data', + label: 'Transforming data', link: { - type: 'generated-index', - title: 'Transform the data', - description: 'If you want to transform the data after loading, you can use one of the following methods: dbt, SQL, Pandas.', - slug: 'dlt-ecosystem/transformations', - keywords: ['transformations'], + type: 'doc', + id: 'dlt-ecosystem/transformations/index', }, items: [ { @@ -228,8 +225,8 @@ const sidebars = { 'dlt-ecosystem/transformations/dbt/dbt_cloud', ] }, + 'dlt-ecosystem/transformations/python', 'dlt-ecosystem/transformations/sql', - 'dlt-ecosystem/transformations/pandas', 'general-usage/customising-pipelines/renaming_columns', 
'general-usage/customising-pipelines/pseudonymizing_columns', 'general-usage/customising-pipelines/removing_columns' diff --git a/tests/destinations/test_readable_dbapi_dataset.py b/tests/destinations/test_readable_dbapi_dataset.py index bc58a18fa0..e3b318e8d4 100644 --- a/tests/destinations/test_readable_dbapi_dataset.py +++ b/tests/destinations/test_readable_dbapi_dataset.py @@ -9,7 +9,7 @@ def test_query_builder() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() # default query for a table assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' @@ -55,7 +55,7 @@ def test_query_builder() -> None: def test_copy_and_chaining() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() # create releation and set some stuff on it relation = dataset.items @@ -80,7 +80,7 @@ def test_copy_and_chaining() -> None: def test_computed_schema_columns() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() relation = dataset.items # no schema present @@ -107,7 +107,7 @@ def test_computed_schema_columns() -> None: def test_prevent_changing_relation_with_query() -> None: - dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() + dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline").dataset() relation = dataset("SELECT * FROM something") with pytest.raises(ReadableRelationHasQueryException): diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 3ebc9d1201..d63dac93f2 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -228,7 +228,7 @@ def test_pandas_index_as_dedup_key() -> 
None: no_index_r = some_data.with_name(new_name="no_index") p.run(no_index_r) p.run(no_index_r) - data_ = p._dataset().no_index.arrow() + data_ = p.dataset().no_index.arrow() assert data_.schema.names == ["created_at", "id"] assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] @@ -240,7 +240,7 @@ def test_pandas_index_as_dedup_key() -> None: unnamed_index_r.incremental.primary_key = "__index_level_0__" p.run(unnamed_index_r) p.run(unnamed_index_r) - data_ = p._dataset().unnamed_index.arrow() + data_ = p.dataset().unnamed_index.arrow() assert data_.schema.names == ["created_at", "id", "index_level_0"] # indexes 2 and 3 are removed from second batch because they were in the previous batch # and the created_at overlapped so they got deduplicated @@ -258,7 +258,7 @@ def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: named_index_r.incremental.primary_key = "order_id" p.run(named_index_r) p.run(named_index_r) - data_ = p._dataset().named_index.arrow() + data_ = p.dataset().named_index.arrow() assert data_.schema.names == ["created_at", "id", "order_id"] assert data_["order_id"].to_pylist() == [0, 1, 2, 3, 4, 0, 1, 4] @@ -268,7 +268,7 @@ def _make_named_index(df_: pd.DataFrame) -> pd.DataFrame: ) p.run(named_index_impl_r) p.run(named_index_impl_r) - data_ = p._dataset().named_index_impl.arrow() + data_ = p.dataset().named_index_impl.arrow() assert data_.schema.names == ["created_at", "id"] assert data_["id"].to_pylist() == ["a", "b", "c", "d", "e", "f", "g"] diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index 49475ce43f..652f75772a 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -282,14 +282,14 @@ def test_drops_pipeline_changes_bound() -> None: p = dlt.pipeline(pipeline_name="quack_pipeline", destination="duckdb") p.run([1, 2, 3], table_name="p_table") p = p.drop() - assert len(p._dataset().p_table.fetchall()) == 3 + assert 
len(p.dataset().p_table.fetchall()) == 3 # drops internal duckdb p = dlt.pipeline(pipeline_name="quack_pipeline", destination=duckdb(":pipeline:")) p.run([1, 2, 3], table_name="p_table") p = p.drop() with pytest.raises(DatabaseUndefinedRelation): - p._dataset().p_table.fetchall() + p.dataset().p_table.fetchall() def test_duckdb_database_delete() -> None: diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index 4f537d129c..cf4bbfb286 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -377,7 +377,7 @@ def items(): pipeline.run([items()], loader_file_format=destination_config.file_format) - df = pipeline._dataset().items.df() + df = pipeline.dataset().items.df() assert len(df.index) == 20 @dlt.resource(table_name="items") @@ -387,5 +387,5 @@ def items2(): pipeline.run([items2()], loader_file_format=destination_config.file_format) # check df and arrow access - assert len(pipeline._dataset().items.df().index) == 50 - assert pipeline._dataset().items.arrow().num_rows == 50 + assert len(pipeline.dataset().items.df().index) == 50 + assert pipeline.dataset().items.arrow().num_rows == 50 diff --git a/tests/load/pipeline/test_bigquery.py b/tests/load/pipeline/test_bigquery.py index cb65c6bcf1..83982bb998 100644 --- a/tests/load/pipeline/test_bigquery.py +++ b/tests/load/pipeline/test_bigquery.py @@ -384,8 +384,8 @@ def resource(): bigquery_adapter(resource, autodetect_schema=True) pipeline.run(resource) - assert len(pipeline._dataset().items.df()) == 5 - assert len(pipeline._dataset().items__nested.df()) == 5 + assert len(pipeline.dataset().items.df()) == 5 + assert len(pipeline.dataset().items__nested.df()) == 5 @dlt.resource(primary_key="id", table_name="items", write_disposition="merge") def resource2(): @@ -395,5 +395,5 @@ def resource2(): bigquery_adapter(resource2, autodetect_schema=True) pipeline.run(resource2) - assert len(pipeline._dataset().items.df()) == 7 - assert 
len(pipeline._dataset().items__nested.df()) == 7 + assert len(pipeline.dataset().items.df()) == 7 + assert len(pipeline.dataset().items__nested.df()) == 7 diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index a7aa4d36e4..2d1138a51d 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ -273,10 +273,10 @@ def test_duckdb_credentials_separation( p2 = dlt.pipeline("p2", destination=duckdb(credentials=":pipeline:")) p1.run([1, 2, 3], table_name="p1_data") - p1_dataset = p1._dataset() + p1_dataset = p1.dataset() p2.run([1, 2, 3], table_name="p2_data") - p2_dataset = p2._dataset() + p2_dataset = p2.dataset() # both dataset should have independent duckdb databases # destinations should be bounded to pipelines still diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py index d2f5f7951e..bca844d2c8 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -25,6 +25,7 @@ ReadableRelationUnknownColumnException, ) from tests.load.utils import drop_pipeline_data +from dlt.destinations.dataset import dataset as _dataset EXPECTED_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] @@ -169,9 +170,9 @@ def test_explicit_dataset_type_selection(populated_pipeline: Pipeline): from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation assert isinstance( - populated_pipeline._dataset(dataset_type="default").items, ReadableDBAPIRelation + populated_pipeline.dataset(dataset_type="default").items, ReadableDBAPIRelation ) - assert isinstance(populated_pipeline._dataset(dataset_type="ibis").items, ReadableIbisRelation) + assert isinstance(populated_pipeline.dataset(dataset_type="ibis").items, ReadableIbisRelation) @pytest.mark.no_load @@ -183,7 +184,7 @@ def test_explicit_dataset_type_selection(populated_pipeline: Pipeline): ids=lambda x: x.name, ) def test_arrow_access(populated_pipeline: Pipeline) -> None: - 
table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -216,7 +217,7 @@ def test_arrow_access(populated_pipeline: Pipeline) -> None: ) def test_dataframe_access(populated_pipeline: Pipeline) -> None: # access via key - table_relationship = populated_pipeline._dataset()["items"] + table_relationship = populated_pipeline.dataset()["items"] total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -233,7 +234,6 @@ def test_dataframe_access(populated_pipeline: Pipeline) -> None: if not skip_df_chunk_size_check: assert len(df.index) == chunk_size - # lowercase results for the snowflake case assert set(df.columns.values) == set(EXPECTED_COLUMNS) # iterate all dataframes @@ -256,7 +256,7 @@ def test_dataframe_access(populated_pipeline: Pipeline) -> None: ) def test_db_cursor_access(populated_pipeline: Pipeline) -> None: # check fetch accessors - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items total_records = _total_records(populated_pipeline) chunk_size = _chunk_size(populated_pipeline) expected_chunk_counts = _expected_chunk_count(populated_pipeline) @@ -290,8 +290,7 @@ def test_db_cursor_access(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_hint_preservation(populated_pipeline: Pipeline) -> None: - # NOTE: for now hints are only preserved for the default dataset - table_relationship = populated_pipeline._dataset(dataset_type="default").items + table_relationship = populated_pipeline.dataset(dataset_type="default").items # check that hints are carried over to arrow table expected_decimal_precision = 10 expected_decimal_precision_2 = 12 @@ -319,10 +318,94 
@@ def test_hint_preservation(populated_pipeline: Pipeline) -> None: ) def test_loads_table_access(populated_pipeline: Pipeline) -> None: # check loads table access, we should have one entry - loads_table = populated_pipeline._dataset()[populated_pipeline.default_schema.loads_table_name] + loads_table = populated_pipeline.dataset()[populated_pipeline.default_schema.loads_table_name] assert len(loads_table.fetchall()) == 1 +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_row_counts(populated_pipeline: Pipeline) -> None: + total_records = _total_records(populated_pipeline) + + dataset = populated_pipeline.dataset() + # default is all data tables + assert set(dataset.row_counts().df().itertuples(index=False, name=None)) == { + ( + "items", + total_records, + ), + ( + "double_items", + total_records, + ), + ( + "items__children", + total_records * 2, + ), + } + # get only one data table + assert set( + dataset.row_counts(table_names=["items"]).df().itertuples(index=False, name=None) + ) == { + ( + "items", + total_records, + ), + } + # get all dlt tables + assert set( + dataset.row_counts(dlt_tables=True, data_tables=False) + .df() + .itertuples(index=False, name=None) + ) == { + ( + "_dlt_version", + 1, + ), + ( + "_dlt_loads", + 1, + ), + ( + "_dlt_pipeline_state", + 1, + ), + } + # get them all + assert set(dataset.row_counts(dlt_tables=True).df().itertuples(index=False, name=None)) == { + ( + "_dlt_version", + 1, + ), + ( + "_dlt_loads", + 1, + ), + ( + "_dlt_pipeline_state", + 1, + ), + ( + "items", + total_records, + ), + ( + "double_items", + total_records, + ), + ( + "items__children", + total_records * 2, + ), + } + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -334,7 +417,7 @@ def test_loads_table_access(populated_pipeline: Pipeline) -> None: def test_sql_queries(populated_pipeline: Pipeline) -> None: # simple check 
that query also works tname = populated_pipeline.sql_client().make_qualified_table_name("items") - query_relationship = populated_pipeline._dataset()(f"select * from {tname} where id < 20") + query_relationship = populated_pipeline.dataset()(f"select * from {tname} where id < 20") # we selected the first 20 table = query_relationship.arrow() @@ -346,7 +429,7 @@ def test_sql_queries(populated_pipeline: Pipeline) -> None: f"SELECT i.id, di.double_id FROM {tname} as i JOIN {tdname} as di ON (i.id = di.id) WHERE" " i.id < 20 ORDER BY i.id ASC" ) - join_relationship = populated_pipeline._dataset()(query) + join_relationship = populated_pipeline.dataset()(query) table = join_relationship.fetchall() assert len(table) == 20 assert list(table[0]) == [0, 0] @@ -363,7 +446,7 @@ def test_sql_queries(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_limit_and_head(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + table_relationship = populated_pipeline.dataset().items assert len(table_relationship.head().fetchall()) == 5 assert len(table_relationship.limit(24).fetchall()) == 24 @@ -384,7 +467,7 @@ def test_limit_and_head(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_column_selection(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset(dataset_type="default").items + table_relationship = populated_pipeline.dataset(dataset_type="default").items columns = ["_dlt_load_id", "other_decimal"] data_frame = table_relationship.select(*columns).head().df() assert [v.lower() for v in data_frame.columns.values] == columns @@ -421,18 +504,18 @@ def test_schema_arg(populated_pipeline: Pipeline) -> None: """Simple test to ensure schemas may be selected via schema arg""" # if there is no arg, the defautl schema is used - dataset = populated_pipeline._dataset() + dataset = populated_pipeline.dataset() assert dataset.schema.name == 
populated_pipeline.default_schema_name assert "items" in dataset.schema.tables # setting a different schema name will try to load that schema, # not find one and create an empty schema with that name - dataset = populated_pipeline._dataset(schema="unknown_schema") + dataset = populated_pipeline.dataset(schema="unknown_schema") assert dataset.schema.name == "unknown_schema" assert "items" not in dataset.schema.tables # providing the schema name of the right schema will load it - dataset = populated_pipeline._dataset(schema=populated_pipeline.default_schema_name) + dataset = populated_pipeline.dataset(schema=populated_pipeline.default_schema_name) assert dataset.schema.name == populated_pipeline.default_schema_name assert "items" in dataset.schema.tables @@ -450,7 +533,7 @@ def test_ibis_expression_relation(populated_pipeline: Pipeline) -> None: import ibis # type: ignore # now we should get the more powerful ibis relation - dataset = populated_pipeline._dataset() + dataset = populated_pipeline.dataset() total_records = _total_records(populated_pipeline) items_table = dataset["items"] @@ -653,11 +736,11 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: # check correct error if not supported if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: with pytest.raises(NotImplementedError): - populated_pipeline._dataset().ibis() + populated_pipeline.dataset().ibis() return total_records = _total_records(populated_pipeline) - ibis_connection = populated_pipeline._dataset().ibis() + ibis_connection = populated_pipeline.dataset().ibis() map_i = lambda x: x if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": @@ -709,7 +792,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: total_records = _total_records(populated_pipeline) # check dataset factory - dataset = dlt._dataset( + dataset = _dataset( destination=populated_pipeline.destination, 
dataset_name=populated_pipeline.dataset_name ) # verfiy that sql client and schema are lazy loaded @@ -722,7 +805,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is loaded by name dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, schema=populated_pipeline.default_schema_name, @@ -733,7 +816,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is not loaded when wrong name given dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, schema="wrong_schema_name", @@ -745,7 +828,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that schema is loaded if no schema name given dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, ), @@ -756,7 +839,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: # check that there is no error when creating dataset without schema table dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name="unknown_dataset", ), @@ -779,7 +862,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: dataset = cast( ReadableDBAPIDataset, - dlt._dataset( + _dataset( destination=populated_pipeline.destination, dataset_name=populated_pipeline.dataset_name, ), diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index fbd4d412b3..51de3e0f76 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -538,5 +538,5 @@ def test_normalize_path_separator_legacy_behavior(test_storage: FileStorage) -> "_dlt_load_id", } # datasets must be the same - data_ = 
pipeline._dataset().issues_2.select("issue_id", "id").fetchall() + data_ = pipeline.dataset().issues_2.select("issue_id", "id").fetchall() print(data_) diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index b32854b110..2d72e23462 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1754,7 +1754,7 @@ def test_column_name_with_break_path() -> None: # get data assert_data_table_counts(pipeline, {"custom__path": 1}) # get data via dataset with dbapi - data_ = pipeline._dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() + data_ = pipeline.dataset().custom__path[["example_custom_field__c", "reg_c"]].fetchall() assert data_ == [("custom", "c")] @@ -1778,7 +1778,7 @@ def test_column_name_with_break_path_legacy() -> None: # get data assert_data_table_counts(pipeline, {"custom_path": 1}) # get data via dataset with dbapi - data_ = pipeline._dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() + data_ = pipeline.dataset().custom_path[["example_custom_field_c", "reg_c"]].fetchall() assert data_ == [("custom", "c")] @@ -1806,7 +1806,7 @@ def flattened_dict(): assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" # make sure data is there - data_ = pipeline._dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() + data_ = pipeline.dataset().flattened__dict[["delta", "value__timestamp"]].limit(1).fetchall() assert data_ == [(0, now)] @@ -1836,7 +1836,7 @@ def flattened_dict(): assert set(table["columns"]) == {"delta", "value__timestamp", "_dlt_id", "_dlt_load_id"} assert table["columns"]["value__timestamp"]["data_type"] == "timestamp" # make sure data is there - data_ = pipeline._dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() + data_ = pipeline.dataset().flattened_dict[["delta", "value__timestamp"]].limit(1).fetchall() assert data_ == [(0, now)] diff --git a/tests/pipeline/test_pipeline_extra.py 
b/tests/pipeline/test_pipeline_extra.py index a51052d247..32b16c234f 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -521,7 +521,7 @@ def test_parquet_with_flattened_columns() -> None: assert "issue__reactions__url" in pipeline.default_schema.tables["events"]["columns"] assert "issue_reactions_url" not in pipeline.default_schema.tables["events"]["columns"] - events_table = pipeline._dataset().events.arrow() + events_table = pipeline.dataset().events.arrow() assert "issue__reactions__url" in events_table.schema.names assert "issue_reactions_url" not in events_table.schema.names @@ -536,7 +536,7 @@ def test_parquet_with_flattened_columns() -> None: info = pipeline.run(events_table, table_name="events", loader_file_format="parquet") assert_load_info(info) - events_table_new = pipeline._dataset().events.arrow() + events_table_new = pipeline.dataset().events.arrow() assert events_table.schema == events_table_new.schema # double row count assert events_table.num_rows * 2 == events_table_new.num_rows