From 6602f70dd63703b9953d43edb9aca1a719a8f7a3 Mon Sep 17 00:00:00 2001 From: rudolfix Date: Tue, 10 Dec 2024 21:41:17 +0100 Subject: [PATCH 1/5] checks notebook presence before finding userdata (#2117) --- dlt/common/configuration/providers/toml.py | 6 ++++++ .../configuration/test_toml_provider.py | 21 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py index 3636565fae..e586fef225 100644 --- a/dlt/common/configuration/providers/toml.py +++ b/dlt/common/configuration/providers/toml.py @@ -124,6 +124,12 @@ def _read_google_colab_secrets(self, name: str, file_name: str) -> tomlkit.TOMLD """Try to load the toml from google colab userdata object""" try: from google.colab import userdata + from dlt.common.runtime.exec_info import is_notebook + + # make sure we work in interactive mode (get_ipython() is available) + # when dlt cli is run, userdata is available but without a kernel + if not is_notebook(): + return None try: return tomlkit.loads(userdata.get(file_name)) diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index 481c21b7bb..9538849976 100644 --- a/tests/common/configuration/test_toml_provider.py +++ b/tests/common/configuration/test_toml_provider.py @@ -4,6 +4,7 @@ import yaml from typing import Any, Dict, Type import datetime # noqa: I251 +from unittest.mock import Mock import dlt from dlt.common import pendulum, json @@ -538,11 +539,28 @@ def loader() -> Dict[str, Any]: def test_colab_toml() -> None: + import builtins + # use a path without any settings files try: sys.path.append("tests/common/cases/modules") - # secrets are in user data + + # ipython not present provider: SettingsTomlProvider = SecretsTomlProvider("tests/common/null", global_dir=None) + assert provider.is_empty + + get_ipython_m = Mock() + get_ipython_m.return_value = "google.colab.Shell" + # make it available 
to all modules + builtins.get_ipython = get_ipython_m # type: ignore[attr-defined] + # test mock + assert get_ipython() == "google.colab.Shell" # type: ignore[name-defined] # noqa + from dlt.common.runtime.exec_info import is_notebook + + assert is_notebook() + + # secrets are in user data + provider = SecretsTomlProvider("tests/common/null", global_dir=None) assert provider.to_toml() == 'api_key="api"' # config is not in userdata provider = ConfigTomlProvider("tests/common/null", "unknown") @@ -551,4 +569,5 @@ def test_colab_toml() -> None: provider = SecretsTomlProvider("tests/common/cases/configuration/.dlt", global_dir=None) assert provider.get_value("secret_value", str, None) == ("2137", "secret_value") finally: + delattr(builtins, "get_ipython") sys.path.pop() From 51b11d24acf579d4f12abc15f2b661778f2995d9 Mon Sep 17 00:00:00 2001 From: Steinthor Palsson Date: Tue, 10 Dec 2024 17:35:22 -0500 Subject: [PATCH 2/5] Add open/closed range arguments for incremental (#1991) * Add open/closed range arguments for incremental * Docs for incremental range args * Docstring * Typo * Ensure deduplication is disabled when range_start=='open' * Cache transformer settings --- dlt/common/incremental/typing.py | 4 + dlt/extract/incremental/__init__.py | 60 +++-- dlt/extract/incremental/transform.py | 75 ++++-- dlt/sources/sql_database/helpers.py | 12 +- .../verified-sources/sql_database/advanced.md | 49 +++- .../docs/general-usage/incremental-loading.md | 5 +- tests/extract/test_incremental.py | 111 +++++++- .../load/sources/sql_database/test_helpers.py | 237 ++++++++++++------ .../sql_database/test_sql_database_source.py | 17 +- 9 files changed, 434 insertions(+), 136 deletions(-) diff --git a/dlt/common/incremental/typing.py b/dlt/common/incremental/typing.py index 460e2f234b..2ca981bff0 100644 --- a/dlt/common/incremental/typing.py +++ b/dlt/common/incremental/typing.py @@ -8,6 +8,8 @@ LastValueFunc = Callable[[Sequence[TCursorValue]], Any] OnCursorValueMissing = 
Literal["raise", "include", "exclude"] +TIncrementalRange = Literal["open", "closed"] + class IncrementalColumnState(TypedDict): initial_value: Optional[Any] @@ -26,3 +28,5 @@ class IncrementalArgs(TypedDict, total=False): allow_external_schedulers: Optional[bool] lag: Optional[Union[float, int]] on_cursor_value_missing: Optional[OnCursorValueMissing] + range_start: Optional[TIncrementalRange] + range_end: Optional[TIncrementalRange] diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 28d33bb71f..5e7bae49c6 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -42,6 +42,7 @@ LastValueFunc, OnCursorValueMissing, IncrementalArgs, + TIncrementalRange, ) from dlt.extract.items import SupportsPipe, TTableHintTemplate, ItemTransform from dlt.extract.incremental.transform import ( @@ -104,6 +105,11 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded on_cursor_value_missing: Specify what happens when the cursor_path does not exist in a record or a record has `None` at the cursor_path: raise, include, exclude lag: Optional value used to define a lag or attribution window. For datetime cursors, this is interpreted as seconds. For other types, it uses the + or - operator depending on the last_value_func. + range_start: Decide whether the incremental filtering range is `open` or `closed` on the start value side. Default is `closed`. + Setting this to `open` means that items with the same cursor value as the last value from the previous run (or `initial_value`) are excluded from the result. + The `open` range disables deduplication logic so it can serve as an optimization when you know cursors don't overlap between pipeline runs. 
+ range_end: Decide whether the incremental filtering range is `open` or `closed` on the end value side. Default is `open` (exact `end_value` is excluded). + Setting this to `closed` means that items with the exact same cursor value as the `end_value` are included in the result. """ # this is config/dataclass so declare members @@ -116,6 +122,8 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa on_cursor_value_missing: OnCursorValueMissing = "raise" lag: Optional[float] = None duplicate_cursor_warning_threshold: ClassVar[int] = 200 + range_start: TIncrementalRange = "closed" + range_end: TIncrementalRange = "open" # incremental acting as empty EMPTY: ClassVar["Incremental[Any]"] = None @@ -132,6 +140,8 @@ def __init__( allow_external_schedulers: bool = False, on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: # make sure that path is valid if cursor_path: @@ -174,9 +184,11 @@ def __init__( self.start_out_of_range: bool = False """Becomes true on the first item that is out of range of `start_value`. I.e. 
when using `max` this is a value that is lower than `start_value`""" - self._transformers: Dict[str, IncrementalTransform] = {} + self._transformers: Dict[Type[IncrementalTransform], IncrementalTransform] = {} self._bound_pipe: SupportsPipe = None """Bound pipe""" + self.range_start = range_start + self.range_end = range_end @property def primary_key(self) -> Optional[TTableHintTemplate[TColumnNames]]: @@ -190,22 +202,6 @@ def primary_key(self, value: str) -> None: for transform in self._transformers.values(): transform.primary_key = value - def _make_transforms(self) -> None: - types = [("arrow", ArrowIncremental), ("json", JsonIncremental)] - for dt, kls in types: - self._transformers[dt] = kls( - self.resource_name, - self.cursor_path, - self.initial_value, - self.start_value, - self.end_value, - self.last_value_func, - self._primary_key, - set(self._cached_state["unique_hashes"]), - self.on_cursor_value_missing, - self.lag, - ) - @classmethod def from_existing_state( cls, resource_name: str, cursor_path: str @@ -489,7 +485,8 @@ def bind(self, pipe: SupportsPipe) -> "Incremental[TCursorValue]": ) # cache state self._cached_state = self.get_state() - self._make_transforms() + # Clear transforms so we get new instances + self._transformers.clear() return self def can_close(self) -> bool: @@ -520,15 +517,34 @@ def __str__(self) -> str: f" {self.last_value_func}" ) + def _make_or_get_transformer(self, cls: Type[IncrementalTransform]) -> IncrementalTransform: + if transformer := self._transformers.get(cls): + return transformer + transformer = self._transformers[cls] = cls( + self.resource_name, + self.cursor_path, + self.initial_value, + self.start_value, + self.end_value, + self.last_value_func, + self._primary_key, + set(self._cached_state["unique_hashes"]), + self.on_cursor_value_missing, + self.lag, + self.range_start, + self.range_end, + ) + return transformer + def _get_transformer(self, items: TDataItems) -> IncrementalTransform: # Assume list is all of the 
same type for item in items if isinstance(items, list) else [items]: if is_arrow_item(item): - return self._transformers["arrow"] + return self._make_or_get_transformer(ArrowIncremental) elif pandas is not None and isinstance(item, pandas.DataFrame): - return self._transformers["arrow"] - return self._transformers["json"] - return self._transformers["json"] + return self._make_or_get_transformer(ArrowIncremental) + return self._make_or_get_transformer(JsonIncremental) + return self._make_or_get_transformer(JsonIncremental) def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: if rows is None: diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 22b1194b51..1d213e26c2 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -13,7 +13,12 @@ IncrementalPrimaryKeyMissing, IncrementalCursorPathHasValueNone, ) -from dlt.common.incremental.typing import TCursorValue, LastValueFunc, OnCursorValueMissing +from dlt.common.incremental.typing import ( + TCursorValue, + LastValueFunc, + OnCursorValueMissing, + TIncrementalRange, +) from dlt.extract.utils import resolve_column_value from dlt.extract.items import TTableHintTemplate @@ -57,6 +62,8 @@ def __init__( unique_hashes: Set[str], on_cursor_value_missing: OnCursorValueMissing = "raise", lag: Optional[float] = None, + range_start: TIncrementalRange = "closed", + range_end: TIncrementalRange = "open", ) -> None: self.resource_name = resource_name self.cursor_path = cursor_path @@ -71,6 +78,9 @@ def __init__( self.start_unique_hashes = set(unique_hashes) self.on_cursor_value_missing = on_cursor_value_missing self.lag = lag + self.range_start = range_start + self.range_end = range_end + # compile jsonpath self._compiled_cursor_path = compile_path(cursor_path) # for simple column name we'll fallback to search in dict @@ -107,6 +117,8 @@ def __call__( def deduplication_disabled(self) -> bool: """Skip deduplication when 
length of the key is 0 or if lag is applied.""" # disable deduplication if end value is set - state is not saved + if self.range_start == "open": + return True if self.end_value is not None: return True # disable deduplication if lag is applied - destination must deduplicate ranges @@ -191,10 +203,10 @@ def __call__( # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value if self.end_value is not None: try: - if ( - last_value_func((row_value, self.end_value)) != self.end_value - or last_value_func((row_value,)) == self.end_value - ): + if last_value_func((row_value, self.end_value)) != self.end_value: + return None, False, True + + if self.range_end == "open" and last_value_func((row_value,)) == self.end_value: return None, False, True except Exception as ex: raise IncrementalCursorInvalidCoercion( @@ -221,6 +233,9 @@ def __call__( ) from ex # new_value is "less" or equal to last_value (the actual max) if last_value == new_value: + if self.range_start == "open": + # We only want greater than last_value + return None, False, False # use func to compute row_value into last_value compatible processed_row_value = last_value_func((row_value,)) # skip the record that is not a start_value or new_value: that record was already processed @@ -258,6 +273,31 @@ def __call__( class ArrowIncremental(IncrementalTransform): _dlt_index = "_dlt_index" + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + if self.last_value_func is max: + self.compute = pa.compute.max + self.end_compare = ( + pa.compute.less if self.range_end == "open" else pa.compute.less_equal + ) + self.last_value_compare = ( + pa.compute.greater_equal if self.range_start == "closed" else pa.compute.greater + ) + self.new_value_compare = pa.compute.greater + elif self.last_value_func is min: + self.compute = pa.compute.min + self.end_compare = ( + pa.compute.greater if self.range_end == "open" else pa.compute.greater_equal + ) + 
self.last_value_compare = ( + pa.compute.less_equal if self.range_start == "closed" else pa.compute.less + ) + self.new_value_compare = pa.compute.less + else: + raise NotImplementedError( + "Only min or max last_value_func is supported for arrow tables" + ) + def compute_unique_values(self, item: "TAnyArrowItem", unique_columns: List[str]) -> List[str]: if not unique_columns: return [] @@ -312,28 +352,13 @@ def __call__( if not tbl: # row is None or empty arrow table return tbl, start_out_of_range, end_out_of_range - if self.last_value_func is max: - compute = pa.compute.max - end_compare = pa.compute.less - last_value_compare = pa.compute.greater_equal - new_value_compare = pa.compute.greater - elif self.last_value_func is min: - compute = pa.compute.min - end_compare = pa.compute.greater - last_value_compare = pa.compute.less_equal - new_value_compare = pa.compute.less - else: - raise NotImplementedError( - "Only min or max last_value_func is supported for arrow tables" - ) - # TODO: Json path support. For now assume the cursor_path is a column name cursor_path = self.cursor_path # The new max/min value try: # NOTE: datetimes are always pendulum in UTC - row_value = from_arrow_scalar(compute(tbl[cursor_path])) + row_value = from_arrow_scalar(self.compute(tbl[cursor_path])) cursor_data_type = tbl.schema.field(cursor_path).type row_value_scalar = to_arrow_scalar(row_value, cursor_data_type) except KeyError as e: @@ -364,10 +389,10 @@ def __call__( cursor_data_type, str(ex), ) from ex - tbl = tbl.filter(end_compare(tbl[cursor_path], end_value_scalar)) + tbl = tbl.filter(self.end_compare(tbl[cursor_path], end_value_scalar)) # Is max row value higher than end value? # NOTE: pyarrow bool *always* evaluates to python True. 
`as_py()` is necessary - end_out_of_range = not end_compare(row_value_scalar, end_value_scalar).as_py() + end_out_of_range = not self.end_compare(row_value_scalar, end_value_scalar).as_py() if self.start_value is not None: try: @@ -383,7 +408,7 @@ def __call__( str(ex), ) from ex # Remove rows lower or equal than the last start value - keep_filter = last_value_compare(tbl[cursor_path], start_value_scalar) + keep_filter = self.last_value_compare(tbl[cursor_path], start_value_scalar) start_out_of_range = bool(pa.compute.any(pa.compute.invert(keep_filter)).as_py()) tbl = tbl.filter(keep_filter) if not self.deduplication_disabled: @@ -407,7 +432,7 @@ def __call__( if ( self.last_value is None - or new_value_compare( + or self.new_value_compare( row_value_scalar, to_arrow_scalar(self.last_value, cursor_data_type) ).as_py() ): # Last value has changed diff --git a/dlt/sources/sql_database/helpers.py b/dlt/sources/sql_database/helpers.py index a8be2a6427..ee38c7dd98 100644 --- a/dlt/sources/sql_database/helpers.py +++ b/dlt/sources/sql_database/helpers.py @@ -94,12 +94,16 @@ def __init__( self.end_value = incremental.end_value self.row_order: TSortOrder = self.incremental.row_order self.on_cursor_value_missing = self.incremental.on_cursor_value_missing + self.range_start = self.incremental.range_start + self.range_end = self.incremental.range_end else: self.cursor_column = None self.last_value = None self.end_value = None self.row_order = None self.on_cursor_value_missing = None + self.range_start = None + self.range_end = None def _make_query(self) -> SelectAny: table = self.table @@ -110,11 +114,11 @@ def _make_query(self) -> SelectAny: # generate where if last_value_func is max: # Query ordered and filtered according to last_value function - filter_op = operator.ge - filter_op_end = operator.lt + filter_op = operator.ge if self.range_start == "closed" else operator.gt + filter_op_end = operator.lt if self.range_end == "open" else operator.le elif last_value_func is 
min: - filter_op = operator.le - filter_op_end = operator.gt + filter_op = operator.le if self.range_start == "closed" else operator.lt + filter_op_end = operator.gt if self.range_end == "open" else operator.ge else: # Custom last_value, load everything and let incremental handle filtering return query # type: ignore[no-any-return] diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md index 6ff3a267d2..c532f6d357 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/sql_database/advanced.md @@ -16,7 +16,7 @@ Efficient data management often requires loading only new or updated data from y Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing ID) to load only data newer than a specified initial value, enhancing efficiency by reducing processing time and resource use. Read [here](../../../walkthroughs/sql-incremental-configuration) for more details on incremental loading with `dlt`. -#### How to configure +### How to configure 1. **Choose a cursor column**: Identify a column in your SQL table that can serve as a reliable indicator of new or updated rows. Common choices include timestamp columns or auto-incrementing IDs. 1. **Set an initial value**: Choose a starting value for the cursor to begin loading data. This could be a specific timestamp or ID from which you wish to start loading data. 1. **Deduplication**: When using incremental loading, the system automatically handles the deduplication of rows based on the primary key (if available) or row hash for tables without a primary key. @@ -27,7 +27,7 @@ Incremental loading uses a cursor column (e.g., timestamp or auto-incrementing I If your cursor column name contains special characters (e.g., `$`) you need to escape it when passing it to the `incremental` function. 
For example, if your cursor column is `example_$column`, you should pass it as `"'example_$column'"` or `'"example_$column"'` to the `incremental` function: `incremental("'example_$column'", initial_value=...)`. ::: -#### Examples +### Examples 1. **Incremental loading with the resource `sql_table`**. @@ -52,7 +52,7 @@ If your cursor column name contains special characters (e.g., `$`) you need to e print(extract_info) ``` - Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater than the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). + Behind the scene, the loader generates a SQL query filtering rows with `last_modified` values greater or equal to the incremental value. In the first run, this is the initial value (midnight (00:00:00) January 1, 2024). In subsequent runs, it is the latest value of `last_modified` that `dlt` stores in [state](../../../general-usage/state). 2. **Incremental loading with the source `sql_database`**. @@ -78,6 +78,49 @@ If your cursor column name contains special characters (e.g., `$`) you need to e * `apply_hints` is a powerful method that enables schema modifications after resource creation, like adjusting write disposition and primary keys. You can choose from various tables and use `apply_hints` multiple times to create pipelines with merged, appended, or replaced resources. ::: +### Inclusive and exclusive filtering + +By default the incremental filtering is inclusive on the start value side so that +rows with cursor equal to the last run's cursor are fetched again from the database. + +The SQL query generated looks something like this (assuming `last_value_func` is `max`): + +```sql +SELECT * FROM family +WHERE last_modified >= :start_value +ORDER BY last_modified ASC +``` + +That means some rows overlapping with the previous load are fetched from the database. 
+Duplicates are then filtered out by dlt using either the primary key or a hash of the row's contents. + +This ensures there are no gaps in the extracted sequence. But it does come with some performance overhead, +both due to the deduplication processing and the cost of fetching redundant records from the database. + +This is not always needed. If you know that your data does not contain overlapping cursor values then you +can optimize extraction by passing `range_start="open"` to incremental. + +This both disables the deduplication process and changes the operator used in the SQL `WHERE` clause from `>=` (greater-or-equal) to `>` (greater than), so that no overlapping rows are fetched. + +E.g. + +```py +table = sql_table( + table='family', + incremental=dlt.sources.incremental( + 'last_modified', # Cursor column name + initial_value=pendulum.DateTime(2024, 1, 1, 0, 0, 0), # Initial cursor value + range_start="open", # exclude the start value + ) +) +``` + +It's a good option if: + +* The cursor is an auto incrementing ID +* The cursor is a high precision timestamp and two records are never created at exactly the same time +* Your pipeline runs are timed in such a way that new data is not generated during the load + ## Parallelized extraction You can extract each table in a separate thread (no multiprocessing at this point). This will decrease loading time if your queries take time to execute or your network latency/speed is low. To enable this, declare your sources/resources as follows: diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 3f452f0d16..5008795ed4 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -693,7 +693,7 @@ august_issues = repo_issues( ... ``` -Note that dlt's incremental filtering considers the ranges half-closed. 
`initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. +Note that dlt's incremental filtering considers the ranges half-closed. `initial_value` is inclusive, `end_value` is exclusive, so chaining ranges like above works without overlaps. This behaviour can be changed with the `range_start` (default `"closed"`) and `range_end` (default `"open"`) arguments. ### Declare row order to not request unnecessary data @@ -793,6 +793,9 @@ def some_data(last_timestamp=dlt.sources.incremental("item.ts", primary_key=())) yield {"delta": i, "item": {"ts": pendulum.now().timestamp()}} ``` +This deduplication process is always enabled when `range_start` is set to `"closed"` (default). +When you pass `range_start="open"` no deduplication is done as it is not needed as rows with the previous cursor value are excluded. This can be a useful optimization to avoid the performance overhead of deduplication if the cursor field is guaranteed to be unique. + ### Using `dlt.sources.incremental` with dynamically created resources When resources are [created dynamically](source.md#create-resources-dynamically), it is possible to use the `dlt.sources.incremental` definition as well. 
diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 725872b621..3ebc9d1201 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -5,7 +5,7 @@ from datetime import datetime, date # noqa: I251 from itertools import chain, count from time import sleep -from typing import Any, Optional, Literal, Sequence, Dict +from typing import Any, Optional, Literal, Sequence, Dict, Iterable from unittest import mock import duckdb @@ -1522,6 +1522,7 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): @pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) def test_apply_hints_incremental(item_type: TestDataItemFormat) -> None: + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p = dlt.pipeline(pipeline_name=uniq_id(), destination="dummy") data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] source_items = data_to_item_format(item_type, data) @@ -3851,3 +3852,111 @@ def some_data(): for col in table_schema["columns"].values(): assert "incremental" not in col + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_start_range_open(item_type: TestDataItemFormat, last_value_func: Any) -> None: + data_range: Iterable[int] = range(1, 12) + if last_value_func == max: + initial_value = 5 + # Only items higher than inital extracted + expected_items = list(range(6, 12)) + order_dir = "ASC" + elif last_value_func == min: + data_range = reversed(data_range) # type: ignore[call-overload] + initial_value = 5 + # Only items lower than inital extracted + expected_items = list(reversed(range(1, 5))) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=initial_value, + range_start="open", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in 
data_range] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + assert items == expected_items + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +def test_start_range_open_no_deduplication(item_type: TestDataItemFormat) -> None: + @dlt.source + def dummy(): + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + range_start="open", + ) + ): + yield [{"updated_at": i} for i in range(3)] + + yield some_data + + pipeline = dlt.pipeline(pipeline_name=uniq_id()) + pipeline.extract(dummy()) + + state = pipeline.state["sources"]["dummy"]["resources"]["some_data"]["incremental"][ + "updated_at" + ] + + # No unique values should be computed + assert state["unique_hashes"] == [] + + +@pytest.mark.parametrize("item_type", ALL_TEST_DATA_ITEM_FORMATS) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_end_range_closed(item_type: TestDataItemFormat, last_value_func: Any) -> None: + values = [5, 10] + expected_items = list(range(5, 11)) + if last_value_func == max: + order_dir = "ASC" + elif last_value_func == min: + values = list(reversed(values)) + expected_items = list(reversed(expected_items)) + order_dir = "DESC" + + @dlt.resource + def some_data( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", + initial_value=values[0], + end_value=values[1], + range_end="closed", + last_value_func=last_value_func, + ), + ) -> Any: + data = [{"updated_at": i} for i in range(1, 12)] + yield data_to_item_format(item_type, data) + + pipeline = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") + pipeline.run(some_data()) + + with pipeline.sql_client() as client: + items = [ + 
row[0] + for row in client.execute_sql( + f"SELECT updated_at FROM some_data ORDER BY updated_at {order_dir}" + ) + ] + + # Includes values 5-10 inclusive + assert items == expected_items diff --git a/tests/load/sources/sql_database/test_helpers.py b/tests/load/sources/sql_database/test_helpers.py index def5430146..43da9c955f 100644 --- a/tests/load/sources/sql_database/test_helpers.py +++ b/tests/load/sources/sql_database/test_helpers.py @@ -1,3 +1,6 @@ +from typing import Callable, Any, TYPE_CHECKING +from dataclasses import dataclass + import pytest import dlt @@ -14,6 +17,18 @@ pytest.skip("Tests require sql alchemy", allow_module_level=True) +@dataclass +class MockIncremental: + last_value: Any + last_value_func: Callable[[Any], Any] + cursor_path: str + row_order: str = None + end_value: Any = None + on_cursor_value_missing: str = "raise" + range_start: str = "closed" + range_end: str = "open" + + @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) def test_cursor_or_unique_column_not_in_table( sql_source_db: SQLAlchemySourceDB, backend: TableBackend @@ -36,13 +51,12 @@ def test_make_query_incremental_max( ) -> None: """Verify query is generated according to incremental settings""" - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -50,14 +64,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) - .where(table.c.created_at >= MockIncremental.last_value) 
+ .where(table.c.created_at >= incremental.last_value) ) assert query.compare(expected) @@ -67,13 +81,14 @@ class MockIncremental: def test_make_query_incremental_min( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = min - cursor_path = "created_at" - row_order = "desc" - end_value = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=min, + cursor_path="created_at", + row_order="desc", + end_value=None, + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -81,14 +96,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = ( table.select() .order_by(table.c.created_at.asc()) # `min` func swaps order - .where(table.c.created_at <= MockIncremental.last_value) + .where(table.c.created_at <= incremental.last_value) ) assert query.compare(expected) @@ -103,13 +118,14 @@ def test_make_query_incremental_on_cursor_value_missing_set( with_end_value: bool, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None if not with_end_value else dlt.common.pendulum.now().add(hours=1) - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None if not with_end_value else dlt.common.pendulum.now().add(hours=1), + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -117,7 +133,7 @@ class MockIncremental: backend, table, 
table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -131,14 +147,14 @@ class MockIncremental: if with_end_value: where_clause = operator( sa.and_( - table.c.created_at >= MockIncremental.last_value, - table.c.created_at < MockIncremental.end_value, + table.c.created_at >= incremental.last_value, + table.c.created_at < incremental.end_value, ), missing_cond, ) else: where_clause = operator( - table.c.created_at >= MockIncremental.last_value, + table.c.created_at >= incremental.last_value, missing_cond, ) expected = table.select().order_by(table.c.created_at.asc()).where(where_clause) @@ -152,13 +168,14 @@ def test_make_query_incremental_on_cursor_value_missing_no_last_value( backend: TableBackend, cursor_value_missing: str, ) -> None: - class MockIncremental: - last_value = None - last_value_func = max - cursor_path = "created_at" - row_order = "asc" - end_value = None - on_cursor_value_missing = cursor_value_missing + incremental = MockIncremental( + last_value=None, + last_value_func=max, + cursor_path="created_at", + row_order="asc", + end_value=None, + on_cursor_value_missing=cursor_value_missing, + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -166,7 +183,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -189,13 +206,14 @@ def test_make_query_incremental_end_value( ) -> None: now = dlt.common.pendulum.now() - class MockIncremental: - last_value = now - last_value_func = min - cursor_path = "created_at" - end_value = now.add(hours=1) - row_order = None - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=now, + last_value_func=min, + cursor_path="created_at", + end_value=now.add(hours=1), + row_order=None, + 
on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -203,14 +221,14 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() expected = table.select().where( sa.and_( - table.c.created_at <= MockIncremental.last_value, - table.c.created_at > MockIncremental.end_value, + table.c.created_at <= incremental.last_value, + table.c.created_at > incremental.end_value, ) ) @@ -221,13 +239,14 @@ class MockIncremental: def test_make_query_incremental_any_fun( sql_source_db: SQLAlchemySourceDB, backend: TableBackend ) -> None: - class MockIncremental: - last_value = dlt.common.pendulum.now() - last_value_func = lambda x: x[-1] - cursor_path = "created_at" - row_order = "asc" - end_value = dlt.common.pendulum.now() - on_cursor_value_missing = "raise" + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=lambda x: x[-1], + cursor_path="created_at", + row_order="asc", + end_value=dlt.common.pendulum.now(), + on_cursor_value_missing="raise", + ) table = sql_source_db.get_table("chat_message") loader = TableLoader( @@ -235,7 +254,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) query = loader.make_query() @@ -256,12 +275,11 @@ def test_cursor_path_field_name_with_a_special_chars( if special_field_name not in table.c: table.append_column(sa.Column(special_field_name, sa.String)) - class MockIncremental: - cursor_path = "'id$field'" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="'id$field'", + last_value=None, + last_value_func=max, + ) # Should not raise any exception loader = TableLoader( @@ -269,7 +287,7 @@ class 
MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert loader.cursor_column == table.c[special_field_name] @@ -281,12 +299,11 @@ def test_cursor_path_multiple_fields( """Test that a cursor_path with multiple fields raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "created_at,updated_at" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="created_at,updated_at", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -294,7 +311,7 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) @@ -306,12 +323,11 @@ def test_cursor_path_complex_expression( """Test that a complex JSONPath expression in cursor_path raises a ValueError.""" table = sql_source_db.get_table("chat_message") - class MockIncremental: - cursor_path = "$.users[0].id" - last_value = None - end_value = None - row_order = None - on_cursor_value_missing = None + incremental = MockIncremental( + cursor_path="$.users[0].id", + last_value=None, + last_value_func=max, + ) with pytest.raises(ValueError) as excinfo: TableLoader( @@ -319,11 +335,80 @@ class MockIncremental: backend, table, table_to_columns(table), - incremental=MockIncremental(), # type: ignore[arg-type] + incremental=incremental, # type: ignore[arg-type] ) assert "must be a simple column name" in str(excinfo.value) +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_start_open( + sql_source_db: SQLAlchemySourceDB, 
backend: TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_start="open", + ) + + table = sql_source_db.get_table("chat_message") + + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at < incremental.last_value) + else: + expected = expected.where(table.c.created_at > incremental.last_value) + + assert query.compare(expected) + + +@pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) +@pytest.mark.parametrize("last_value_func", [min, max]) +def test_make_query_incremental_range_end_closed( + sql_source_db: SQLAlchemySourceDB, backend: TableBackend, last_value_func: Callable[[Any], Any] +) -> None: + incremental = MockIncremental( + last_value=dlt.common.pendulum.now(), + last_value_func=last_value_func, + cursor_path="created_at", + end_value=None, + on_cursor_value_missing="raise", + range_end="closed", + ) + + table = sql_source_db.get_table("chat_message") + loader = TableLoader( + sql_source_db.engine, + backend, + table, + table_to_columns(table), + incremental=incremental, # type: ignore[arg-type] + ) + + query = loader.make_query() + expected = table.select() + + if last_value_func == min: + expected = expected.where(table.c.created_at <= incremental.last_value) + else: + expected = expected.where(table.c.created_at >= incremental.last_value) + + assert query.compare(expected) + + def mock_json_column(field: str) -> TDataItem: """""" import pyarrow as pa diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py 
index 9079638586..00257471e0 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ b/tests/load/sources/sql_database/test_sql_database_source.py @@ -13,6 +13,7 @@ from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceExtractionError +from dlt.extract.incremental.transform import JsonIncremental, ArrowIncremental from dlt.sources import DltResource from tests.pipeline.utils import ( @@ -831,8 +832,12 @@ def _assert_incremental(item): else: assert _r.incremental.primary_key == ["id"] assert _r.incremental._incremental.primary_key == ["id"] - assert _r.incremental._incremental._transformers["json"].primary_key == ["id"] - assert _r.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert _r.incremental._incremental._make_or_get_transformer( + ArrowIncremental + ).primary_key == ["id"] return item pipeline = make_pipeline("duckdb") @@ -841,8 +846,12 @@ def _assert_incremental(item): assert resource.incremental.primary_key == ["id"] assert resource.incremental._incremental.primary_key == ["id"] - assert resource.incremental._incremental._transformers["json"].primary_key == ["id"] - assert resource.incremental._incremental._transformers["arrow"].primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + JsonIncremental + ).primary_key == ["id"] + assert resource.incremental._incremental._make_or_get_transformer( + ArrowIncremental + ).primary_key == ["id"] @pytest.mark.parametrize("backend", ["sqlalchemy", "pyarrow", "pandas", "connectorx"]) From 80ef80401b97646901b48e15dade262ef5c3fd52 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 10 Dec 2024 23:44:01 +0100 Subject: [PATCH 3/5] bump semver to minimum version 3.0.0 (#2132) --- poetry.lock | 104 ++++++++++++++++++++++++++++++++++++++++++++++++- pyproject.toml | 2 +- 2 files changed, 103 insertions(+), 3 
deletions(-) diff --git a/poetry.lock b/poetry.lock index 732ba0e219..6232b383c8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "about-time" @@ -3900,6 +3900,106 @@ files = [ {file = "google_re2-1.1-4-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f4d4f0823e8b2f6952a145295b1ff25245ce9bb136aff6fe86452e507d4c1dd"}, {file = "google_re2-1.1-4-cp39-cp39-win32.whl", hash = "sha256:1afae56b2a07bb48cfcfefaa15ed85bae26a68f5dc7f9e128e6e6ea36914e847"}, {file = "google_re2-1.1-4-cp39-cp39-win_amd64.whl", hash = "sha256:aa7d6d05911ab9c8adbf3c225a7a120ab50fd2784ac48f2f0d140c0b7afc2b55"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:222fc2ee0e40522de0b21ad3bc90ab8983be3bf3cec3d349c80d76c8bb1a4beb"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:d4763b0b9195b72132a4e7de8e5a9bf1f05542f442a9115aa27cfc2a8004f581"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:209649da10c9d4a93d8a4d100ecbf9cc3b0252169426bec3e8b4ad7e57d600cf"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:68813aa333c1604a2df4a495b2a6ed065d7c8aebf26cc7e7abb5a6835d08353c"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:370a23ec775ad14e9d1e71474d56f381224dcf3e72b15d8ca7b4ad7dd9cd5853"}, + {file = "google_re2-1.1-5-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:14664a66a3ddf6bc9e56f401bf029db2d169982c53eff3f5876399104df0e9a6"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3ea3722cc4932cbcebd553b69dce1b4a73572823cff4e6a244f1c855da21d511"}, + {file = "google_re2-1.1-5-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:e14bb264c40fd7c627ef5678e295370cd6ba95ca71d835798b6e37502fc4c690"}, + {file = "google_re2-1.1-5-cp310-cp310-win32.whl", hash = "sha256:39512cd0151ea4b3969c992579c79b423018b464624ae955be685fc07d94556c"}, + {file = "google_re2-1.1-5-cp310-cp310-win_amd64.whl", hash = "sha256:ac66537aa3bc5504320d922b73156909e3c2b6da19739c866502f7827b3f9fdf"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:5b5ea68d54890c9edb1b930dcb2658819354e5d3f2201f811798bbc0a142c2b4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:33443511b6b83c35242370908efe2e8e1e7cae749c766b2b247bf30e8616066c"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:413d77bdd5ba0bfcada428b4c146e87707452ec50a4091ec8e8ba1413d7e0619"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:5171686e43304996a34baa2abcee6f28b169806d0e583c16d55e5656b092a414"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b284db130283771558e31a02d8eb8fb756156ab98ce80035ae2e9e3a5f307c4"}, + {file = "google_re2-1.1-5-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:296e6aed0b169648dc4b870ff47bd34c702a32600adb9926154569ef51033f47"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38d50e68ead374160b1e656bbb5d101f0b95fb4cc57f4a5c12100155001480c5"}, + {file = "google_re2-1.1-5-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2a0416a35921e5041758948bcb882456916f22845f66a93bc25070ef7262b72a"}, + {file = "google_re2-1.1-5-cp311-cp311-win32.whl", hash = "sha256:a1d59568bbb5de5dd56dd6cdc79907db26cce63eb4429260300c65f43469e3e7"}, + {file = "google_re2-1.1-5-cp311-cp311-win_amd64.whl", hash = "sha256:72f5a2f179648b8358737b2b493549370debd7d389884a54d331619b285514e3"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_arm64.whl", hash = 
"sha256:cbc72c45937b1dc5acac3560eb1720007dccca7c9879138ff874c7f6baf96005"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5fadd1417fbef7235fa9453dba4eb102e6e7d94b1e4c99d5fa3dd4e288d0d2ae"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:040f85c63cc02696485b59b187a5ef044abe2f99b92b4fb399de40b7d2904ccc"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:64e3b975ee6d9bbb2420494e41f929c1a0de4bcc16d86619ab7a87f6ea80d6bd"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8ee370413e00f4d828eaed0e83b8af84d7a72e8ee4f4bd5d3078bc741dfc430a"}, + {file = "google_re2-1.1-5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:5b89383001079323f693ba592d7aad789d7a02e75adb5d3368d92b300f5963fd"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:63cb4fdfbbda16ae31b41a6388ea621510db82feb8217a74bf36552ecfcd50ad"}, + {file = "google_re2-1.1-5-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ebedd84ae8be10b7a71a16162376fd67a2386fe6361ef88c622dcf7fd679daf"}, + {file = "google_re2-1.1-5-cp312-cp312-win32.whl", hash = "sha256:c8e22d1692bc2c81173330c721aff53e47ffd3c4403ff0cd9d91adfd255dd150"}, + {file = "google_re2-1.1-5-cp312-cp312-win_amd64.whl", hash = "sha256:5197a6af438bb8c4abda0bbe9c4fbd6c27c159855b211098b29d51b73e4cbcf6"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b6727e0b98417e114b92688ad2aa256102ece51f29b743db3d831df53faf1ce3"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:711e2b6417eb579c61a4951029d844f6b95b9b373b213232efd413659889a363"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:71ae8b3df22c5c154c8af0f0e99d234a450ef1644393bc2d7f53fc8c0a1e111c"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_13_0_x86_64.whl", hash = 
"sha256:94a04e214bc521a3807c217d50cf099bbdd0c0a80d2d996c0741dbb995b5f49f"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:a770f75358508a9110c81a1257721f70c15d9bb592a2fb5c25ecbd13566e52a5"}, + {file = "google_re2-1.1-5-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:07c9133357f7e0b17c6694d5dcb82e0371f695d7c25faef2ff8117ef375343ff"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:204ca6b1cf2021548f4a9c29ac015e0a4ab0a7b6582bf2183d838132b60c8fda"}, + {file = "google_re2-1.1-5-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f0b95857c2c654f419ca684ec38c9c3325c24e6ba7d11910a5110775a557bb18"}, + {file = "google_re2-1.1-5-cp38-cp38-win32.whl", hash = "sha256:347ac770e091a0364e822220f8d26ab53e6fdcdeaec635052000845c5a3fb869"}, + {file = "google_re2-1.1-5-cp38-cp38-win_amd64.whl", hash = "sha256:ec32bb6de7ffb112a07d210cf9f797b7600645c2d5910703fa07f456dd2150e0"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:eb5adf89060f81c5ff26c28e261e6b4997530a923a6093c9726b8dec02a9a326"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a22630c9dd9ceb41ca4316bccba2643a8b1d5c198f21c00ed5b50a94313aaf10"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:544dc17fcc2d43ec05f317366375796351dec44058e1164e03c3f7d050284d58"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:19710af5ea88751c7768575b23765ce0dfef7324d2539de576f75cdc319d6654"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:f82995a205e08ad896f4bd5ce4847c834fab877e1772a44e5f262a647d8a1dec"}, + {file = "google_re2-1.1-5-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:63533c4d58da9dc4bc040250f1f52b089911699f0368e0e6e15f996387a984ed"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:79e00fcf0cb04ea35a22b9014712d448725ce4ddc9f08cc818322566176ca4b0"}, + {file = "google_re2-1.1-5-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bc41afcefee2da6c4ed883a93d7f527c4b960cd1d26bbb0020a7b8c2d341a60a"}, + {file = "google_re2-1.1-5-cp39-cp39-win32.whl", hash = "sha256:486730b5e1f1c31b0abc6d80abe174ce4f1188fe17d1b50698f2bf79dc6e44be"}, + {file = "google_re2-1.1-5-cp39-cp39-win_amd64.whl", hash = "sha256:4de637ca328f1d23209e80967d1b987d6b352cd01b3a52a84b4d742c69c3da6c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:621e9c199d1ff0fdb2a068ad450111a84b3bf14f96dfe5a8a7a0deae5f3f4cce"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:220acd31e7dde95373f97c3d1f3b3bd2532b38936af28b1917ee265d25bebbf4"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:db34e1098d164f76251a6ece30e8f0ddfd65bb658619f48613ce71acb3f9cbdb"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:5152bac41d8073977582f06257219541d0fc46ad99b0bbf30e8f60198a43b08c"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:6191294799e373ee1735af91f55abd23b786bdfd270768a690d9d55af9ea1b0d"}, + {file = "google_re2-1.1-6-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:070cbafbb4fecbb02e98feb28a1eb292fb880f434d531f38cc33ee314b521f1f"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8437d078b405a59a576cbed544490fe041140f64411f2d91012e8ec05ab8bf86"}, + {file = "google_re2-1.1-6-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f00f9a9af8896040e37896d9b9fc409ad4979f1ddd85bb188694a7d95ddd1164"}, + {file = "google_re2-1.1-6-cp310-cp310-win32.whl", hash = "sha256:df26345f229a898b4fd3cafd5f82259869388cee6268fc35af16a8e2293dd4e5"}, + {file = "google_re2-1.1-6-cp310-cp310-win_amd64.whl", hash = 
"sha256:3665d08262c57c9b28a5bdeb88632ad792c4e5f417e5645901695ab2624f5059"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:b26b869d8aa1d8fe67c42836bf3416bb72f444528ee2431cfb59c0d3e02c6ce3"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:41fd4486c57dea4f222a6bb7f1ff79accf76676a73bdb8da0fcbd5ba73f8da71"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:0ee378e2e74e25960070c338c28192377c4dd41e7f4608f2688064bd2badc41e"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:a00cdbf662693367b36d075b29feb649fd7ee1b617cf84f85f2deebeda25fc64"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c09455014217a41499432b8c8f792f25f3df0ea2982203c3a8c8ca0e7895e69"}, + {file = "google_re2-1.1-6-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6501717909185327935c7945e23bb5aa8fc7b6f237b45fe3647fa36148662158"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3510b04790355f199e7861c29234081900e1e1cbf2d1484da48aa0ba6d7356ab"}, + {file = "google_re2-1.1-6-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c0e64c187ca406764f9e9ad6e750d62e69ed8f75bf2e865d0bfbc03b642361c"}, + {file = "google_re2-1.1-6-cp311-cp311-win32.whl", hash = "sha256:2a199132350542b0de0f31acbb3ca87c3a90895d1d6e5235f7792bb0af02e523"}, + {file = "google_re2-1.1-6-cp311-cp311-win_amd64.whl", hash = "sha256:83bdac8ceaece8a6db082ea3a8ba6a99a2a1ee7e9f01a9d6d50f79c6f251a01d"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:81985ff894cd45ab5a73025922ac28c0707759db8171dd2f2cc7a0e856b6b5ad"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:5635af26065e6b45456ccbea08674ae2ab62494008d9202df628df3b267bc095"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_arm64.whl", hash = 
"sha256:813b6f04de79f4a8fdfe05e2cb33e0ccb40fe75d30ba441d519168f9d958bd54"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:5ec2f5332ad4fd232c3f2d6748c2c7845ccb66156a87df73abcc07f895d62ead"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:5a687b3b32a6cbb731647393b7c4e3fde244aa557f647df124ff83fb9b93e170"}, + {file = "google_re2-1.1-6-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:39a62f9b3db5d3021a09a47f5b91708b64a0580193e5352751eb0c689e4ad3d7"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ca0f0b45d4a1709cbf5d21f355e5809ac238f1ee594625a1e5ffa9ff7a09eb2b"}, + {file = "google_re2-1.1-6-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a64b3796a7a616c7861247bd061c9a836b5caf0d5963e5ea8022125601cf7b09"}, + {file = "google_re2-1.1-6-cp312-cp312-win32.whl", hash = "sha256:32783b9cb88469ba4cd9472d459fe4865280a6b1acdad4480a7b5081144c4eb7"}, + {file = "google_re2-1.1-6-cp312-cp312-win_amd64.whl", hash = "sha256:259ff3fd2d39035b9cbcbf375995f83fa5d9e6a0c5b94406ff1cc168ed41d6c6"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:e4711bcffe190acd29104d8ecfea0c0e42b754837de3fb8aad96e6cc3c613cdc"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:4d081cce43f39c2e813fe5990e1e378cbdb579d3f66ded5bade96130269ffd75"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:4f123b54d48450d2d6b14d8fad38e930fb65b5b84f1b022c10f2913bd956f5b5"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:e1928b304a2b591a28eb3175f9db7f17c40c12cf2d4ec2a85fdf1cc9c073ff91"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_arm64.whl", hash = "sha256:3a69f76146166aec1173003c1f547931bdf288c6b135fda0020468492ac4149f"}, + {file = "google_re2-1.1-6-cp38-cp38-macosx_14_0_x86_64.whl", hash = 
"sha256:fc08c388f4ebbbca345e84a0c56362180d33d11cbe9ccfae663e4db88e13751e"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b057adf38ce4e616486922f2f47fc7d19c827ba0a7f69d540a3664eba2269325"}, + {file = "google_re2-1.1-6-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4138c0b933ab099e96f5d8defce4486f7dfd480ecaf7f221f2409f28022ccbc5"}, + {file = "google_re2-1.1-6-cp38-cp38-win32.whl", hash = "sha256:9693e45b37b504634b1abbf1ee979471ac6a70a0035954592af616306ab05dd6"}, + {file = "google_re2-1.1-6-cp38-cp38-win_amd64.whl", hash = "sha256:5674d437baba0ea287a5a7f8f81f24265d6ae8f8c09384e2ef7b6f84b40a7826"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7783137cb2e04f458a530c6d0ee9ef114815c1d48b9102f023998c371a3b060e"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:a49b7153935e7a303675f4deb5f5d02ab1305adefc436071348706d147c889e0"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:a96a8bb309182090704593c60bdb369a2756b38fe358bbf0d40ddeb99c71769f"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:dff3d4be9f27ef8ec3705eed54f19ef4ab096f5876c15fe011628c69ba3b561c"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:40f818b0b39e26811fa677978112a8108269977fdab2ba0453ac4363c35d9e66"}, + {file = "google_re2-1.1-6-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:8a7e53538cdb40ef4296017acfbb05cab0c19998be7552db1cfb85ba40b171b9"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6ee18e7569fb714e5bb8c42809bf8160738637a5e71ed5a4797757a1fb4dc4de"}, + {file = "google_re2-1.1-6-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cda4f6d1a7d5b43ea92bc395f23853fba0caf8b1e1efa6e8c48685f912fcb89"}, + {file = "google_re2-1.1-6-cp39-cp39-win32.whl", hash = 
"sha256:6a9cdbdc36a2bf24f897be6a6c85125876dc26fea9eb4247234aec0decbdccfd"}, + {file = "google_re2-1.1-6-cp39-cp39-win_amd64.whl", hash = "sha256:73f646cecfad7cc5b4330b4192c25f2e29730a3b8408e089ffd2078094208196"}, ] [[package]] @@ -10518,4 +10618,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "c0607d05ab37a1a6addf3ae7264bf5972cb6ce6e46df1dcdc2da3cff72e5008e" +content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" diff --git a/pyproject.toml b/pyproject.toml index 7377b03fde..f736fc65ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ requests = ">=2.26.0" pendulum = ">=2.1.2" simplejson = ">=3.17.5" PyYAML = ">=5.4.1" -semver = ">=2.13.0" +semver = ">=3.0.0" hexbytes = ">=0.2.2" tzdata = ">=2022.1" tomlkit = ">=0.11.3" From 77d8ab6ee23518213fe9da60c4275784450f98fa Mon Sep 17 00:00:00 2001 From: David Scharf Date: Wed, 11 Dec 2024 00:43:32 +0100 Subject: [PATCH 4/5] leverage ibis expression for getting readablerelations (#2046) * add ibis dataset in own class for now * make error clearer * fix some linting and fix broken test * make most destinations work with selecting the right db and catalog, transpiling sql via postgres in some cases and selecting the right dialect in others * add missing motherduck and sqlalchemy mappings * casefold identifiers for ibis wrapper calss * re-organize existing dataset code to prepare ibis relation integration * integrate ibis relation into existing code * re-order tests * fall back to default dataset if table not in schema * make dataset type selectable * add dataset type selection test and fix bug in tests * update docs for ibis expressions use * ensure a bunch of ibis operations continue working * add some more tests and typings * fix typing (with brute force get_attr typing..) 
* move ibis to dependency group * move ibis stuff to helpers * post devel merge, put in change from dataset, update lockfile * add ibis to sqlalchemy tests * improve docs a bit * fix ibis dep group * fix dataset snippets * fix ibis version * add support for column schema in certain query cases --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destination_athena.yml | 2 +- .../test_destination_athena_iceberg.yml | 2 +- .../workflows/test_destination_bigquery.yml | 2 +- .../workflows/test_destination_clickhouse.yml | 2 +- .../workflows/test_destination_databricks.yml | 2 +- .github/workflows/test_destination_dremio.yml | 2 +- .../workflows/test_destination_motherduck.yml | 2 +- .github/workflows/test_destination_mssql.yml | 2 +- .../workflows/test_destination_snowflake.yml | 2 +- .../workflows/test_destination_synapse.yml | 2 +- .github/workflows/test_destinations.yml | 2 +- .github/workflows/test_local_destinations.yml | 2 +- .../test_sqlalchemy_destinations.yml | 2 +- dlt/common/destination/reference.py | 10 +- dlt/destinations/dataset.py | 412 ------------------ dlt/destinations/dataset/__init__.py | 19 + dlt/destinations/dataset/dataset.py | 142 ++++++ dlt/destinations/dataset/exceptions.py | 22 + dlt/destinations/dataset/factory.py | 22 + dlt/destinations/dataset/ibis_relation.py | 224 ++++++++++ dlt/destinations/dataset/relation.py | 207 +++++++++ dlt/destinations/dataset/utils.py | 95 ++++ .../impl/sqlalchemy/db_api_client.py | 4 +- dlt/{common/libs => helpers}/ibis.py | 58 ++- dlt/pipeline/pipeline.py | 12 +- .../general-usage/dataset-access/dataset.md | 58 +++ poetry.lock | 105 ++--- pyproject.toml | 7 +- .../test_readable_dbapi_dataset.py | 30 +- tests/load/pipeline/test_duckdb.py | 8 +- tests/load/test_read_interfaces.py | 363 ++++++++++++--- 31 files changed, 1245 insertions(+), 579 deletions(-) delete mode 100644 dlt/destinations/dataset.py create mode 100644 dlt/destinations/dataset/__init__.py create mode 100644 
dlt/destinations/dataset/dataset.py create mode 100644 dlt/destinations/dataset/exceptions.py create mode 100644 dlt/destinations/dataset/factory.py create mode 100644 dlt/destinations/dataset/ibis_relation.py create mode 100644 dlt/destinations/dataset/relation.py create mode 100644 dlt/destinations/dataset/utils.py rename dlt/{common/libs => helpers}/ibis.py (74%) diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 1169fab0de..03eb7f9434 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 7ccefcc055..3412e789e3 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -67,7 +67,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index 7afc9b8a00..eb8b63f757 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -66,7 +66,7 @@ jobs: - name: Install dependencies # if: 
steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_clickhouse.yml b/.github/workflows/test_destination_clickhouse.yml index 7f297db971..46464ea462 100644 --- a/.github/workflows/test_destination_clickhouse.yml +++ b/.github/workflows/test_destination_clickhouse.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E clickhouse --with providers -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_databricks.yml b/.github/workflows/test_destination_databricks.yml index 1656fe27f4..c1609de863 100644 --- a/.github/workflows/test_destination_databricks.yml +++ b/.github/workflows/test_destination_databricks.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E databricks -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_dremio.yml 
b/.github/workflows/test_destination_dremio.yml index 45c6d17db1..4bc48c54db 100644 --- a/.github/workflows/test_destination_dremio.yml +++ b/.github/workflows/test_destination_dremio.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - run: | poetry run pytest tests/load --ignore tests/load/sources diff --git a/.github/workflows/test_destination_motherduck.yml b/.github/workflows/test_destination_motherduck.yml index 0014b17655..db81131266 100644 --- a/.github/workflows/test_destination_motherduck.yml +++ b/.github/workflows/test_destination_motherduck.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-motherduck - name: Install dependencies - run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E motherduck -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index 8b899e7da2..6fdd7a5bc5 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -69,7 +69,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet 
--with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index a720c479bd..73a2a8f6e7 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -64,7 +64,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index be1b493916..8f6bf1eb29 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -67,7 +67,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline,ibis - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index 933248d994..cfd0a3bd56 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -78,7 +78,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E 
postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 4947a46a3b..6f44e5fd5a 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/.github/workflows/test_sqlalchemy_destinations.yml b/.github/workflows/test_sqlalchemy_destinations.yml index c2572b322d..1f00373674 100644 --- a/.github/workflows/test_sqlalchemy_destinations.yml +++ b/.github/workflows/test_sqlalchemy_destinations.yml @@ -86,7 +86,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" + run: poetry install 
--no-interaction -E parquet -E filesystem -E sqlalchemy -E cli --with sentry-sdk --with pipeline,ibis && poetry run pip install mysqlclient && poetry run pip install "sqlalchemy==${{ matrix.sqlalchemy }}" - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index e27f99cde7..048fe2186f 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -67,7 +67,7 @@ TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") TDestinationDwhClient = TypeVar("TDestinationDwhClient", bound="DestinationClientDwhConfiguration") -TDatasetType = Literal["dbapi", "ibis"] +TDatasetType = Literal["auto", "default", "ibis"] DEFAULT_FILE_LAYOUT = "{table_name}/{load_id}.{file_id}.{ext}" @@ -76,7 +76,7 @@ try: from dlt.common.libs.pandas import DataFrame from dlt.common.libs.pyarrow import Table as ArrowTable - from dlt.common.libs.ibis import BaseBackend as IbisBackend + from dlt.helpers.ibis import BaseBackend as IbisBackend except MissingDependencyException: DataFrame = Any ArrowTable = Any @@ -535,7 +535,7 @@ def fetchone(self) -> Optional[Tuple[Any, ...]]: ... # modifying access parameters - def limit(self, limit: int) -> "SupportsReadableRelation": + def limit(self, limit: int, **kwargs: Any) -> "SupportsReadableRelation": """limit the result to 'limit' items""" ... @@ -557,6 +557,10 @@ def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRe """set which columns will be selected""" ... + def __getattr__(self, attr: str) -> Any: + """get an attribute of the relation""" + ... + def __copy__(self) -> "SupportsReadableRelation": """create a copy of the relation object""" ... 
diff --git a/dlt/destinations/dataset.py b/dlt/destinations/dataset.py deleted file mode 100644 index 27a7f5a7af..0000000000 --- a/dlt/destinations/dataset.py +++ /dev/null @@ -1,412 +0,0 @@ -from typing import Any, Generator, Sequence, Union, TYPE_CHECKING, Tuple - -from contextlib import contextmanager - -from dlt import version -from dlt.common.json import json -from dlt.common.exceptions import MissingDependencyException -from dlt.common.destination import AnyDestination -from dlt.common.destination.reference import ( - SupportsReadableRelation, - SupportsReadableDataset, - TDatasetType, - TDestinationReferenceArg, - Destination, - JobClientBase, - WithStateSync, - DestinationClientDwhConfiguration, - DestinationClientStagingConfiguration, - DestinationClientConfiguration, - DestinationClientDwhWithStagingConfiguration, -) - -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.destinations.sql_client import SqlClientBase, WithSqlClient -from dlt.common.schema import Schema -from dlt.common.exceptions import DltException - -if TYPE_CHECKING: - try: - from dlt.common.libs.ibis import BaseBackend as IbisBackend - except MissingDependencyException: - IbisBackend = Any -else: - IbisBackend = Any - - -class DatasetException(DltException): - pass - - -class ReadableRelationHasQueryException(DatasetException): - def __init__(self, attempted_change: str) -> None: - msg = ( - "This readable relation was created with a provided sql query. You cannot change" - f" {attempted_change}. Please change the orignal sql query." - ) - super().__init__(msg) - - -class ReadableRelationUnknownColumnException(DatasetException): - def __init__(self, column_name: str) -> None: - msg = ( - f"The selected column {column_name} is not known in the dlt schema for this releation." 
- ) - super().__init__(msg) - - -class ReadableDBAPIRelation(SupportsReadableRelation): - def __init__( - self, - *, - readable_dataset: "ReadableDBAPIDataset", - provided_query: Any = None, - table_name: str = None, - limit: int = None, - selected_columns: Sequence[str] = None, - ) -> None: - """Create a lazy evaluated relation to for the dataset of a destination""" - - # NOTE: we can keep an assertion here, this class will not be created by the user - assert bool(table_name) != bool( - provided_query - ), "Please provide either an sql query OR a table_name" - - self._dataset = readable_dataset - - self._provided_query = provided_query - self._table_name = table_name - self._limit = limit - self._selected_columns = selected_columns - - # wire protocol functions - self.df = self._wrap_func("df") # type: ignore - self.arrow = self._wrap_func("arrow") # type: ignore - self.fetchall = self._wrap_func("fetchall") # type: ignore - self.fetchmany = self._wrap_func("fetchmany") # type: ignore - self.fetchone = self._wrap_func("fetchone") # type: ignore - - self.iter_df = self._wrap_iter("iter_df") # type: ignore - self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore - self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore - - @property - def sql_client(self) -> SqlClientBase[Any]: - return self._dataset.sql_client - - @property - def schema(self) -> Schema: - return self._dataset.schema - - @property - def query(self) -> Any: - """build the query""" - if self._provided_query: - return self._provided_query - - table_name = self.sql_client.make_qualified_table_name( - self.schema.naming.normalize_tables_path(self._table_name) - ) - - maybe_limit_clause_1 = "" - maybe_limit_clause_2 = "" - if self._limit: - maybe_limit_clause_1, maybe_limit_clause_2 = self.sql_client._limit_clause_sql( - self._limit - ) - - selector = "*" - if self._selected_columns: - selector = ",".join( - [ - self.sql_client.escape_column_name(self.schema.naming.normalize_path(c)) - for 
c in self._selected_columns - ] - ) - - return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" - - @property - def columns_schema(self) -> TTableSchemaColumns: - return self.compute_columns_schema() - - @columns_schema.setter - def columns_schema(self, new_value: TTableSchemaColumns) -> None: - raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") - - def compute_columns_schema(self) -> TTableSchemaColumns: - """provide schema columns for the cursor, may be filtered by selected columns""" - - columns_schema = ( - self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} - ) - - if not columns_schema: - return None - if not self._selected_columns: - return columns_schema - - filtered_columns: TTableSchemaColumns = {} - for sc in self._selected_columns: - sc = self.schema.naming.normalize_path(sc) - if sc not in columns_schema.keys(): - raise ReadableRelationUnknownColumnException(sc) - filtered_columns[sc] = columns_schema[sc] - - return filtered_columns - - @contextmanager - def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: - """Gets a DBApiCursor for the current relation""" - with self.sql_client as client: - # this hacky code is needed for mssql to disable autocommit, read iterators - # will not work otherwise. 
in the future we should be able to create a readony - # client which will do this automatically - if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): - self.sql_client._conn.autocommit = False - with client.execute_query(self.query) as cursor: - if columns_schema := self.columns_schema: - cursor.columns_schema = columns_schema - yield cursor - - def _wrap_iter(self, func_name: str) -> Any: - """wrap SupportsReadableRelation generators in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - yield from getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def _wrap_func(self, func_name: str) -> Any: - """wrap SupportsReadableRelation functions in cursor context""" - - def _wrap(*args: Any, **kwargs: Any) -> Any: - with self.cursor() as cursor: - return getattr(cursor, func_name)(*args, **kwargs) - - return _wrap - - def __copy__(self) -> "ReadableDBAPIRelation": - return self.__class__( - readable_dataset=self._dataset, - provided_query=self._provided_query, - table_name=self._table_name, - limit=self._limit, - selected_columns=self._selected_columns, - ) - - def limit(self, limit: int) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("limit") - rel = self.__copy__() - rel._limit = limit - return rel - - def select(self, *columns: str) -> "ReadableDBAPIRelation": - if self._provided_query: - raise ReadableRelationHasQueryException("select") - rel = self.__copy__() - rel._selected_columns = columns - # NOTE: the line below will ensure that no unknown columns are selected if - # schema is known - rel.compute_columns_schema() - return rel - - def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": - if isinstance(columns, str): - return self.select(columns) - elif isinstance(columns, Sequence): - return self.select(*columns) - else: - raise TypeError(f"Invalid argument type: 
{type(columns).__name__}") - - def head(self, limit: int = 5) -> "ReadableDBAPIRelation": - return self.limit(limit) - - -class ReadableDBAPIDataset(SupportsReadableDataset): - """Access to dataframes and arrowtables in the destination dataset via dbapi""" - - def __init__( - self, - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - ) -> None: - self._destination = Destination.from_reference(destination) - self._provided_schema = schema - self._dataset_name = dataset_name - self._sql_client: SqlClientBase[Any] = None - self._schema: Schema = None - - def ibis(self) -> IbisBackend: - """return a connected ibis backend""" - from dlt.common.libs.ibis import create_ibis_backend - - self._ensure_client_and_schema() - return create_ibis_backend( - self._destination, - self._destination_client(self.schema), - ) - - @property - def schema(self) -> Schema: - self._ensure_client_and_schema() - return self._schema - - @property - def sql_client(self) -> SqlClientBase[Any]: - self._ensure_client_and_schema() - return self._sql_client - - def _destination_client(self, schema: Schema) -> JobClientBase: - return get_destination_clients( - schema, destination=self._destination, destination_dataset_name=self._dataset_name - )[0] - - def _ensure_client_and_schema(self) -> None: - """Lazy load schema and client""" - - # full schema given, nothing to do - if not self._schema and isinstance(self._provided_schema, Schema): - self._schema = self._provided_schema - - # schema name given, resolve it from destination by name - elif not self._schema and isinstance(self._provided_schema, str): - with self._destination_client(Schema(self._provided_schema)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema(self._provided_schema) - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - else: - self._schema = Schema(self._provided_schema) - - # no schema 
name given, load newest schema from destination - elif not self._schema: - with self._destination_client(Schema(self._dataset_name)) as client: - if isinstance(client, WithStateSync): - stored_schema = client.get_stored_schema() - if stored_schema: - self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) - - # default to empty schema with dataset name - if not self._schema: - self._schema = Schema(self._dataset_name) - - # here we create the client bound to the resolved schema - if not self._sql_client: - destination_client = self._destination_client(self._schema) - if isinstance(destination_client, WithSqlClient): - self._sql_client = destination_client.sql_client - else: - raise Exception( - f"Destination {destination_client.config.destination_type} does not support" - " SqlClient." - ) - - def __call__(self, query: Any) -> ReadableDBAPIRelation: - return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] - - def table(self, table_name: str) -> SupportsReadableRelation: - return ReadableDBAPIRelation( - readable_dataset=self, - table_name=table_name, - ) # type: ignore[abstract] - - def __getitem__(self, table_name: str) -> SupportsReadableRelation: - """access of table via dict notation""" - return self.table(table_name) - - def __getattr__(self, table_name: str) -> SupportsReadableRelation: - """access of table via property notation""" - return self.table(table_name) - - -def dataset( - destination: TDestinationReferenceArg, - dataset_name: str, - schema: Union[Schema, str, None] = None, - dataset_type: TDatasetType = "dbapi", -) -> SupportsReadableDataset: - if dataset_type == "dbapi": - return ReadableDBAPIDataset(destination, dataset_name, schema) - raise NotImplementedError(f"Dataset of type {dataset_type} not implemented") - - -# helpers -def get_destination_client_initial_config( - destination: AnyDestination, - default_schema_name: str, - dataset_name: str, - as_staging: bool = False, -) -> 
DestinationClientConfiguration: - client_spec = destination.spec - - # this client supports many schemas and datasets - if issubclass(client_spec, DestinationClientDwhConfiguration): - if issubclass(client_spec, DestinationClientStagingConfiguration): - spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) - else: - spec = client_spec() - - spec._bind_dataset_name(dataset_name, default_schema_name) - return spec - - return client_spec() - - -def get_destination_clients( - schema: Schema, - destination: AnyDestination = None, - destination_dataset_name: str = None, - destination_initial_config: DestinationClientConfiguration = None, - staging: AnyDestination = None, - staging_dataset_name: str = None, - staging_initial_config: DestinationClientConfiguration = None, - # pipeline specific settings - default_schema_name: str = None, -) -> Tuple[JobClientBase, JobClientBase]: - destination = Destination.from_reference(destination) if destination else None - staging = Destination.from_reference(staging) if staging else None - - try: - # resolve staging config in order to pass it to destination client config - staging_client = None - if staging: - if not staging_initial_config: - # this is just initial config - without user configuration injected - staging_initial_config = get_destination_client_initial_config( - staging, - dataset_name=staging_dataset_name, - default_schema_name=default_schema_name, - as_staging=True, - ) - # create the client - that will also resolve the config - staging_client = staging.client(schema, staging_initial_config) - - if not destination_initial_config: - # config is not provided then get it with injected credentials - initial_config = get_destination_client_initial_config( - destination, - dataset_name=destination_dataset_name, - default_schema_name=default_schema_name, - ) - - # attach the staging client config to destination client config - if its type supports it - if ( - staging_client - and 
isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) - and isinstance(staging_client.config, DestinationClientStagingConfiguration) - ): - initial_config.staging_config = staging_client.config - # create instance with initial_config properly set - client = destination.client(schema, initial_config) - return client, staging_client - except ModuleNotFoundError: - client_spec = destination.spec() - raise MissingDependencyException( - f"{client_spec.destination_type} destination", - [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], - "Dependencies for specific destinations are available as extras of dlt", - ) diff --git a/dlt/destinations/dataset/__init__.py b/dlt/destinations/dataset/__init__.py new file mode 100644 index 0000000000..e0eef681b8 --- /dev/null +++ b/dlt/destinations/dataset/__init__.py @@ -0,0 +1,19 @@ +from dlt.destinations.dataset.factory import ( + dataset, +) +from dlt.destinations.dataset.dataset import ( + ReadableDBAPIDataset, + get_destination_clients, +) +from dlt.destinations.dataset.utils import ( + get_destination_clients, + get_destination_client_initial_config, +) + + +__all__ = [ + "dataset", + "ReadableDBAPIDataset", + "get_destination_client_initial_config", + "get_destination_clients", +] diff --git a/dlt/destinations/dataset/dataset.py b/dlt/destinations/dataset/dataset.py new file mode 100644 index 0000000000..e443045e49 --- /dev/null +++ b/dlt/destinations/dataset/dataset.py @@ -0,0 +1,142 @@ +from typing import Any, Union, TYPE_CHECKING + +from dlt.common.json import json + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination.reference import ( + SupportsReadableRelation, + SupportsReadableDataset, + TDestinationReferenceArg, + Destination, + JobClientBase, + WithStateSync, +) + +from dlt.destinations.sql_client import SqlClientBase, WithSqlClient +from dlt.common.schema import Schema +from dlt.destinations.dataset.relation import ReadableDBAPIRelation +from 
dlt.destinations.dataset.utils import get_destination_clients +from dlt.common.destination.reference import TDatasetType + +if TYPE_CHECKING: + try: + from dlt.helpers.ibis import BaseBackend as IbisBackend + except MissingDependencyException: + IbisBackend = Any +else: + IbisBackend = Any + + +class ReadableDBAPIDataset(SupportsReadableDataset): + """Access to dataframes and arrowtables in the destination dataset via dbapi""" + + def __init__( + self, + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", + ) -> None: + self._destination = Destination.from_reference(destination) + self._provided_schema = schema + self._dataset_name = dataset_name + self._sql_client: SqlClientBase[Any] = None + self._schema: Schema = None + self._dataset_type = dataset_type + + def ibis(self) -> IbisBackend: + """return a connected ibis backend""" + from dlt.helpers.ibis import create_ibis_backend + + self._ensure_client_and_schema() + return create_ibis_backend( + self._destination, + self._destination_client(self.schema), + ) + + @property + def schema(self) -> Schema: + self._ensure_client_and_schema() + return self._schema + + @property + def sql_client(self) -> SqlClientBase[Any]: + self._ensure_client_and_schema() + return self._sql_client + + def _destination_client(self, schema: Schema) -> JobClientBase: + return get_destination_clients( + schema, destination=self._destination, destination_dataset_name=self._dataset_name + )[0] + + def _ensure_client_and_schema(self) -> None: + """Lazy load schema and client""" + + # full schema given, nothing to do + if not self._schema and isinstance(self._provided_schema, Schema): + self._schema = self._provided_schema + + # schema name given, resolve it from destination by name + elif not self._schema and isinstance(self._provided_schema, str): + with self._destination_client(Schema(self._provided_schema)) as client: + if isinstance(client, 
WithStateSync): + stored_schema = client.get_stored_schema(self._provided_schema) + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + else: + self._schema = Schema(self._provided_schema) + + # no schema name given, load newest schema from destination + elif not self._schema: + with self._destination_client(Schema(self._dataset_name)) as client: + if isinstance(client, WithStateSync): + stored_schema = client.get_stored_schema() + if stored_schema: + self._schema = Schema.from_stored_schema(json.loads(stored_schema.schema)) + + # default to empty schema with dataset name + if not self._schema: + self._schema = Schema(self._dataset_name) + + # here we create the client bound to the resolved schema + if not self._sql_client: + destination_client = self._destination_client(self._schema) + if isinstance(destination_client, WithSqlClient): + self._sql_client = destination_client.sql_client + else: + raise Exception( + f"Destination {destination_client.config.destination_type} does not support" + " SqlClient." 
+ ) + + def __call__(self, query: Any) -> ReadableDBAPIRelation: + return ReadableDBAPIRelation(readable_dataset=self, provided_query=query) # type: ignore[abstract] + + def table(self, table_name: str) -> SupportsReadableRelation: + # we can create an ibis powered relation if ibis is available + if table_name in self.schema.tables and self._dataset_type in ("auto", "ibis"): + try: + from dlt.helpers.ibis import create_unbound_ibis_table + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + unbound_table = create_unbound_ibis_table(self.sql_client, self.schema, table_name) + return ReadableIbisRelation(readable_dataset=self, ibis_object=unbound_table, columns_schema=self.schema.tables[table_name]["columns"]) # type: ignore[abstract] + except MissingDependencyException: + # if ibis is explicitly requested, reraise + if self._dataset_type == "ibis": + raise + + # fallback to the standard dbapi relation + return ReadableDBAPIRelation( + readable_dataset=self, + table_name=table_name, + ) # type: ignore[abstract] + + def __getitem__(self, table_name: str) -> SupportsReadableRelation: + """access of table via dict notation""" + return self.table(table_name) + + def __getattr__(self, table_name: str) -> SupportsReadableRelation: + """access of table via property notation""" + return self.table(table_name) diff --git a/dlt/destinations/dataset/exceptions.py b/dlt/destinations/dataset/exceptions.py new file mode 100644 index 0000000000..17e8f6b563 --- /dev/null +++ b/dlt/destinations/dataset/exceptions.py @@ -0,0 +1,22 @@ +from dlt.common.exceptions import DltException + + +class DatasetException(DltException): + pass + + +class ReadableRelationHasQueryException(DatasetException): + def __init__(self, attempted_change: str) -> None: + msg = ( + "This readable relation was created with a provided sql query. You cannot change" + f" {attempted_change}. Please change the orignal sql query." 
+ ) + super().__init__(msg) + + +class ReadableRelationUnknownColumnException(DatasetException): + def __init__(self, column_name: str) -> None: + msg = ( + f"The selected column {column_name} is not known in the dlt schema for this releation." + ) + super().__init__(msg) diff --git a/dlt/destinations/dataset/factory.py b/dlt/destinations/dataset/factory.py new file mode 100644 index 0000000000..8ea0ddf7a1 --- /dev/null +++ b/dlt/destinations/dataset/factory.py @@ -0,0 +1,22 @@ +from typing import Union + + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + SupportsReadableDataset, + TDatasetType, + TDestinationReferenceArg, +) + +from dlt.common.schema import Schema + +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset + + +def dataset( + destination: TDestinationReferenceArg, + dataset_name: str, + schema: Union[Schema, str, None] = None, + dataset_type: TDatasetType = "auto", +) -> SupportsReadableDataset: + return ReadableDBAPIDataset(destination, dataset_name, schema, dataset_type) diff --git a/dlt/destinations/dataset/ibis_relation.py b/dlt/destinations/dataset/ibis_relation.py new file mode 100644 index 0000000000..632298ad56 --- /dev/null +++ b/dlt/destinations/dataset/ibis_relation.py @@ -0,0 +1,224 @@ +from typing import TYPE_CHECKING, Any, Union, Sequence + +from functools import partial + +from dlt.common.exceptions import MissingDependencyException +from dlt.destinations.dataset.relation import BaseReadableDBAPIRelation +from dlt.common.schema.typing import TTableSchemaColumns + + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + +try: + from dlt.helpers.ibis import Expr +except MissingDependencyException: + Expr = Any + +# map dlt destination to sqlglot dialect +DIALECT_MAP = { + "dlt.destinations.duckdb": "duckdb", # works + "dlt.destinations.motherduck": "duckdb", # works + "dlt.destinations.clickhouse": 
"clickhouse", # works + "dlt.destinations.databricks": "databricks", # works + "dlt.destinations.bigquery": "bigquery", # works + "dlt.destinations.postgres": "postgres", # works + "dlt.destinations.redshift": "redshift", # works + "dlt.destinations.snowflake": "snowflake", # works + "dlt.destinations.mssql": "tsql", # works + "dlt.destinations.synapse": "tsql", # works + "dlt.destinations.athena": "trino", # works + "dlt.destinations.filesystem": "duckdb", # works + "dlt.destinations.dremio": "presto", # works + # NOTE: can we discover the current dialect in sqlalchemy? + "dlt.destinations.sqlalchemy": "mysql", # may work +} + +# NOTE: some dialects are not supported by ibis, but by sqlglot, these need to +# be transpiled with a intermediary step +TRANSPILE_VIA_MAP = { + "tsql": "postgres", + "databricks": "postgres", + "clickhouse": "postgres", + "redshift": "postgres", + "presto": "postgres", +} + + +class ReadableIbisRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: ReadableDBAPIDataset, + ibis_object: Any = None, + columns_schema: TTableSchemaColumns = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + super().__init__(readable_dataset=readable_dataset) + self._ibis_object = ibis_object + self._columns_schema = columns_schema + + @property + def query(self) -> Any: + """build the query""" + + from dlt.helpers.ibis import ibis, sqlglot + + destination_type = self._dataset._destination.destination_type + target_dialect = DIALECT_MAP[destination_type] + + # render sql directly if possible + if target_dialect not in TRANSPILE_VIA_MAP: + return ibis.to_sql(self._ibis_object, dialect=target_dialect) + + # here we need to transpile first + transpile_via = TRANSPILE_VIA_MAP[target_dialect] + sql = ibis.to_sql(self._ibis_object, dialect=transpile_via) + sql = sqlglot.transpile(sql, read=transpile_via, write=target_dialect)[0] + return sql + + @property + def columns_schema(self) -> 
TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + # TODO: provide column lineage tracing with sqlglot lineage + return self._columns_schema + + def _proxy_expression_method(self, method_name: str, *args: Any, **kwargs: Any) -> Any: + """Proxy method calls to the underlying ibis expression, allowing to wrap the resulting expression in a new relation""" + + # Get the method from the expression + method = getattr(self._ibis_object, method_name) + + # unwrap args and kwargs if they are relations + args = tuple( + arg._ibis_object if isinstance(arg, ReadableIbisRelation) else arg for arg in args + ) + kwargs = { + k: v._ibis_object if isinstance(v, ReadableIbisRelation) else v + for k, v in kwargs.items() + } + + # casefold string params, we assume these are column names; NOTE(review): every str argument is casefolded, including string values such as the right-hand side of a comparison - confirm this is intended + args = tuple( + self.sql_client.capabilities.casefold_identifier(arg) if isinstance(arg, str) else arg + for arg in args + ) + kwargs = { + k: self.sql_client.capabilities.casefold_identifier(v) if isinstance(v, str) else v + for k, v in kwargs.items() + } + + # Call it with provided args + result = method(*args, **kwargs) + + # calculate columns schema for the result, some operations we know will not change the schema + # and select will just reduce the number of columns; for all other methods the schema is dropped (None) + columns_schema = None + if method_name == "select": + columns_schema = self._get_filtered_columns_schema(args) + elif method_name in ["filter", "limit", "order_by", "head"]: + columns_schema = self._columns_schema + + # the result is always wrapped in a new relation, no check is made that it is an ibis expression + return self.__class__( + readable_dataset=self._dataset, ibis_object=result,
columns_schema=columns_schema + ) + + def __getattr__(self, name: str) -> Any: + """Wrap all callable attributes of the expression""" + + attr = getattr(self._ibis_object, name, None) + + # try casefolded name for ibis columns access + if attr is None: + name = self.sql_client.capabilities.casefold_identifier(name) + attr = getattr(self._ibis_object, name, None) + + if attr is None: + raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'") + + if not callable(attr): + # NOTE: we don't need to forward columns schema for non-callable attributes, these are usually columns + return self.__class__(readable_dataset=self._dataset, ibis_object=attr) + + return partial(self._proxy_expression_method, name) + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "ReadableIbisRelation": + # casefold column-names + columns = [columns] if isinstance(columns, str) else columns + columns = [self.sql_client.capabilities.casefold_identifier(col) for col in columns] + expr = self._ibis_object[columns] + return self.__class__( + readable_dataset=self._dataset, + ibis_object=expr, + columns_schema=self._get_filtered_columns_schema(columns), + ) + + def _get_filtered_columns_schema(self, columns: Sequence[str]) -> TTableSchemaColumns: + if not self._columns_schema: + return None + try: + return {col: self._columns_schema[col] for col in columns} + except KeyError: + # NOTE: select statements can contain new columns not present in the original schema + # here we just break the column schema inheritance chain + return None + + # forward ibis methods defined on interface + def limit(self, limit: int, **kwargs: Any) -> "ReadableIbisRelation": + """limit the result to 'limit' items""" + return self._proxy_expression_method("limit", limit, **kwargs) # type: ignore + + def head(self, limit: int = 5) -> "ReadableIbisRelation": + """limit the result to 5 items by default""" + return self._proxy_expression_method("head", limit) # type: ignore + + def 
select(self, *columns: str) -> "ReadableIbisRelation": + """set which columns will be selected""" + return self._proxy_expression_method("select", *columns) # type: ignore + + # forward ibis comparison and math operators + def __lt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__lt__", other) # type: ignore + + def __gt__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__gt__", other) # type: ignore + + def __ge__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__ge__", other) # type: ignore + + def __le__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__le__", other) # type: ignore + + def __eq__(self, other: Any) -> bool: + return self._proxy_expression_method("__eq__", other) # type: ignore + + def __ne__(self, other: Any) -> bool: + return self._proxy_expression_method("__ne__", other) # type: ignore + + def __and__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__and__", other) # type: ignore + + def __or__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__or__", other) # type: ignore + + def __mul__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__mul__", other) # type: ignore + + def __div__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__div__", other) # type: ignore + + def __add__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__add__", other) # type: ignore + + def __sub__(self, other: Any) -> "ReadableIbisRelation": + return self._proxy_expression_method("__sub__", other) # type: ignore diff --git a/dlt/destinations/dataset/relation.py b/dlt/destinations/dataset/relation.py new file mode 100644 index 0000000000..2cdb7640df --- /dev/null +++ b/dlt/destinations/dataset/relation.py @@ -0,0 +1,207 @@ +from typing import 
Any, Generator, Sequence, Union, TYPE_CHECKING + +from contextlib import contextmanager + + +from dlt.common.destination.reference import ( + SupportsReadableRelation, +) + +from dlt.destinations.dataset.exceptions import ( + ReadableRelationHasQueryException, + ReadableRelationUnknownColumnException, +) + +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.destinations.sql_client import SqlClientBase +from dlt.common.schema import Schema + +if TYPE_CHECKING: + from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +else: + ReadableDBAPIDataset = Any + + +class BaseReadableDBAPIRelation(SupportsReadableRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + self._dataset = readable_dataset + + # wire protocol functions + self.df = self._wrap_func("df") # type: ignore + self.arrow = self._wrap_func("arrow") # type: ignore + self.fetchall = self._wrap_func("fetchall") # type: ignore + self.fetchmany = self._wrap_func("fetchmany") # type: ignore + self.fetchone = self._wrap_func("fetchone") # type: ignore + + self.iter_df = self._wrap_iter("iter_df") # type: ignore + self.iter_arrow = self._wrap_iter("iter_arrow") # type: ignore + self.iter_fetch = self._wrap_iter("iter_fetch") # type: ignore + + @property + def sql_client(self) -> SqlClientBase[Any]: + return self._dataset.sql_client + + @property + def schema(self) -> Schema: + return self._dataset.schema + + @property + def query(self) -> Any: + raise NotImplementedError("No query in ReadableDBAPIRelation") + + @contextmanager + def cursor(self) -> Generator[SupportsReadableRelation, Any, Any]: + """Gets a DBApiCursor for the current relation""" + with self.sql_client as client: + # this hacky code is needed for mssql to disable autocommit, read iterators + # will not work otherwise. 
in the future we should be able to create a readony + # client which will do this automatically + if hasattr(self.sql_client, "_conn") and hasattr(self.sql_client._conn, "autocommit"): + self.sql_client._conn.autocommit = False + with client.execute_query(self.query) as cursor: + if columns_schema := self.columns_schema: + cursor.columns_schema = columns_schema + yield cursor + + def _wrap_iter(self, func_name: str) -> Any: + """wrap SupportsReadableRelation generators in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + yield from getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + def _wrap_func(self, func_name: str) -> Any: + """wrap SupportsReadableRelation functions in cursor context""" + + def _wrap(*args: Any, **kwargs: Any) -> Any: + with self.cursor() as cursor: + return getattr(cursor, func_name)(*args, **kwargs) + + return _wrap + + +class ReadableDBAPIRelation(BaseReadableDBAPIRelation): + def __init__( + self, + *, + readable_dataset: "ReadableDBAPIDataset", + provided_query: Any = None, + table_name: str = None, + limit: int = None, + selected_columns: Sequence[str] = None, + ) -> None: + """Create a lazy evaluated relation to for the dataset of a destination""" + + # NOTE: we can keep an assertion here, this class will not be created by the user + assert bool(table_name) != bool( + provided_query + ), "Please provide either an sql query OR a table_name" + + super().__init__(readable_dataset=readable_dataset) + + self._provided_query = provided_query + self._table_name = table_name + self._limit = limit + self._selected_columns = selected_columns + + @property + def query(self) -> Any: + """build the query""" + if self._provided_query: + return self._provided_query + + table_name = self.sql_client.make_qualified_table_name( + self.schema.naming.normalize_path(self._table_name) + ) + + maybe_limit_clause_1 = "" + maybe_limit_clause_2 = "" + if self._limit: + maybe_limit_clause_1, 
maybe_limit_clause_2 = self.sql_client._limit_clause_sql( + self._limit + ) + + selector = "*" + if self._selected_columns: + selector = ",".join( + [ + self.sql_client.escape_column_name(self.schema.naming.normalize_tables_path(c)) + for c in self._selected_columns + ] + ) + + return f"SELECT {maybe_limit_clause_1} {selector} FROM {table_name} {maybe_limit_clause_2}" + + @property + def columns_schema(self) -> TTableSchemaColumns: + return self.compute_columns_schema() + + @columns_schema.setter + def columns_schema(self, new_value: TTableSchemaColumns) -> None: + raise NotImplementedError("columns schema in ReadableDBAPIRelation can only be computed") + + def compute_columns_schema(self) -> TTableSchemaColumns: + """provide schema columns for the cursor, may be filtered by selected columns""" + + columns_schema = ( + self.schema.tables.get(self._table_name, {}).get("columns", {}) if self.schema else {} + ) + + if not columns_schema: + return None + if not self._selected_columns: + return columns_schema + + filtered_columns: TTableSchemaColumns = {} + for sc in self._selected_columns: + sc = self.schema.naming.normalize_path(sc) + if sc not in columns_schema.keys(): + raise ReadableRelationUnknownColumnException(sc) + filtered_columns[sc] = columns_schema[sc] + + return filtered_columns + + def __copy__(self) -> "ReadableDBAPIRelation": + return self.__class__( + readable_dataset=self._dataset, + provided_query=self._provided_query, + table_name=self._table_name, + limit=self._limit, + selected_columns=self._selected_columns, + ) + + def limit(self, limit: int, **kwargs: Any) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("limit") + rel = self.__copy__() + rel._limit = limit + return rel + + def select(self, *columns: str) -> "ReadableDBAPIRelation": + if self._provided_query: + raise ReadableRelationHasQueryException("select") + rel = self.__copy__() + rel._selected_columns = columns + # NOTE: the line below will 
ensure that no unknown columns are selected if + # schema is known + rel.compute_columns_schema() + return rel + + def __getitem__(self, columns: Union[str, Sequence[str]]) -> "SupportsReadableRelation": + if isinstance(columns, str): + return self.select(columns) + elif isinstance(columns, Sequence): + return self.select(*columns) + else: + raise TypeError(f"Invalid argument type: {type(columns).__name__}") + + def head(self, limit: int = 5) -> "ReadableDBAPIRelation": + return self.limit(limit) diff --git a/dlt/destinations/dataset/utils.py b/dlt/destinations/dataset/utils.py new file mode 100644 index 0000000000..766fbc13ea --- /dev/null +++ b/dlt/destinations/dataset/utils.py @@ -0,0 +1,95 @@ +from typing import Tuple + +from dlt import version + +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.destination import AnyDestination +from dlt.common.destination.reference import ( + Destination, + JobClientBase, + DestinationClientDwhConfiguration, + DestinationClientStagingConfiguration, + DestinationClientConfiguration, + DestinationClientDwhWithStagingConfiguration, +) + +from dlt.common.schema import Schema + + +# helpers +def get_destination_client_initial_config( + destination: AnyDestination, + default_schema_name: str, + dataset_name: str, + as_staging: bool = False, +) -> DestinationClientConfiguration: + client_spec = destination.spec + + # this client supports many schemas and datasets + if issubclass(client_spec, DestinationClientDwhConfiguration): + if issubclass(client_spec, DestinationClientStagingConfiguration): + spec: DestinationClientDwhConfiguration = client_spec(as_staging_destination=as_staging) + else: + spec = client_spec() + + spec._bind_dataset_name(dataset_name, default_schema_name) + return spec + + return client_spec() + + +def get_destination_clients( + schema: Schema, + destination: AnyDestination = None, + destination_dataset_name: str = None, + destination_initial_config: DestinationClientConfiguration 
= None, + staging: AnyDestination = None, + staging_dataset_name: str = None, + staging_initial_config: DestinationClientConfiguration = None, + # pipeline specific settings + default_schema_name: str = None, +) -> Tuple[JobClientBase, JobClientBase]: + destination = Destination.from_reference(destination) if destination else None + staging = Destination.from_reference(staging) if staging else None + + try: + # resolve staging config in order to pass it to destination client config + staging_client = None + if staging: + if not staging_initial_config: + # this is just initial config - without user configuration injected + staging_initial_config = get_destination_client_initial_config( + staging, + dataset_name=staging_dataset_name, + default_schema_name=default_schema_name, + as_staging=True, + ) + # create the client - that will also resolve the config + staging_client = staging.client(schema, staging_initial_config) + + initial_config = destination_initial_config + if not initial_config: + # config is not provided then get it with injected credentials + initial_config = get_destination_client_initial_config( + destination, + dataset_name=destination_dataset_name, + default_schema_name=default_schema_name, + ) + # attach the staging client config to destination client config - if its type supports it + if ( + staging_client + and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) + and isinstance(staging_client.config, DestinationClientStagingConfiguration) + ): + initial_config.staging_config = staging_client.config + # create instance with initial_config properly set + client = destination.client(schema, initial_config) + return client, staging_client + except ModuleNotFoundError: + client_spec = destination.spec() + raise MissingDependencyException( + f"{client_spec.destination_type} destination", + [f"{version.DLT_PKG_NAME}[{client_spec.destination_type}]"], + "Dependencies for specific destinations are available as extras of dlt", + ) diff --git
a/dlt/destinations/impl/sqlalchemy/db_api_client.py b/dlt/destinations/impl/sqlalchemy/db_api_client.py index 6f3ff065bf..27c4f2f1f9 100644 --- a/dlt/destinations/impl/sqlalchemy/db_api_client.py +++ b/dlt/destinations/impl/sqlalchemy/db_api_client.py @@ -84,7 +84,7 @@ def __init__(self, curr: sa.engine.CursorResult) -> None: def _get_columns(self) -> List[str]: try: - return list(self.native_cursor.keys()) # type: ignore[attr-defined] + return list(self.native_cursor.keys()) except ResourceClosedError: # this happens if now rows are returned return [] @@ -314,7 +314,7 @@ def execute_sql( self, sql: Union[AnyStr, sa.sql.Executable], *args: Any, **kwargs: Any ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as cursor: - if cursor.returns_rows: # type: ignore[attr-defined] + if cursor.returns_rows: return cursor.fetchall() return None diff --git a/dlt/common/libs/ibis.py b/dlt/helpers/ibis.py similarity index 74% rename from dlt/common/libs/ibis.py rename to dlt/helpers/ibis.py index ba6f363e66..ed4264dac7 100644 --- a/dlt/common/libs/ibis.py +++ b/dlt/helpers/ibis.py @@ -1,12 +1,14 @@ -from typing import cast +from typing import cast, Any from dlt.common.exceptions import MissingDependencyException - from dlt.common.destination.reference import TDestinationReferenceArg, Destination, JobClientBase +from dlt.common.schema import Schema +from dlt.destinations.sql_client import SqlClientBase try: import ibis # type: ignore - from ibis import BaseBackend + import sqlglot + from ibis import BaseBackend, Expr except ModuleNotFoundError: raise MissingDependencyException("dlt ibis Helpers", ["ibis"]) @@ -29,6 +31,22 @@ ] +# Map dlt data types to ibis data types +DATA_TYPE_MAP = { + "text": "string", + "double": "float64", + "bool": "boolean", + "timestamp": "timestamp", + "bigint": "int64", + "binary": "binary", + "json": "string", # Store JSON as string in ibis + "decimal": "decimal", + "wei": "int64", # Wei is a large integer + "date": 
"date", + "time": "time", +} + + def create_ibis_backend( destination: TDestinationReferenceArg, client: JobClientBase ) -> BaseBackend: @@ -119,3 +137,37 @@ def create_ibis_backend( con = ibis.duckdb.from_connection(duck) return con + + +def create_unbound_ibis_table( + sql_client: SqlClientBase[Any], schema: Schema, table_name: str +) -> Expr: + """Create an unbound ibis table from a dlt schema""" + + if table_name not in schema.tables: + raise Exception( + f"Table {table_name} not found in schema. Available tables: {schema.tables.keys()}" + ) + table_schema = schema.tables[table_name] + + # Convert dlt table schema columns to ibis schema + ibis_schema = { + sql_client.capabilities.casefold_identifier(col_name): DATA_TYPE_MAP[ + col_info.get("data_type", "string") + ] + for col_name, col_info in table_schema.get("columns", {}).items() + } + + # normalize table name + table_path = sql_client.make_qualified_table_name_path(table_name, escape=False) + + catalog = None + if len(table_path) == 3: + catalog, database, table = table_path + else: + database, table = table_path + + # create unbound ibis table and return in dlt wrapper + unbound_table = ibis.table(schema=ibis_schema, name=table, database=database, catalog=catalog) + + return unbound_table diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 70d160ea67..9bd2d6911f 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -1751,9 +1751,17 @@ def __getstate__(self) -> Any: return {"pipeline_name": self.pipeline_name} def _dataset( - self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "dbapi" + self, schema: Union[Schema, str, None] = None, dataset_type: TDatasetType = "auto" ) -> SupportsReadableDataset: - """Access helper to dataset""" + """Returns a dataset object for querying the destination data. + + Args: + schema: Schema name or Schema object to use. If None, uses the default schema if set. + dataset_type: Type of dataset interface to return. 
Defaults to 'auto' which will select ibis if available + otherwise it will fall back to the standard dbapi interface. + Returns: + A dataset object that supports querying the destination data. + """ if schema is None: schema = self.default_schema if self.default_schema_name else None return dataset( diff --git a/docs/website/docs/general-usage/dataset-access/dataset.md b/docs/website/docs/general-usage/dataset-access/dataset.md index 68635383c5..b2e3f03d4d 100644 --- a/docs/website/docs/general-usage/dataset-access/dataset.md +++ b/docs/website/docs/general-usage/dataset-access/dataset.md @@ -156,6 +156,64 @@ You can combine `select`, `limit`, and other methods. arrow_table = items_relation.select("col1", "col2").limit(50).arrow() ``` +## Modifying queries with ibis expressions + +If you install the amazing [ibis](https://ibis-project.org/) library, you can use ibis expressions to modify your queries. + +```sh +pip install ibis-framework +``` + +dlt will then wrap an `ibis.UnboundTable` with a `ReadableIbisRelation` object under the hood that will allow you to modify the query of a relation using ibis expressions: + +```py +# now that ibis is installed, we can get a dataset with ibis relations +dataset = pipeline._dataset() + +# get two relations +items_relation = dataset["items"] +order_relation = dataset["orders"] + +# join them using an ibis expression +joined_relation = items_relation.join(order_relation, items_relation.id == order_relation.item_id) + +# now we can use the ibis expression to filter the data +filtered_relation = joined_relation.filter(order_relation.status == "completed") + +# we can inspect the query that will be used to read the data +print(filtered_relation.query) + +# and finally fetch the data as a pandas dataframe, the same way we would do with a normal relation +df = filtered_relation.df() + +# a few more examples + +# filter for rows where the id is in the list of ids +items_relation.filter(items_relation.id.isin([1, 2, 3])).df() + +#
limit and offset +items_relation.limit(10, offset=5).arrow() + +# mutate columns by adding a new column that is always 10 times the value of the id column +items_relation.mutate(new_id=items_relation.id * 10).df() + +# sort asc and desc +import ibis +items_relation.order_by(ibis.desc("id"), ibis.asc("price")).limit(10) + +# group by and aggregate +items_relation.group_by("item_group").having(items_relation.count() >= 1000).aggregate(sum_id=items_relation.id.sum()).df() + +# subqueries +items_relation.filter(items_relation.category.isin(beverage_categories.name)).df() +``` + +You can learn more about the available expressions on the [ibis for sql users](https://ibis-project.org/tutorials/ibis-for-sql-users) page. + +:::note +Keep in mind that you can use only methods that modify the executed query and none of the methods ibis provides for fetching data. Fetching data is done with the same methods defined on the regular relations explained above. If you need full native ibis integration, please read the ibis section in the advanced part further down. Additionally, not all ibis expressions may be supported by all destinations and sql dialects. +::: + ## Supported destinations All SQL and filesystem destinations supported by `dlt` can utilize this data access interface. For filesystem destinations, `dlt` [uses **DuckDB** under the hood](./sql-client.md#the-filesystem-sql-client) to create views from Parquet or JSONL files dynamically. This allows you to query data stored in files using the same interface as you would with SQL databases. If you plan on accessing data in buckets or the filesystem a lot this way, it is advised to load data as Parquet instead of JSONL, as **DuckDB** is able to only load the parts of the data actually needed for the query to work. diff --git a/poetry.lock b/poetry.lock index 6232b383c8..749979439d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -776,7 +776,7 @@ files = [ name = "atpublic" version = "5.0" description = "Keep all y'all's __all__'s in sync" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "atpublic-5.0-py3-none-any.whl", hash = "sha256:b651dcd886666b1042d1e38158a22a4f2c267748f4e97fde94bc492a4a28a3f3"}, @@ -1755,7 +1755,7 @@ PyYAML = ">=3.11" name = "clickhouse-connect" version = "0.7.8" description = "ClickHouse Database Core Driver for Python, Pandas, and Superset" -optional = true +optional = false python-versions = "~=3.8" files = [ {file = "clickhouse-connect-0.7.8.tar.gz", hash = "sha256:dad10ba90eabfe215dfb1fef59f2821a95c752988e66f1093ca8590a51539b8f"}, @@ -2242,7 +2242,7 @@ urllib3 = ">=1.0" name = "db-dtypes" version = "1.3.0" description = "Pandas Data Types for SQL systems (BigQuery, Spanner)" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "db_dtypes-1.3.0-py2.py3-none-any.whl", hash = "sha256:7e65c59f849ccbe6f7bc4d0253edcc212a7907662906921caba3e4aadd0bc277"}, @@ -3526,7 +3526,7 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-bigquery-storage" version = "2.27.0" description = "Google Cloud Bigquery Storage API client library" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "google_cloud_bigquery_storage-2.27.0-py2.py3-none-any.whl", hash = "sha256:3bfa8f74a61ceaffd3bfe90be5bbef440ad81c1c19ac9075188cccab34bffc2b"}, @@ -4504,63 +4504,64 @@ files = [ [[package]] name = "ibis-framework" -version = "10.0.0.dev256" +version = "9.5.0" description = "The portable Python dataframe library" -optional = true +optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "ibis_framework-10.0.0.dev256-py3-none-any.whl", hash = "sha256:d6f21278e6fd78920bbe986df2c871921142635cc4f7d5d2048cae26e307a3df"}, - {file = "ibis_framework-10.0.0.dev256.tar.gz", hash = 
"sha256:e9f97d8177fd88f4a3578be20519c1da79a6a7ffac678b46b790bfde67405930"}, + {file = "ibis_framework-9.5.0-py3-none-any.whl", hash = "sha256:145fe30d94f111cff332580c275ce77725c5ff7086eede93af0b371649d009c0"}, + {file = "ibis_framework-9.5.0.tar.gz", hash = "sha256:1c8a29277e63ee0dfc289bc8f550164b5e3bdaec1b76b62436c37d331bb4ef84"}, ] [package.dependencies] atpublic = ">=2.3,<6" clickhouse-connect = {version = ">=0.5.23,<1", extras = ["arrow", "numpy", "pandas"], optional = true, markers = "extra == \"clickhouse\""} db-dtypes = {version = ">=0.3,<2", optional = true, markers = "extra == \"bigquery\""} -duckdb = {version = ">=0.10,<1.2", optional = true, markers = "extra == \"duckdb\""} +duckdb = {version = ">=0.8.1,<1.2", optional = true, markers = "extra == \"duckdb\""} google-cloud-bigquery = {version = ">=3,<4", optional = true, markers = "extra == \"bigquery\""} google-cloud-bigquery-storage = {version = ">=2,<3", optional = true, markers = "extra == \"bigquery\""} -numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"duckdb\" or extra == \"oracle\" or extra == \"polars\" or extra == \"pyspark\""} -pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == 
\"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +numpy = {version = ">=1.23.2,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +packaging = {version = ">=21.3,<25", optional = true, markers = "extra == \"dask\" or extra == \"duckdb\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"pyspark\""} +pandas = {version = ">=1.5.3,<3", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} parsy = ">=2,<3" psycopg2 = {version = ">=2.8.4,<3", optional = true, markers = "extra == \"postgres\" or extra == \"risingwave\""} -pyarrow = {version = ">=10.0.1,<19", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == 
\"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} -pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow = {version = ">=10.0.1,<18", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +pyarrow-hotfix = {version = ">=0.4,<1", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} pydata-google-auth = {version = ">=1.4.0,<2", optional = true, markers = "extra == \"bigquery\""} pyodbc = {version = ">=4.0.39,<6", optional = true, markers = "extra == \"mssql\""} python-dateutil = ">=2.8.2,<3" pytz = ">=2022.7" -rich = {version = ">=12.4.4,<14", optional = true, 
markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"databricks\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} +rich = {version = ">=12.4.4,<14", optional = true, markers = "extra == \"bigquery\" or extra == \"clickhouse\" or extra == \"dask\" or extra == \"datafusion\" or extra == \"druid\" or extra == \"duckdb\" or extra == \"exasol\" or extra == \"flink\" or extra == \"impala\" or extra == \"mssql\" or extra == \"mysql\" or extra == \"oracle\" or extra == \"pandas\" or extra == \"polars\" or extra == \"postgres\" or extra == \"pyspark\" or extra == \"snowflake\" or extra == \"sqlite\" or extra == \"risingwave\" or extra == \"trino\""} snowflake-connector-python = {version = ">=3.0.2,<3.3.0b1 || >3.3.0b1,<4", optional = true, markers = "extra == \"snowflake\""} -sqlglot = ">=23.4,<25.30" -toolz = ">=0.11,<2" +sqlglot = ">=23.4,<25.21" +toolz = ">=0.11,<1" typing-extensions = ">=4.3.0,<5" [package.extras] -bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] -clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -databricks = ["databricks-sql-connector-core (>=4,<5)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -datafusion = ["datafusion (>=0.6,<43)", "numpy (>=1.23.2,<3)", "pandas 
(>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +bigquery = ["db-dtypes (>=0.3,<2)", "google-cloud-bigquery (>=3,<4)", "google-cloud-bigquery-storage (>=2,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydata-google-auth (>=1.4.0,<2)", "rich (>=12.4.4,<14)"] +clickhouse = ["clickhouse-connect[arrow,numpy,pandas] (>=0.5.23,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +dask = ["dask[array,dataframe] (>=2022.9.1,<2024.3.0)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +datafusion = ["datafusion (>=0.6,<41)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] decompiler = ["black (>=22.1.0,<25)"] deltalake = ["deltalake (>=0.9.0,<1)"] -druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] -duckdb = ["duckdb (>=0.10,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +druid = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pydruid (>=0.6.7,<1)", "rich (>=12.4.4,<14)"] +duckdb = ["duckdb (>=0.8.1,<1.2)", "numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] examples = ["pins[gcs] (>=0.8.3,<1)"] -exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] -flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow 
(>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +exasol = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyexasol[pandas] (>=0.25.2,<1)", "rich (>=12.4.4,<14)"] +flink = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] geospatial = ["geoarrow-types (>=0.2,<1)", "geopandas (>=0.6,<2)", "pyproj (>=3.3.0,<4)", "shapely (>=2,<3)"] -impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] -mysql = ["mysqlclient (>=2.2.4,<3)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] -risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] -snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", 
"snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] -sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] -trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<19)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "trino (>=0.321,<1)"] +impala = ["impyla (>=0.17,<1)", "numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +mssql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyodbc (>=4.0.39,<6)", "rich (>=12.4.4,<14)"] +mysql = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pymysql (>=1,<2)", "rich (>=12.4.4,<14)"] +oracle = ["numpy (>=1.23.2,<3)", "oracledb (>=1.3.1,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pandas = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +polars = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "polars (>=1,<2)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +postgres = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +pyspark = ["numpy (>=1.23.2,<3)", "packaging (>=21.3,<25)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "pyspark (>=3.3.3,<4)", "rich (>=12.4.4,<14)"] +risingwave = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "psycopg2 (>=2.8.4,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)"] +snowflake = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix 
(>=0.4,<1)", "rich (>=12.4.4,<14)", "snowflake-connector-python (>=3.0.2,!=3.3.0b1,<4)"] +sqlite = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "regex (>=2021.7.6)", "rich (>=12.4.4,<14)"] +trino = ["numpy (>=1.23.2,<3)", "pandas (>=1.5.3,<3)", "pyarrow (>=10.0.1,<18)", "pyarrow-hotfix (>=0.4,<1)", "rich (>=12.4.4,<14)", "trino (>=0.321,<1)"] visualization = ["graphviz (>=0.16,<1)"] [[package]] @@ -5212,7 +5213,7 @@ source = ["Cython (>=0.29.35)"] name = "lz4" version = "4.3.3" description = "LZ4 Bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "lz4-4.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b891880c187e96339474af2a3b2bfb11a8e4732ff5034be919aa9029484cd201"}, @@ -6645,7 +6646,7 @@ future = "*" name = "parsy" version = "2.1" description = "Easy-to-use parser combinators, for parsing in pure Python" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "parsy-2.1-py3-none-any.whl", hash = "sha256:8f18e7b11985e7802e7e3ecbd8291c6ca243d29820b1186e4c84605db4efffa0"}, @@ -7080,7 +7081,7 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] name = "psycopg2" version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "psycopg2-2.9.10-cp310-cp310-win32.whl", hash = "sha256:5df2b672140f95adb453af93a7d669d7a7bf0a56bcd26f1502329166f4a61716"}, @@ -7243,7 +7244,7 @@ test = ["cffi", "hypothesis", "pandas", "pytest", "pytz"] name = "pyarrow-hotfix" version = "0.6" description = "" -optional = true +optional = false python-versions = ">=3.5" files = [ {file = "pyarrow_hotfix-0.6-py3-none-any.whl", hash = "sha256:dcc9ae2d220dff0083be6a9aa8e0cdee5182ad358d4931fce825c545e5c89178"}, @@ -7456,7 +7457,7 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" name = "pydata-google-auth" version = "1.9.0" description = "PyData helpers for 
authenticating to Google APIs" -optional = true +optional = false python-versions = ">=3.9" files = [ {file = "pydata-google-auth-1.9.0.tar.gz", hash = "sha256:2f546e88f007dfdb050087556eb46d6008e351386a7b368096797fae5df374f2"}, @@ -9265,13 +9266,13 @@ typing-extensions = "*" [[package]] name = "sqlglot" -version = "25.24.5" +version = "25.20.2" description = "An easily customizable SQL parser and transpiler" optional = false python-versions = ">=3.7" files = [ - {file = "sqlglot-25.24.5-py3-none-any.whl", hash = "sha256:f8a8870d1f5cdd2e2dc5c39a5030a0c7b0a91264fb8972caead3dac8e8438873"}, - {file = "sqlglot-25.24.5.tar.gz", hash = "sha256:6d3d604034301ca3b614d6b4148646b4033317b7a93d1801e9661495eb4b4fcf"}, + {file = "sqlglot-25.20.2-py3-none-any.whl", hash = "sha256:cdbfd7ce3f2f39f32bd7b4c23fd9e0fd261636a6b14285b914e8def25fd0a567"}, + {file = "sqlglot-25.20.2.tar.gz", hash = "sha256:169fe8308dd70d7bd40117b2221b62bdc7c4e2ea8eb07394b2a6146cdedf05ab"}, ] [package.extras] @@ -9648,13 +9649,13 @@ files = [ [[package]] name = "toolz" -version = "1.0.0" +version = "0.12.1" description = "List processing tools and functional utilities" -optional = true -python-versions = ">=3.8" +optional = false +python-versions = ">=3.7" files = [ - {file = "toolz-1.0.0-py3-none-any.whl", hash = "sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236"}, - {file = "toolz-1.0.0.tar.gz", hash = "sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02"}, + {file = "toolz-0.12.1-py3-none-any.whl", hash = "sha256:d22731364c07d72eea0a0ad45bafb2c2937ab6fd38a3507bf55eae8744aa7d85"}, + {file = "toolz-0.12.1.tar.gz", hash = "sha256:ecca342664893f177a13dac0e6b41cbd8ac25a358e5f215316d43e2100224f4d"}, ] [[package]] @@ -10529,7 +10530,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p name = "zstandard" version = "0.22.0" description = "Zstandard bindings for Python" -optional = true +optional = false python-versions = ">=3.8" files = [ 
{file = "zstandard-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:275df437ab03f8c033b8a2c181e51716c32d831082d93ce48002a5227ec93019"}, @@ -10618,4 +10619,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "1bf3deccd929c083b880c1a82be0983430ab49f7ade247b1c5573bb8c70d9ff5" +content-hash = "a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" diff --git a/pyproject.toml b/pyproject.toml index f736fc65ad..0fb7f94e36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -167,7 +167,6 @@ pytest-mock = "^3.14.0" types-regex = "^2024.5.15.20240519" flake8-print = "^5.0.0" mimesis = "^7.0.0" -ibis-framework = { version = ">=9.0.0", markers = "python_version >= '3.10'", optional = true, extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} shapely = ">=2.0.6" [tool.poetry.group.sources] @@ -205,6 +204,12 @@ optional = true [tool.poetry.group.airflow.dependencies] apache-airflow = {version = "^2.8.0", markers = "python_version < '3.12'"} +[tool.poetry.group.ibis] +optional = true + +[tool.poetry.group.ibis.dependencies] +ibis-framework = { version = ">=9.0.0,<10.0.0", markers = "python_version >= '3.10'", extras = ["duckdb", "postgres", "bigquery", "snowflake", "mssql", "clickhouse"]} + [tool.poetry.group.providers] optional = true diff --git a/tests/destinations/test_readable_dbapi_dataset.py b/tests/destinations/test_readable_dbapi_dataset.py index 4745735371..bc58a18fa0 100644 --- a/tests/destinations/test_readable_dbapi_dataset.py +++ b/tests/destinations/test_readable_dbapi_dataset.py @@ -2,7 +2,7 @@ import dlt import pytest -from dlt.destinations.dataset import ( +from dlt.destinations.dataset.exceptions import ( ReadableRelationHasQueryException, ReadableRelationUnknownColumnException, ) @@ -12,44 +12,44 @@ def test_query_builder() -> None: dataset = dlt.pipeline(destination="duckdb", pipeline_name="pipeline")._dataset() # default query for a table - 
assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # type: ignore[attr-defined] + assert dataset.my_table.query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table"' # head query assert ( - dataset.my_table.head().query.strip() # type: ignore[attr-defined] + dataset.my_table.head().query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 5' ) # limit query assert ( - dataset.my_table.limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.limit(24).query.strip() == 'SELECT * FROM "pipeline_dataset"."my_table" LIMIT 24' ) # select columns assert ( - dataset.my_table.select("col1", "col2").query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # also indexer notation assert ( - dataset.my_table[["col1", "col2"]].query.strip() # type: ignore[attr-defined] + dataset.my_table[["col1", "col2"]].query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table"' ) # identifiers are normalized assert ( - dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY_TABLE"].select("CoL1", "cOl2").query.strip() == 'SELECT "co_l1","c_ol2" FROM "pipeline_dataset"."my_table"' ) assert ( - dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() # type: ignore[attr-defined] + dataset["MY__TABLE"].select("Co__L1", "cOl2").query.strip() == 'SELECT "co__l1","c_ol2" FROM "pipeline_dataset"."my__table"' ) # limit and select chained assert ( - dataset.my_table.select("col1", "col2").limit(24).query.strip() # type: ignore[attr-defined] + dataset.my_table.select("col1", "col2").limit(24).query.strip() == 'SELECT "col1","col2" FROM "pipeline_dataset"."my_table" LIMIT 24' ) @@ -65,18 +65,18 @@ def test_copy_and_chaining() -> None: relation2 = relation.__copy__() assert relation != relation2 - assert relation._limit == relation2._limit # type: ignore[attr-defined] - assert 
relation._table_name == relation2._table_name # type: ignore[attr-defined] - assert relation._provided_query == relation2._provided_query # type: ignore[attr-defined] - assert relation._selected_columns == relation2._selected_columns # type: ignore[attr-defined] + assert relation._limit == relation2._limit + assert relation._table_name == relation2._table_name + assert relation._provided_query == relation2._provided_query + assert relation._selected_columns == relation2._selected_columns # test copy while chaining limit relation3 = relation2.limit(22) assert relation2 != relation3 - assert relation2._limit != relation3._limit # type: ignore[attr-defined] + assert relation2._limit != relation3._limit # test last setting prevails chaining - assert relation.limit(23).limit(67).limit(11)._limit == 11 # type: ignore[attr-defined] + assert relation.limit(23).limit(67).limit(11)._limit == 11 def test_computed_schema_columns() -> None: diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index 98642bb263..a7aa4d36e4 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ -283,8 +283,8 @@ def test_duckdb_credentials_separation( print(p1_dataset.p1_data.fetchall()) print(p2_dataset.p2_data.fetchall()) - assert "p1" in p1_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] - assert "p2" in p2_dataset.sql_client.credentials._conn_str() # type: ignore[attr-defined] + assert "p1" in p1_dataset.sql_client.credentials._conn_str() + assert "p2" in p2_dataset.sql_client.credentials._conn_str() - assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 # type: ignore[attr-defined] - assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 # type: ignore[attr-defined] + assert p1_dataset.sql_client.credentials.bound_to_pipeline is p1 + assert p2_dataset.sql_client.credentials.bound_to_pipeline is p2 diff --git a/tests/load/test_read_interfaces.py b/tests/load/test_read_interfaces.py 
index 1a9c8a383b..d2f5f7951e 100644 --- a/tests/load/test_read_interfaces.py +++ b/tests/load/test_read_interfaces.py @@ -1,5 +1,5 @@ -from typing import Any, cast - +from typing import Any, cast, Tuple, List +import re import pytest import dlt import os @@ -20,8 +20,10 @@ ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT, clean_test_storage -from dlt.common.destination.reference import TDestinationReferenceArg -from dlt.destinations.dataset import ReadableDBAPIDataset, ReadableRelationUnknownColumnException +from dlt.destinations.dataset.dataset import ReadableDBAPIDataset +from dlt.destinations.dataset.exceptions import ( + ReadableRelationUnknownColumnException, +) from tests.load.utils import drop_pipeline_data EXPECTED_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] @@ -58,6 +60,7 @@ def autouse_test_storage() -> FileStorage: @pytest.fixture(scope="session") def populated_pipeline(request, autouse_test_storage) -> Any: """fixture that returns a pipeline object populated with the example data""" + destination_config = cast(DestinationTestConfiguration, request.param) if ( @@ -104,6 +107,7 @@ def items(): columns={ "id": {"data_type": "bigint"}, "double_id": {"data_type": "bigint"}, + "di_decimal": {"data_type": "decimal", "precision": 7, "scale": 3}, }, ) def double_items(): @@ -111,6 +115,7 @@ def double_items(): { "id": i, "double_id": i * 2, + "di_decimal": Decimal("10.433"), } for i in range(total_records) ] @@ -151,6 +156,24 @@ def double_items(): ) +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_explicit_dataset_type_selection(populated_pipeline: Pipeline): + from dlt.destinations.dataset.dataset import ReadableDBAPIRelation + from dlt.destinations.dataset.ibis_relation import ReadableIbisRelation + + assert isinstance( + populated_pipeline._dataset(dataset_type="default").items, 
ReadableDBAPIRelation + ) + assert isinstance(populated_pipeline._dataset(dataset_type="ibis").items, ReadableIbisRelation) + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -258,71 +281,6 @@ def test_db_cursor_access(populated_pipeline: Pipeline) -> None: assert set(ids) == set(range(total_records)) -@pytest.mark.no_load -@pytest.mark.essential -@pytest.mark.parametrize( - "populated_pipeline", - configs, - indirect=True, - ids=lambda x: x.name, -) -def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: - # NOTE: we could generalize this with a context for certain deps - import subprocess - - subprocess.check_call( - ["pip", "install", "ibis-framework[duckdb,postgres,bigquery,snowflake,mssql,clickhouse]"] - ) - - from dlt.common.libs.ibis import SUPPORTED_DESTINATIONS - - # check correct error if not supported - if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: - with pytest.raises(NotImplementedError): - populated_pipeline._dataset().ibis() - return - - total_records = _total_records(populated_pipeline) - ibis_connection = populated_pipeline._dataset().ibis() - - map_i = lambda x: x - if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": - map_i = lambda x: x.upper() - - dataset_name = map_i(populated_pipeline.dataset_name) - table_like_statement = None - table_name_prefix = "" - addtional_tables = [] - - # clickhouse has no datasets, but table prefixes and a sentinel table - if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": - table_like_statement = dataset_name + "." 
- table_name_prefix = dataset_name + "___" - dataset_name = None - addtional_tables = ["dlt_sentinel_table"] - - add_table_prefix = lambda x: table_name_prefix + x - - # just do a basic check to see wether ibis can connect - assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { - add_table_prefix(map_i(x)) - for x in ( - [ - "_dlt_loads", - "_dlt_pipeline_state", - "_dlt_version", - "double_items", - "items", - "items__children", - ] - + addtional_tables - ) - } - - items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) - assert items_table.count().to_pandas() == total_records - - @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -332,7 +290,8 @@ def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_hint_preservation(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items + # NOTE: for now hints are only preserved for the default dataset + table_relationship = populated_pipeline._dataset(dataset_type="default").items # check that hints are carried over to arrow table expected_decimal_precision = 10 expected_decimal_precision_2 = 12 @@ -425,8 +384,7 @@ def test_limit_and_head(populated_pipeline: Pipeline) -> None: ids=lambda x: x.name, ) def test_column_selection(populated_pipeline: Pipeline) -> None: - table_relationship = populated_pipeline._dataset().items - + table_relationship = populated_pipeline._dataset(dataset_type="default").items columns = ["_dlt_load_id", "other_decimal"] data_frame = table_relationship.select(*columns).head().df() assert [v.lower() for v in data_frame.columns.values] == columns @@ -479,6 +437,266 @@ def test_schema_arg(populated_pipeline: Pipeline) -> None: assert "items" in dataset.schema.tables +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) 
+def test_ibis_expression_relation(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + import ibis # type: ignore + + # now we should get the more powerful ibis relation + dataset = populated_pipeline._dataset() + total_records = _total_records(populated_pipeline) + + items_table = dataset["items"] + double_items_table = dataset["double_items"] + + # check full table access + df = items_table.df() + assert len(df.index) == total_records + + df = double_items_table.df() + assert len(df.index) == total_records + + # check limit + df = items_table.limit(5).df() + assert len(df.index) == 5 + + # check chained expression with join, column selection, order by and limit + joined_table = ( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + .order_by("id") + .limit(20) + ) + table = joined_table.fetchall() + assert len(table) == 20 + assert list(table[0]) == [0, 0] + assert list(table[5]) == [5, 10] + assert list(table[10]) == [10, 20] + + # check aggregate of first 20 items + agg_table = items_table.order_by("id").limit(20).aggregate(sum_id=items_table.id.sum()) + assert agg_table.fetchone()[0] == reduce(lambda a, b: a + b, range(20)) + + # check filtering + filtered_table = items_table.filter(items_table.id < 10) + assert len(filtered_table.fetchall()) == 10 + + if populated_pipeline.destination.destination_type != "dlt.destinations.duckdb": + return + + # we check a bunch of expressions without executing them to see that they produce correct sql + # also we return the keys of the disovered schema columns + def sql_from_expr(expr: Any) -> Tuple[str, List[str]]: + query = str(expr.query).replace(populated_pipeline.dataset_name, "dataset") + columns = list(expr.columns_schema.keys()) if expr.columns_schema else None + return re.sub(r"\s+", " ", query), columns + + # test all functions discussed here: https://ibis-project.org/tutorials/ibis-for-sql-users + 
ALL_COLUMNS = ["id", "decimal", "other_decimal", "_dlt_load_id", "_dlt_id"] + + # selecting two columns + assert sql_from_expr(items_table.select("id", "decimal")) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # selecting all columns + assert sql_from_expr(items_table) == ('SELECT * FROM "dataset"."items"', ALL_COLUMNS) + + # selecting two other columns via item getter + assert sql_from_expr(items_table["id", "decimal"]) == ( + 'SELECT "t0"."id", "t0"."decimal" FROM "dataset"."items" AS "t0"', + ["id", "decimal"], + ) + + # adding a new columns + new_col = (items_table.id * 2).name("new_col") + assert sql_from_expr(items_table.select("id", "decimal", new_col)) == ( + ( + 'SELECT "t0"."id", "t0"."decimal", "t0"."id" * 2 AS "new_col" FROM' + ' "dataset"."items" AS "t0"' + ), + None, + ) + + # mutating table (add a new column computed from existing columns) + assert sql_from_expr( + items_table.mutate(double_id=items_table.id * 2).select("id", "double_id") + ) == ( + 'SELECT "t0"."id", "t0"."id" * 2 AS "double_id" FROM "dataset"."items" AS "t0"', + None, + ) + + # mutating table add new static column + assert sql_from_expr( + items_table.mutate(new_col=ibis.literal("static_value")).select("id", "new_col") + ) == ('SELECT "t0"."id", \'static_value\' AS "new_col" FROM "dataset"."items" AS "t0"', None) + + # check filtering (preserves all columns) + assert sql_from_expr(items_table.filter(items_table.id < 10)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ALL_COLUMNS, + ) + + # filtering and selecting a single column + assert sql_from_expr(items_table.filter(items_table.id < 10).select("id")) == ( + 'SELECT "t0"."id" FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10', + ["id"], + ) + + # check filter "and" condition + assert sql_from_expr(items_table.filter(items_table.id < 10).filter(items_table.id > 5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."id" < 10 AND "t0"."id" 
> 5', + ALL_COLUMNS, + ) + + # check filter "or" condition + assert sql_from_expr(items_table.filter((items_table.id < 10) | (items_table.id > 5))) == ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE ( "t0"."id" < 10 ) OR ( "t0"."id" > 5 )', + ALL_COLUMNS, + ) + + # check group by and aggregate + assert sql_from_expr( + items_table.group_by("id") + .having(items_table.count() >= 1000) + .aggregate(sum_id=items_table.id.sum()) + ) == ( + ( + 'SELECT "t1"."id", "t1"."sum_id" FROM ( SELECT "t0"."id", SUM("t0"."id") AS "sum_id",' + ' COUNT(*) AS "CountStar(items)" FROM "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1"' + ' WHERE "t1"."CountStar(items)" >= 1000' + ), + None, + ) + + # sorting and ordering + assert sql_from_expr(items_table.order_by("id", "decimal").limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # sort desc and asc + assert sql_from_expr(items_table.order_by(ibis.desc("id"), ibis.asc("decimal")).limit(10)) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" DESC, "t0"."decimal" ASC' + " LIMIT 10" + ), + ALL_COLUMNS, + ) + + # offset and limit + assert sql_from_expr(items_table.order_by("id").limit(10, offset=5)) == ( + 'SELECT * FROM "dataset"."items" AS "t0" ORDER BY "t0"."id" ASC LIMIT 10 OFFSET 5', + ALL_COLUMNS, + ) + + # join + assert sql_from_expr( + items_table.join(double_items_table, items_table.id == double_items_table.id)[ + ["id", "double_id"] + ] + ) == ( + ( + 'SELECT "t2"."id", "t3"."double_id" FROM "dataset"."items" AS "t2" INNER JOIN' + ' "dataset"."double_items" AS "t3" ON "t2"."id" = "t3"."id"' + ), + None, + ) + + # subqueries + assert sql_from_expr( + items_table.filter(items_table.decimal.isin(double_items_table.di_decimal)) + ) == ( + ( + 'SELECT * FROM "dataset"."items" AS "t0" WHERE "t0"."decimal" IN ( SELECT' + ' "t1"."di_decimal" FROM "dataset"."double_items" AS "t1" )' + ), + ALL_COLUMNS, + ) + + # topk + 
assert sql_from_expr(items_table.decimal.topk(10)) == ( + ( + 'SELECT * FROM ( SELECT "t0"."decimal", COUNT(*) AS "CountStar(items)" FROM' + ' "dataset"."items" AS "t0" GROUP BY 1 ) AS "t1" ORDER BY "t1"."CountStar(items)" DESC' + " LIMIT 10" + ), + None, + ) + + +@pytest.mark.no_load +@pytest.mark.essential +@pytest.mark.parametrize( + "populated_pipeline", + configs, + indirect=True, + ids=lambda x: x.name, +) +def test_ibis_dataset_access(populated_pipeline: Pipeline) -> None: + # NOTE: we could generalize this with a context for certain deps + + from dlt.helpers.ibis import SUPPORTED_DESTINATIONS + + # check correct error if not supported + if populated_pipeline.destination.destination_type not in SUPPORTED_DESTINATIONS: + with pytest.raises(NotImplementedError): + populated_pipeline._dataset().ibis() + return + + total_records = _total_records(populated_pipeline) + ibis_connection = populated_pipeline._dataset().ibis() + + map_i = lambda x: x + if populated_pipeline.destination.destination_type == "dlt.destinations.snowflake": + map_i = lambda x: x.upper() + + dataset_name = map_i(populated_pipeline.dataset_name) + table_like_statement = None + table_name_prefix = "" + addtional_tables = [] + + # clickhouse has no datasets, but table prefixes and a sentinel table + if populated_pipeline.destination.destination_type == "dlt.destinations.clickhouse": + table_like_statement = dataset_name + "." 
+ table_name_prefix = dataset_name + "___" + dataset_name = None + addtional_tables = ["dlt_sentinel_table"] + + add_table_prefix = lambda x: table_name_prefix + x + + # just do a basic check to see wether ibis can connect + assert set(ibis_connection.list_tables(database=dataset_name, like=table_like_statement)) == { + add_table_prefix(map_i(x)) + for x in ( + [ + "_dlt_loads", + "_dlt_pipeline_state", + "_dlt_version", + "double_items", + "items", + "items__children", + ] + + addtional_tables + ) + } + + items_table = ibis_connection.table(add_table_prefix(map_i("items")), database=dataset_name) + assert items_table.count().to_pandas() == total_records + + @pytest.mark.no_load @pytest.mark.essential @pytest.mark.parametrize( @@ -546,6 +764,7 @@ def test_standalone_dataset(populated_pipeline: Pipeline) -> None: assert dataset.schema.name == "unknown_dataset" assert "items" not in dataset.schema.tables + # NOTE: this breaks the following test, it will need to be fixed somehow # create a newer schema with different name and see wether this is loaded from dlt.common.schema import Schema from dlt.common.schema import utils From 4e5a2405e23c7dfae89903327569ae31fb535d4b Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink <47451109+jorritsandbrink@users.noreply.github.com> Date: Wed, 11 Dec 2024 12:35:59 +0400 Subject: [PATCH 5/5] `iceberg` table format support for `filesystem` destination (#2067) * add pyiceberg dependency and upgrade mypy - mypy upgrade needed to solve this issue: https://github.com/apache/iceberg-python/issues/768 - uses <1.13.0 requirement on mypy because 1.13.0 gives error - new lint errors arising due to version upgrade are simply ignored * extend pyiceberg dependencies * remove redundant delta annotation * add basic local filesystem iceberg support * add active table format setting * disable merge tests for iceberg table format * restore non-redundant extra info * refactor to in-memory iceberg catalog * add s3 support for iceberg table format * add 
schema evolution support for iceberg table format * extract _register_table function * add partition support for iceberg table format * update docstring * enable child table test for iceberg table format * enable empty source test for iceberg table format * make iceberg catalog namespace configurable and default to dataset name * add optional typing * fix typo * improve typing * extract logic into dedicated function * add iceberg read support to filesystem sql client * remove unused import * add todo * extract logic into separate functions * add azure support for iceberg table format * generalize delta table format tests * enable get tables function test for iceberg table format * remove ignores * undo table directory management change * enable test_read_interfaces tests for iceberg * fix active table format filter * use mixin for object store rs credentials * generalize catalog typing * extract pyiceberg scheme mapping into separate function * generalize credentials mixin test setup * remove unused import * add centralized fallback to append when merge is not supported * Revert "add centralized fallback to append when merge is not supported" This reverts commit 54cd0bcebffad15d522e734da321c602f4bd7461. * fall back to append if merge is not supported on filesystem * fix test for s3-compatible storage * remove obsolete code path * exclude gcs read interface tests for iceberg * add gcs support for iceberg table format * switch to UnsupportedAuthenticationMethodException * add iceberg table format docs * use shorter pipeline name to prevent too long sql identifiers * add iceberg catalog note to docs * black format * use shorter pipeline name to prevent too long sql identifiers * correct max id length for sqlalchemy mysql dialect * Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit 6cce03b77111825b0714597e6d494df97145f0f2. 
* Revert "use shorter pipeline name to prevent too long sql identifiers" This reverts commit ef29aa7c2fdba79441573850c7d15b83526c011a. * replace show with execute to prevent useless print output * add abfss scheme to test * remove az support for iceberg table format * remove iceberg bucket test exclusion * add note to docs on azure scheme support for iceberg table format * exclude iceberg from duckdb s3-compatibility test * disable pyiceberg info logs for tests * extend table format docs and move into own page * upgrade adlfs to enable account_host attribute * Merge branch 'devel' of https://github.com/dlt-hub/dlt into feat/1996-iceberg-filesystem * fix lint errors * re-add pyiceberg dependency * enabled iceberg in dbt-duckdb * upgrade pyiceberg version * remove pyiceberg mypy errors across python version * does not install airflow group for dev * fixes gcp oauth iceberg credentials handling * fixes ca cert bundle duckdb azure on ci * allow for airflow dep to be present during type check --------- Co-authored-by: Marcin Rudolf --- .github/workflows/test_destinations.yml | 9 +- .github/workflows/test_local_destinations.yml | 5 +- Makefile | 2 +- dlt/cli/source_detection.py | 3 +- .../configuration/specs/aws_credentials.py | 15 +- .../configuration/specs/azure_credentials.py | 22 +- .../configuration/specs/base_configuration.py | 2 +- .../specs/config_providers_context.py | 7 +- dlt/common/configuration/specs/exceptions.py | 4 + .../configuration/specs/gcp_credentials.py | 36 +- dlt/common/configuration/specs/mixins.py | 24 ++ dlt/common/data_writers/buffered.py | 2 +- dlt/common/destination/utils.py | 2 +- dlt/common/libs/deltalake.py | 6 +- dlt/common/libs/pyiceberg.py | 192 +++++++++ dlt/common/logger.py | 2 +- dlt/common/metrics.py | 2 +- dlt/common/reflection/utils.py | 14 +- dlt/common/schema/schema.py | 2 +- dlt/common/typing.py | 2 +- dlt/destinations/impl/filesystem/factory.py | 4 +- .../impl/filesystem/filesystem.py | 86 +++- 
.../impl/filesystem/sql_client.py | 27 +- dlt/destinations/impl/sqlalchemy/factory.py | 3 + dlt/extract/incremental/lag.py | 2 +- dlt/helpers/airflow_helper.py | 4 +- dlt/helpers/dbt/profiles.yml | 1 + .../destinations/delta-iceberg.md | 168 ++++++++ .../dlt-ecosystem/destinations/filesystem.md | 113 +---- .../dlt-ecosystem/table-formats/iceberg.md | 2 +- .../dataset-access/ibis-backend.md | 3 +- docs/website/sidebars.js | 1 + mypy.ini | 6 + poetry.lock | 154 +++++-- pyproject.toml | 11 +- tests/conftest.py | 3 + tests/libs/test_csv_writer.py | 4 +- ...dentials.py => test_credentials_mixins.py} | 169 +++++--- tests/load/filesystem/test_sql_client.py | 18 +- .../load/pipeline/test_filesystem_pipeline.py | 393 +++++++++++------- .../sql_database/test_sql_database_source.py | 5 +- tests/load/utils.py | 33 +- tests/pipeline/utils.py | 13 + .../helpers/rest_client/test_client.py | 2 +- tests/utils.py | 7 + 45 files changed, 1163 insertions(+), 422 deletions(-) create mode 100644 dlt/common/configuration/specs/mixins.py create mode 100644 dlt/common/libs/pyiceberg.py create mode 100644 docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md rename tests/load/filesystem/{test_object_store_rs_credentials.py => test_credentials_mixins.py} (50%) diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index cfd0a3bd56..84a8f95d71 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -77,8 +77,13 @@ jobs: # key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-redshift - name: Install dependencies - # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E redshift -E postgis -E postgres -E gs -E 
s3 -E az -E parquet -E duckdb -E cli -E filesystem --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: enable certificates for azure and duckdb + run: sudo mkdir -p /etc/pki/tls/certs && sudo ln -s /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 6f44e5fd5a..706bae1b0c 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -95,7 +95,10 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake + run: poetry install --no-interaction -E postgres -E postgis -E duckdb -E parquet -E filesystem -E cli -E weaviate -E qdrant -E sftp --with sentry-sdk --with pipeline,ibis -E deltalake -E pyiceberg + + - name: Upgrade sqlalchemy + run: poetry run pip install sqlalchemy==2.0.18 # minimum version required by `pyiceberg` - name: Start SFTP server run: docker compose -f "tests/load/filesystem_sftp/docker-compose.yml" up -d diff --git a/Makefile b/Makefile index 2a7f6dac0a..0ca8a2e0c3 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk,airflow + poetry install --all-extras --with docs,providers,pipeline,sources,sentry-sdk lint: ./tools/check-package.sh diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index 
7067f8b896..0769605d01 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -29,8 +29,7 @@ def find_call_arguments_to_replace( if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): raise CliCommandInnerException( "init", - f"The pipeline script {init_script_name} must pass the {t_arg_name} as" - f" string to '{arg_name}' function in line {dn_node.lineno}", + f"The pipeline script {init_script_name} must pass the {t_arg_name} as string to '{arg_name}' function in line {dn_node.lineno}", # type: ignore[attr-defined] ) else: transformed_nodes.append((dn_node, ast.Constant(value=t_value, kind=None))) diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index 5f69be6a33..a75cd85225 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.configuration.specs.exceptions import ( InvalidBoto3Session, ObjectStoreRsCredentialsException, @@ -16,7 +17,9 @@ @configspec -class AwsCredentialsWithoutDefaults(CredentialsConfiguration): +class AwsCredentialsWithoutDefaults( + CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig +): # credentials without boto implementation aws_access_key_id: str = None aws_secret_access_key: TSecretStrValue = None @@ -77,6 +80,16 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: return creds + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "s3.access-key-id": self.aws_access_key_id, + "s3.secret-access-key": self.aws_secret_access_key, + "s3.session-token": self.aws_session_token, + "s3.region": self.region_name, + "s3.endpoint": self.endpoint_url, + "s3.connect-timeout": 300, + } + @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, 
CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index cf6ec493de..aabd0b471a 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -8,6 +8,7 @@ CredentialsWithDefault, configspec, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt import version from dlt.common.utils import without_none @@ -15,7 +16,7 @@ @configspec -class AzureCredentialsBase(CredentialsConfiguration): +class AzureCredentialsBase(CredentialsConfiguration, WithObjectStoreRsCredentials): azure_storage_account_name: str = None azure_account_host: Optional[str] = None """Alternative host when accessing blob storage endpoint ie. my_account.dfs.core.windows.net""" @@ -32,7 +33,7 @@ def to_object_store_rs_credentials(self) -> Dict[str, str]: @configspec -class AzureCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): """Credentials for Azure Blob Storage, compatible with adlfs""" azure_storage_account_key: Optional[TSecretStrValue] = None @@ -49,6 +50,13 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: account_host=self.azure_account_host, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.account-key": self.azure_storage_account_key, + "adlfs.sas-token": self.azure_storage_sas_token, + } + def create_sas_token(self) -> None: try: from azure.storage.blob import generate_account_sas, ResourceTypes @@ -72,7 +80,7 @@ def on_partial(self) -> None: @configspec -class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase): +class AzureServicePrincipalCredentialsWithoutDefaults(AzureCredentialsBase, WithPyicebergConfig): azure_tenant_id: str = None azure_client_id: str = None azure_client_secret: 
TSecretStrValue = None @@ -86,6 +94,14 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: client_secret=self.azure_client_secret, ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + return { + "adlfs.account-name": self.azure_storage_account_name, + "adlfs.tenant-id": self.azure_tenant_id, + "adlfs.client-id": self.azure_client_id, + "adlfs.client-secret": self.azure_client_secret, + } + @configspec class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 8d913d0542..41d1d7a0ca 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -359,7 +359,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: def get_resolvable_fields(cls) -> Dict[str, type]: """Returns a mapping of fields to their type hints. Dunders should not be resolved and are not returned""" return { - f.name: eval(f.type) if isinstance(f.type, str) else f.type # type: ignore[arg-type] + f.name: eval(f.type) if isinstance(f.type, str) else f.type for f in cls._get_resolvable_dataclass_fields() } diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py index 5d1a5b7f26..a244ab571f 100644 --- a/dlt/common/configuration/specs/config_providers_context.py +++ b/dlt/common/configuration/specs/config_providers_context.py @@ -1,5 +1,4 @@ import contextlib -import dataclasses import io from typing import ClassVar, List @@ -8,10 +7,6 @@ ConfigProvider, ContextProvider, ) -from dlt.common.configuration.specs.base_configuration import ( - ContainerInjectableContext, - NotResolved, -) from dlt.common.configuration.specs import ( GcpServiceAccountCredentials, BaseConfiguration, @@ -137,7 +132,7 @@ def _airflow_providers() -> List[ConfigProvider]: # check if we are in task context and provide more 
info from airflow.operators.python import get_current_context # noqa - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] # log outside of stderr/out redirect if secrets_toml_var is None: diff --git a/dlt/common/configuration/specs/exceptions.py b/dlt/common/configuration/specs/exceptions.py index 928e46a8a0..fe87ef24d7 100644 --- a/dlt/common/configuration/specs/exceptions.py +++ b/dlt/common/configuration/specs/exceptions.py @@ -72,3 +72,7 @@ def __init__(self, spec: Type[Any], native_value: Any): class ObjectStoreRsCredentialsException(ConfigurationException): pass + + +class UnsupportedAuthenticationMethodException(ConfigurationException): + pass diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 60ab1d4b56..17519b032a 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -11,7 +11,9 @@ InvalidGoogleServicesJson, NativeValueError, OAuth2ScopesRequired, + UnsupportedAuthenticationMethodException, ) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny, TSecretStrValue, StrAny from dlt.common.configuration.specs.base_configuration import ( @@ -23,7 +25,7 @@ @configspec -class GcpCredentials(CredentialsConfiguration): +class GcpCredentials(CredentialsConfiguration, WithObjectStoreRsCredentials, WithPyicebergConfig): token_uri: Final[str] = dataclasses.field( default="https://oauth2.googleapis.com/token", init=False, repr=False, compare=False ) @@ -126,6 +128,12 @@ def to_native_credentials(self) -> Any: else: return ServiceAccountCredentials.from_service_account_info(self) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + 
"Service Account authentication not supported with `iceberg` table format. Use OAuth" + " authentication instead." + ) + def __str__(self) -> str: return f"{self.client_email}@{self.project_id}" @@ -176,11 +184,19 @@ def to_native_representation(self) -> str: return json.dumps(self._info_dict()) def to_object_store_rs_credentials(self) -> Dict[str, str]: - raise NotImplementedError( - "`object_store` Rust crate does not support OAuth for GCP credentials. Reference:" - " https://docs.rs/object_store/latest/object_store/gcp." + raise UnsupportedAuthenticationMethodException( + "OAuth authentication not supported with `delta` table format. Use Service Account or" + " Application Default Credentials authentication instead." ) + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + self.auth() + return { + "gcs.project-id": self.project_id, + "gcs.oauth2.token": self.token, + "gcs.oauth2.token-expires-at": (pendulum.now().timestamp() + 60) * 1000, + } + def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None: if not self.refresh_token: self.add_scopes(scopes) @@ -313,6 +329,12 @@ def to_native_credentials(self) -> Any: else: return super().to_native_credentials() + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + raise UnsupportedAuthenticationMethodException( + "Application Default Credentials authentication not supported with `iceberg` table" + " format. Use OAuth authentication instead." 
+ ) + @configspec class GcpServiceAccountCredentials( @@ -334,3 +356,9 @@ def parse_native_representation(self, native_value: Any) -> None: except NativeValueError: pass GcpOAuthCredentialsWithoutDefaults.parse_native_representation(self, native_value) + + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + if self.has_default_credentials(): + return GcpDefaultCredentials.to_pyiceberg_fileio_config(self) + else: + return GcpOAuthCredentialsWithoutDefaults.to_pyiceberg_fileio_config(self) diff --git a/dlt/common/configuration/specs/mixins.py b/dlt/common/configuration/specs/mixins.py new file mode 100644 index 0000000000..2f843aee5b --- /dev/null +++ b/dlt/common/configuration/specs/mixins.py @@ -0,0 +1,24 @@ +from typing import Dict, Any +from abc import abstractmethod, ABC + + +class WithObjectStoreRsCredentials(ABC): + @abstractmethod + def to_object_store_rs_credentials(self) -> Dict[str, Any]: + """Returns credentials dictionary for object_store Rust crate. + + Can be used for libraries that build on top of the object_store crate, such as `deltalake`. + + https://docs.rs/object_store/latest/object_store/ + """ + pass + + +class WithPyicebergConfig(ABC): + @abstractmethod + def to_pyiceberg_fileio_config(self) -> Dict[str, Any]: + """Returns `pyiceberg` FileIO configuration dictionary. 
+ + https://py.iceberg.apache.org/configuration/#fileio + """ + pass diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index e2b6c9a442..6ef431a4d0 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -242,7 +242,7 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if self.writer_spec.is_binary_format: self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") # type: ignore + self._file = self.open(self._file_name, "wt", encoding="utf-8", newline="") self._writer = self.writer_cls(self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/destination/utils.py b/dlt/common/destination/utils.py index 0bad5b152e..c98344b687 100644 --- a/dlt/common/destination/utils.py +++ b/dlt/common/destination/utils.py @@ -38,7 +38,7 @@ def verify_schema_capabilities( exception_log: List[Exception] = [] # combined casing function case_identifier = lambda ident: capabilities.casefold_identifier( - (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) # type: ignore + (str if capabilities.has_case_sensitive_identifiers else str.casefold)(ident) ) table_name_lookup: DictStrStr = {} # name collision explanation diff --git a/dlt/common/libs/deltalake.py b/dlt/common/libs/deltalake.py index 4047bc3a1a..0f938e7102 100644 --- a/dlt/common/libs/deltalake.py +++ b/dlt/common/libs/deltalake.py @@ -10,6 +10,7 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.storages import FilesystemConfiguration from dlt.common.utils import assert_min_pkg_version +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials from dlt.destinations.impl.filesystem.filesystem import FilesystemClient try: @@ -191,10 +192,9 @@ def get_delta_tables( def 
_deltalake_storage_options(config: FilesystemConfiguration) -> Dict[str, str]: """Returns dict that can be passed as `storage_options` in `deltalake` library.""" - creds = {} # type: ignore + creds = {} extra_options = {} - # TODO: create a mixin with to_object_store_rs_credentials for a proper discovery - if hasattr(config.credentials, "to_object_store_rs_credentials"): + if isinstance(config.credentials, WithObjectStoreRsCredentials): creds = config.credentials.to_object_store_rs_credentials() if config.deltalake_storage_options is not None: extra_options = config.deltalake_storage_options diff --git a/dlt/common/libs/pyiceberg.py b/dlt/common/libs/pyiceberg.py new file mode 100644 index 0000000000..19ce9abbf2 --- /dev/null +++ b/dlt/common/libs/pyiceberg.py @@ -0,0 +1,192 @@ +from typing import Dict, Any, List, Optional + +from dlt import version, Pipeline +from dlt.common.libs.pyarrow import cast_arrow_schema_types +from dlt.common.schema.typing import TWriteDisposition +from dlt.common.utils import assert_min_pkg_version +from dlt.common.exceptions import MissingDependencyException +from dlt.common.storages.configuration import FileSystemCredentials +from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs.mixins import WithPyicebergConfig +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + +try: + from pyiceberg.table import Table as IcebergTable + from pyiceberg.catalog import MetastoreCatalog + import pyarrow as pa +except ModuleNotFoundError: + raise MissingDependencyException( + "dlt pyiceberg helpers", + [f"{version.DLT_PKG_NAME}[pyiceberg]"], + "Install `pyiceberg` so dlt can create Iceberg tables in the `filesystem` destination.", + ) + + +def ensure_iceberg_compatible_arrow_schema(schema: pa.Schema) -> pa.Schema: + ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP = { + pa.types.is_time: pa.string(), + pa.types.is_decimal256: pa.string(), # pyarrow does not allow downcasting to 
decimal128 + } + return cast_arrow_schema_types(schema, ARROW_TO_ICEBERG_COMPATIBLE_ARROW_TYPE_MAP) + + +def ensure_iceberg_compatible_arrow_data(data: pa.Table) -> pa.Table: + schema = ensure_iceberg_compatible_arrow_schema(data.schema) + return data.cast(schema) + + +def write_iceberg_table( + table: IcebergTable, + data: pa.Table, + write_disposition: TWriteDisposition, +) -> None: + if write_disposition == "append": + table.append(ensure_iceberg_compatible_arrow_data(data)) + elif write_disposition == "replace": + table.overwrite(ensure_iceberg_compatible_arrow_data(data)) + + +def get_sql_catalog(credentials: FileSystemCredentials) -> "SqlCatalog": # type: ignore[name-defined] # noqa: F821 + assert_min_pkg_version( + pkg_name="sqlalchemy", + version="2.0.18", + msg=( + "`sqlalchemy>=2.0.18` is needed for `iceberg` table format on `filesystem` destination." + ), + ) + + from pyiceberg.catalog.sql import SqlCatalog + + return SqlCatalog( + "default", + uri="sqlite:///:memory:", + **_get_fileio_config(credentials), + ) + + +def create_or_evolve_table( + catalog: MetastoreCatalog, + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + # add table to catalog + table_id = f"{namespace_name}.{table_name}" + table_path = f"{client.dataset_path}/{table_name}" + metadata_path = f"{table_path}/metadata" + if client.fs_client.exists(metadata_path): + # found metadata; register existing table + table = _register_table(table_id, metadata_path, catalog, client) + + # evolve schema + if schema is not None: + with table.update_schema() as update: + update.union_by_name(ensure_iceberg_compatible_arrow_schema(schema)) + else: + # found no metadata; create new table + assert schema is not None + with catalog.create_table_transaction( + table_id, + schema=ensure_iceberg_compatible_arrow_schema(schema), + location=_make_path(table_path, 
client), + ) as txn: + # add partitioning + with txn.update_spec() as update_spec: + for col in partition_columns: + update_spec.add_identity(col) + + return catalog + + +def get_catalog( + client: FilesystemClient, + table_name: str, + namespace_name: Optional[str] = None, + schema: Optional[pa.Schema] = None, + partition_columns: Optional[List[str]] = None, +) -> MetastoreCatalog: + """Returns single-table, ephemeral, in-memory Iceberg catalog.""" + + # create in-memory catalog + catalog: MetastoreCatalog = get_sql_catalog(client.config.credentials) + + # create namespace + if namespace_name is None: + namespace_name = client.dataset_name + catalog.create_namespace(namespace_name) + + # add table to catalog + catalog = create_or_evolve_table( + catalog=catalog, + client=client, + table_name=table_name, + namespace_name=namespace_name, + schema=schema, + partition_columns=partition_columns, + ) + + return catalog + + +def get_iceberg_tables( + pipeline: Pipeline, *tables: str, schema_name: Optional[str] = None +) -> Dict[str, IcebergTable]: + from dlt.common.schema.utils import get_table_format + + with pipeline.destination_client(schema_name=schema_name) as client: + assert isinstance( + client, FilesystemClient + ), "The `get_iceberg_tables` function requires a `filesystem` destination." 
+ + schema_iceberg_tables = [ + t["name"] + for t in client.schema.tables.values() + if get_table_format(client.schema.tables, t["name"]) == "iceberg" + ] + if len(tables) > 0: + invalid_tables = set(tables) - set(schema_iceberg_tables) + if len(invalid_tables) > 0: + available_schemas = "" + if len(pipeline.schema_names) > 1: + available_schemas = f" Available schemas are {pipeline.schema_names}" + raise ValueError( + f"Schema {client.schema.name} does not contain Iceberg tables with these names:" + f" {', '.join(invalid_tables)}.{available_schemas}" + ) + schema_iceberg_tables = [t for t in schema_iceberg_tables if t in tables] + + return { + name: get_catalog(client, name).load_table(f"{pipeline.dataset_name}.{name}") + for name in schema_iceberg_tables + } + + +def _get_fileio_config(credentials: CredentialsConfiguration) -> Dict[str, Any]: + if isinstance(credentials, WithPyicebergConfig): + return credentials.to_pyiceberg_fileio_config() + return {} + + +def _get_last_metadata_file(metadata_path: str, client: FilesystemClient) -> str: + # TODO: implement faster way to obtain `last_metadata_file` (listing is slow) + metadata_files = [f for f in client.fs_client.ls(metadata_path) if f.endswith(".json")] + return _make_path(sorted(metadata_files)[-1], client) + + +def _register_table( + identifier: str, + metadata_path: str, + catalog: MetastoreCatalog, + client: FilesystemClient, +) -> IcebergTable: + last_metadata_file = _get_last_metadata_file(metadata_path, client) + return catalog.register_table(identifier, last_metadata_file) + + +def _make_path(path: str, client: FilesystemClient) -> str: + # don't use file protocol for local files because duckdb does not support it + # https://github.com/duckdb/duckdb/issues/13669 + return path if client.is_local_filesystem else client.config.make_url(path) diff --git a/dlt/common/logger.py b/dlt/common/logger.py index b163c15672..634e305805 100644 --- a/dlt/common/logger.py +++ b/dlt/common/logger.py @@ -47,7 +47,7 @@ 
def is_logging() -> bool: def log_level() -> str: if not LOGGER: raise RuntimeError("Logger not initialized") - return logging.getLevelName(LOGGER.level) # type: ignore + return logging.getLevelName(LOGGER.level) def is_json_logging(log_format: str) -> bool: diff --git a/dlt/common/metrics.py b/dlt/common/metrics.py index d6acf19d0d..2f9f574dd0 100644 --- a/dlt/common/metrics.py +++ b/dlt/common/metrics.py @@ -9,7 +9,7 @@ class DataWriterMetrics(NamedTuple): created: float last_modified: float - def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: + def __add__(self, other: Tuple[object, ...], /) -> Tuple[object, ...]: # type: ignore[override] if isinstance(other, DataWriterMetrics): return DataWriterMetrics( self.file_path if self.file_path == other.file_path else "", diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index c612c5a4f1..27c7bd8758 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -90,24 +90,24 @@ def rewrite_python_script( last_line = -1 last_offset = -1 # sort transformed nodes by line and offset - for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): + for node, t_value in sorted(transformed_nodes, key=lambda n: (n[0].lineno, n[0].col_offset)): # type: ignore[attr-defined] # do we have a line changed - if last_line != node.lineno - 1: + if last_line != node.lineno - 1: # type: ignore[attr-defined] # add remainder from the previous line if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) + script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # type: ignore[attr-defined] # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) + 
script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) # type: ignore[attr-defined] elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # type: ignore[attr-defined] # replace node value script_lines.append(ast_unparse(t_value).strip()) - last_line = node.end_lineno - 1 - last_offset = node.end_col_offset + last_line = node.end_lineno - 1 # type: ignore[attr-defined] + last_offset = node.end_col_offset # type: ignore[attr-defined] # add all that was missing if last_offset >= 0: diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index d6031a08fa..276bbe9c09 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -525,7 +525,7 @@ def get_new_table_columns( Typically they come from the destination schema. Columns that are in `existing_columns` and not in `table_name` columns are ignored. 
Optionally includes incomplete columns (without data type)""" - casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str # type: ignore[assignment] + casefold_f: Callable[[str], str] = str.casefold if not case_sensitive else str casefold_existing = { casefold_f(col_name): col for col_name, col in existing_columns.items() } diff --git a/dlt/common/typing.py b/dlt/common/typing.py index a3364d1b07..8986d753f3 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -446,7 +446,7 @@ def get_generic_type_argument_from_instance( if cls_: orig_param_type = get_args(cls_)[0] if orig_param_type in (Any, CallableAny) and sample_value is not None: - orig_param_type = type(sample_value) + orig_param_type = type(sample_value) # type: ignore[assignment] return orig_param_type # type: ignore diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index 2463da58fa..906bd157e4 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -19,7 +19,7 @@ def filesystem_loader_file_format_selector( *, table_schema: TTableSchema, ) -> t.Tuple[TLoaderFileFormat, t.Sequence[TLoaderFileFormat]]: - if table_schema.get("table_format") == "delta": + if table_schema.get("table_format") in ("delta", "iceberg"): return ("parquet", ["parquet"]) return (preferred_loader_file_format, supported_loader_file_formats) @@ -43,7 +43,7 @@ def _raw_capabilities(self) -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext.generic_capabilities( preferred_loader_file_format="jsonl", loader_file_format_selector=filesystem_loader_file_format_selector, - supported_table_formats=["delta"], + supported_table_formats=["delta", "iceberg"], supported_merge_strategies=["upsert"], merge_strategies_selector=filesystem_merge_strategies_selector, ) diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index 
1739c87fb3..ccf764811b 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -119,16 +119,27 @@ def metrics(self) -> Optional[LoadJobMetrics]: return m._replace(remote_url=self.make_remote_url()) -class DeltaLoadFilesystemJob(FilesystemLoadJob): +class TableFormatLoadFilesystemJob(FilesystemLoadJob): def __init__(self, file_path: str) -> None: super().__init__(file_path=file_path) self.file_paths = ReferenceFollowupJobRequest.resolve_references(self._file_path) def make_remote_path(self) -> str: - # remote path is table dir - delta will create its file structure inside it return self._job_client.get_table_dir(self.load_table_name) + @property + def arrow_dataset(self) -> Any: + from dlt.common.libs.pyarrow import pyarrow + + return pyarrow.dataset.dataset(self.file_paths) + + @property + def _partition_columns(self) -> List[str]: + return get_columns_names_with_prop(self._load_table, "partition") + + +class DeltaLoadFilesystemJob(TableFormatLoadFilesystemJob): def run(self) -> None: # create Arrow dataset from Parquet files from dlt.common.libs.pyarrow import pyarrow as pa @@ -138,7 +149,7 @@ def run(self) -> None: f"Will copy file(s) {self.file_paths} to delta table {self.make_remote_url()} [arrow" f" buffer: {pa.total_allocated_bytes()}]" ) - source_ds = pa.dataset.dataset(self.file_paths) + source_ds = self.arrow_dataset delta_table = self._delta_table() # explicitly check if there is data @@ -148,9 +159,6 @@ def run(self) -> None: else: with source_ds.scanner().to_reader() as arrow_rbr: # RecordBatchReader if self._load_table["write_disposition"] == "merge" and delta_table is not None: - self._load_table["x-merge-strategy"] = resolve_merge_strategy( # type: ignore[typeddict-unknown-key] - self._schema.tables, self._load_table, self._job_client.capabilities - ) merge_delta_table( table=delta_table, data=arrow_rbr, @@ -188,10 +196,6 @@ def _delta_table(self) -> Optional["DeltaTable"]: # type: 
ignore[name-defined] else: return None - @property - def _partition_columns(self) -> List[str]: - return get_columns_names_with_prop(self._load_table, "partition") - def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "DeltaTable") -> "DeltaTable": # type: ignore[name-defined] # noqa: F821 from dlt.common.libs.deltalake import ( DeltaTable, @@ -211,13 +215,36 @@ def _create_or_evolve_delta_table(self, arrow_ds: "Dataset", delta_table: "Delta return _evolve_delta_table_schema(delta_table, arrow_ds.schema) +class IcebergLoadFilesystemJob(TableFormatLoadFilesystemJob): + def run(self) -> None: + from dlt.common.libs.pyiceberg import write_iceberg_table + + write_iceberg_table( + table=self._iceberg_table(), + data=self.arrow_dataset.to_table(), + write_disposition=self._load_table["write_disposition"], + ) + + def _iceberg_table(self) -> "pyiceberg.table.Table": # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyiceberg import get_catalog + + catalog = get_catalog( + client=self._job_client, + table_name=self.load_table_name, + schema=self.arrow_dataset.schema, + partition_columns=self._partition_columns, + ) + return catalog.load_table(self.table_identifier) + + @property + def table_identifier(self) -> str: + return f"{self._job_client.dataset_name}.{self.load_table_name}" + + class FilesystemLoadJobWithFollowup(HasFollowupJobs, FilesystemLoadJob): def create_followup_jobs(self, final_state: TLoadJobState) -> List[FollowupJobRequest]: jobs = super().create_followup_jobs(final_state) - if self._load_table.get("table_format") == "delta": - # delta table jobs only require table chain followup jobs - pass - elif final_state == "completed": + if final_state == "completed": ref_job = ReferenceFollowupJobRequest( original_file_name=self.file_name(), remote_paths=[self._job_client.make_remote_url(self.make_remote_path())], @@ -394,6 +421,13 @@ def prepare_load_table(self, table_name: str) -> PreparedTableSchema: if 
table["write_disposition"] == "merge": table["write_disposition"] = "append" table.pop("table_format", None) + merge_strategy = resolve_merge_strategy(self.schema.tables, table, self.capabilities) + if table["write_disposition"] == "merge": + if merge_strategy is None: + # no supported merge strategies, fall back to append + table["write_disposition"] = "append" + else: + table["x-merge-strategy"] = merge_strategy # type: ignore[typeddict-unknown-key] return table def get_table_dir(self, table_name: str, remote: bool = False) -> str: @@ -458,12 +492,20 @@ def create_load_job( # where we want to load the state the regular way if table["name"] == self.schema.state_table_name and not self.config.as_staging_destination: return FinalizedLoadJob(file_path) - if table.get("table_format") == "delta": - import dlt.common.libs.deltalake # assert dependencies are installed + table_format = table.get("table_format") + if table_format in ("delta", "iceberg"): # a reference job for a delta table indicates a table chain followup job if ReferenceFollowupJobRequest.is_reference_job(file_path): - return DeltaLoadFilesystemJob(file_path) + if table_format == "delta": + import dlt.common.libs.deltalake + + return DeltaLoadFilesystemJob(file_path) + elif table_format == "iceberg": + import dlt.common.libs.pyiceberg + + return IcebergLoadFilesystemJob(file_path) + # otherwise just continue return FinalizedLoadJobWithFollowupJobs(file_path) @@ -494,10 +536,10 @@ def should_load_data_to_staging_dataset(self, table_name: str) -> bool: def should_truncate_table_before_load(self, table_name: str) -> bool: table = self.prepare_load_table(table_name) - return ( - table["write_disposition"] == "replace" - and not table.get("table_format") == "delta" # Delta can do a logical replace - ) + return table["write_disposition"] == "replace" and not table.get("table_format") in ( + "delta", + "iceberg", + ) # Delta/Iceberg can do a logical replace # # state stuff @@ -718,7 +760,7 @@ def 
create_table_chain_completed_followup_jobs( jobs = super().create_table_chain_completed_followup_jobs( table_chain, completed_table_chain_jobs ) - if table_chain[0].get("table_format") == "delta": + if table_chain[0].get("table_format") in ("delta", "iceberg"): for table in table_chain: table_job_paths = [ job.file_path diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index d03a00b418..d39f4c3431 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -13,6 +13,7 @@ from dlt.common.destination.reference import DBApiCursor +from dlt.common.storages.fsspec_filesystem import AZURE_BLOB_STORAGE_PROTOCOLS from dlt.destinations.sql_client import raise_database_error from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient @@ -169,8 +170,9 @@ def create_authentication(self, persistent: bool = False, secret_name: str = Non # native google storage implementation is not supported.. elif self.fs_client.config.protocol in ["gs", "gcs"]: logger.warn( - "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer. Falling" - " back to fsspec." + "For gs/gcs access via duckdb please use the gs/gcs s3 compatibility layer if" + " possible (not supported when using `iceberg` table format). Falling back to" + " fsspec." 
) self._conn.register_filesystem(self.fs_client.fs_client) @@ -192,7 +194,7 @@ def open_connection(self) -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux # see duckdb docs - if self.fs_client.config.protocol in ["az", "abfss"]: + if self.fs_client.config.protocol in AZURE_BLOB_STORAGE_PROTOCOLS: self._conn.sql("SET azure_transport_option_type = 'curl';") return self._conn @@ -258,6 +260,13 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: from_statement = "" if schema_table.get("table_format") == "delta": from_statement = f"delta_scan('{resolved_folder}')" + elif schema_table.get("table_format") == "iceberg": + from dlt.common.libs.pyiceberg import _get_last_metadata_file + + self._setup_iceberg(self._conn) + metadata_path = f"{resolved_folder}/metadata" + last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client) + from_statement = f"iceberg_scan('{last_metadata_file}')" elif first_file_type == "parquet": from_statement = f"read_parquet([{resolved_files_string}])" elif first_file_type == "jsonl": @@ -267,7 +276,7 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: else: raise NotImplementedError( f"Unknown filetype {first_file_type} for table {table_name}. Currently only" - " jsonl and parquet files as well as delta tables are supported." + " jsonl and parquet files as well as delta and iceberg tables are supported." 
) # create table @@ -299,6 +308,16 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB with super().execute_query(query, *args, **kwargs) as cursor: yield cursor + @staticmethod + def _setup_iceberg(conn: duckdb.DuckDBPyConnection) -> None: + # needed to make persistent secrets work in new connection + # https://github.com/duckdb/duckdb_iceberg/issues/83 + conn.execute("FROM duckdb_secrets();") + + # `duckdb_iceberg` extension does not support autoloading + # https://github.com/duckdb/duckdb_iceberg/issues/71 + conn.execute("INSTALL iceberg; LOAD iceberg;") + def __del__(self) -> None: if self.memory_db: self.memory_db.close() diff --git a/dlt/destinations/impl/sqlalchemy/factory.py b/dlt/destinations/impl/sqlalchemy/factory.py index edd827ed00..e61ac1fb6a 100644 --- a/dlt/destinations/impl/sqlalchemy/factory.py +++ b/dlt/destinations/impl/sqlalchemy/factory.py @@ -81,6 +81,9 @@ def adjust_capabilities( caps.max_column_identifier_length = dialect.max_identifier_length caps.supports_native_boolean = dialect.supports_native_boolean if dialect.name == "mysql": + # correct max identifier length + # dialect uses 255 (max length for aliases) instead of 64 (max length of identifiers) + caps.max_identifier_length = 64 caps.format_datetime_literal = _format_mysql_datetime_literal return caps diff --git a/dlt/extract/incremental/lag.py b/dlt/extract/incremental/lag.py index ee102a9961..dfafa2cd11 100644 --- a/dlt/extract/incremental/lag.py +++ b/dlt/extract/incremental/lag.py @@ -20,7 +20,7 @@ def _apply_lag_to_value( parsed_value = ensure_pendulum_datetime(value) if is_str else value if isinstance(parsed_value, (datetime, date)): - parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) + parsed_value = _apply_lag_to_datetime(lag, parsed_value, last_value_func, is_str_date) # type: ignore[assignment] # go back to string or pass exact type value = parsed_value.strftime(value_format) if value_format else 
parsed_value # type: ignore[assignment] diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 99458a3949..aaa19ea97d 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -18,7 +18,7 @@ from airflow.configuration import conf from airflow.models import TaskInstance from airflow.utils.task_group import TaskGroup - from airflow.operators.dummy import DummyOperator # type: ignore + from airflow.operators.dummy import DummyOperator from airflow.operators.python import PythonOperator, get_current_context except ModuleNotFoundError: raise MissingDependencyException("Airflow", ["apache-airflow>=2.5"]) @@ -255,7 +255,7 @@ def _run( # use task logger if self.use_task_logger: - ti: TaskInstance = get_current_context()["ti"] # type: ignore + ti: TaskInstance = get_current_context()["ti"] # type: ignore[assignment,unused-ignore] logger.LOGGER = ti.log # set global number of buffered items diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index a2a0014e4e..fd114478fb 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -83,6 +83,7 @@ duckdb: extensions: - httpfs - parquet + - iceberg # TODO: emit the config of duck db motherduck: diff --git a/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md new file mode 100644 index 0000000000..7a056d6b40 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/delta-iceberg.md @@ -0,0 +1,168 @@ +--- +title: Delta / Iceberg +description: Delta / Iceberg `dlt` destination +keywords: [delta, iceberg, destination, data warehouse] +--- + +# Delta and Iceberg table formats +`dlt` supports writing [Delta](https://delta.io/) and [Iceberg](https://iceberg.apache.org/) tables when using the [filesystem](./filesystem.md) destination. 
+ +## How it works +`dlt` uses the [deltalake](https://pypi.org/project/deltalake/) and [pyiceberg](https://pypi.org/project/pyiceberg/) libraries to write Delta and Iceberg tables, respectively. One or multiple Parquet files are prepared during the extract and normalize steps. In the load step, these Parquet files are exposed as an Arrow data structure and fed into `deltalake` or `pyiceberg`. + +## Iceberg single-user ephemeral catalog +`dlt` uses single-table, ephemeral, in-memory, sqlite-based [Iceberg catalog](https://iceberg.apache.org/concepts/catalog/)s. These catalogs are created "on demand" when a pipeline is run, and do not persist afterwards. If a table already exists in the filesystem, it gets registered into the catalog using its latest metadata file. This allows for a serverless setup. It is currently not possible to connect your own Iceberg catalog. + +:::caution +While ephemeral catalogs make it easy to get started with Iceberg, they come with limitations: +- concurrent writes are not handled and may lead to corrupt table state +- we cannot guarantee that reads concurrent with writes are clean +- the latest manifest file needs to be searched for using file listing—this can become slow with large tables, especially in cloud object stores +::: + +## Delta dependencies + +You need the `deltalake` package to use this format: + +```sh +pip install "dlt[deltalake]" +``` + +You also need `pyarrow>=17.0.0`: + +```sh +pip install 'pyarrow>=17.0.0' +``` + +## Iceberg dependencies + +You need Python version 3.9 or higher and the `pyiceberg` package to use this format: + +```sh +pip install "dlt[pyiceberg]" +``` + +You also need `sqlalchemy>=2.0.18`: + +```sh +pip install 'sqlalchemy>=2.0.18' +``` + +## Set table format + +Set the `table_format` argument to `delta` or `iceberg` when defining your resource: + +```py +@dlt.resource(table_format="delta") +def my_delta_resource(): + ...
+``` + +or when calling `run` on your pipeline: + +```py +pipeline.run(my_resource, table_format="delta") +``` + +:::note +`dlt` always uses Parquet as `loader_file_format` when using the `delta` or `iceberg` table format. Any setting of `loader_file_format` is disregarded. +::: + + +## Table format partitioning +Both `delta` and `iceberg` tables can be partitioned by specifying one or more `partition` column hints. This example partitions a Delta table by the `foo` column: + +```py +@dlt.resource( + table_format="delta", + columns={"foo": {"partition": True}} +) +def my_delta_resource(): + ... +``` + +:::note +Delta uses [Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/), while Iceberg uses [hidden partitioning](https://iceberg.apache.org/docs/latest/partitioning/). +::: + +:::caution +Partition evolution (changing partition columns after a table has been created) is not supported. +::: + +## Table access helper functions +You can use the `get_delta_tables` and `get_iceberg_tables` helper functions to access native table objects. For `delta` these are `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects, for `iceberg` these are `pyiceberg` [Table](https://py.iceberg.apache.org/reference/pyiceberg/table/#pyiceberg.table.Table) objects. + +```py +from dlt.common.libs.deltalake import get_delta_tables +# from dlt.common.libs.pyiceberg import get_iceberg_tables + +... + +# get dictionary of DeltaTable objects +delta_tables = get_delta_tables(pipeline) + +# execute operations on DeltaTable objects +delta_tables["my_delta_table"].optimize.compact() +delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) +# delta_tables["my_delta_table"].vacuum() +# etc.
+``` + +## Table format Google Cloud Storage authentication + +Note that not all authentication methods are supported when using table formats on Google Cloud Storage: + +| Authentication method | `delta` | `iceberg` | +| -- | -- | -- | +| [Service Account](bigquery.md#setup-guide) | ✅ | ❌ | +| [OAuth](../destinations/bigquery.md#oauth-20-authentication) | ❌ | ✅ | +| [Application Default Credentials](bigquery.md#using-default-credentials) | ✅ | ❌ | + +:::note +The [S3-compatible](#using-s3-compatible-storage) interface for Google Cloud Storage is not supported when using `iceberg`. +::: + +## Iceberg Azure scheme +The `az` [scheme](#supported-schemes) is not supported when using the `iceberg` table format. Please use the `abfss` scheme. This is because `pyiceberg`, which `dlt` uses under the hood, currently does not support `az`. + +## Table format `merge` support (**experimental**) +The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported for `delta`. For `iceberg`, the `merge` write disposition is not supported and falls back to `append`. + +:::caution +The `upsert` merge strategy for the filesystem destination with Delta table format is **experimental**. +::: + +```py +@dlt.resource( + write_disposition={"disposition": "merge", "strategy": "upsert"}, + primary_key="my_primary_key", + table_format="delta" +) +def my_upsert_resource(): + ... +... +``` + +### Known limitations +- `hard_delete` hint not supported +- Deleting records from nested tables not supported + - This means updates to JSON columns that involve element removals are not propagated. For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table.
+ +## Delta table format storage options +You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: + +```toml +[destination.filesystem] +deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' +``` + +`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. + +You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. + +>❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. + +## Delta table format memory usage +:::caution +Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. +::: \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 9b243b9429..de3d12e8e1 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -108,7 +108,8 @@ You need to create an S3 bucket and a user who can access that bucket. 
dlt does #### Using S3 compatible storage -To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/) or [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: +To use an S3 compatible storage other than AWS S3, such as [MinIO](https://min.io/), [Cloudflare R2](https://www.cloudflare.com/en-ca/developer-platform/r2/) or [Google +Cloud Storage](https://cloud.google.com/storage/docs/interoperability), you may supply an `endpoint_url` in the config. This should be set along with AWS credentials: ```toml [destination.filesystem] @@ -166,6 +167,8 @@ Run `pip install "dlt[az]"` which will install the `adlfs` package to interface Edit the credentials in `.dlt/secrets.toml`, you'll see AWS credentials by default; replace them with your Azure credentials. +#### Supported schemes + `dlt` supports both forms of the blob storage urls: ```toml [destination.filesystem] @@ -404,29 +407,6 @@ The filesystem destination handles the write dispositions as follows: - `replace` - all files that belong to such tables are deleted from the dataset folder, and then the current set of files is added. - `merge` - falls back to `append` -### Merge with Delta table format (experimental) -The [`upsert`](../../general-usage/incremental-loading.md#upsert-strategy) merge strategy is supported when using the [Delta table format](#delta-table-format). - -:::caution -The `upsert` merge strategy for the filesystem destination with Delta table format is experimental. -::: - -```py -@dlt.resource( - write_disposition={"disposition": "merge", "strategy": "upsert"}, - primary_key="my_primary_key", - table_format="delta" -) -def my_upsert_resource(): - ... -... -``` - -#### Known limitations -- `hard_delete` hint not supported -- Deleting records from nested tables not supported - - This means updates to JSON columns that involve element removals are not propagated. 
For example, if you first load `{"key": 1, "nested": [1, 2]}` and then load `{"key": 1, "nested": [1]}`, then the record for element `2` will not be deleted from the nested table. - ## File compression The filesystem destination in the dlt library uses `gzip` compression by default for efficiency, which may result in the files being stored in a compressed format. This format may not be easily readable as plain text or JSON Lines (`jsonl`) files. If you encounter files that seem unreadable, they may be compressed. @@ -645,88 +625,9 @@ You can choose the following file formats: ## Supported table formats -You can choose the following table formats: -* [Delta table](../table-formats/delta.md) is supported - -### Delta table format - -You need the `deltalake` package to use this format: - -```sh -pip install "dlt[deltalake]" -``` - -You also need `pyarrow>=17.0.0`: - -```sh -pip install 'pyarrow>=17.0.0' -``` - -Set the `table_format` argument to `delta` when defining your resource: - -```py -@dlt.resource(table_format="delta") -def my_delta_resource(): - ... -``` - -:::note -`dlt` always uses Parquet as `loader_file_format` when using the `delta` table format. Any setting of `loader_file_format` is disregarded. -::: - -:::caution -Beware that when loading a large amount of data for one table, the underlying rust implementation will consume a lot of memory. This is a known issue and the maintainers are actively working on a solution. You can track the progress [here](https://github.com/delta-io/delta-rs/pull/2289). Until the issue is resolved, you can mitigate the memory consumption by doing multiple smaller incremental pipeline runs. -::: - -#### Delta table partitioning -A Delta table can be partitioned ([Hive-style partitioning](https://delta.io/blog/pros-cons-hive-style-partionining/)) by specifying one or more `partition` column hints. 
This example partitions the Delta table by the `foo` column: - -```py -@dlt.resource( - table_format="delta", - columns={"foo": {"partition": True}} -) -def my_delta_resource(): - ... -``` - -:::caution -It is **not** possible to change partition columns after the Delta table has been created. Trying to do so causes an error stating that the partition columns don't match. -::: - - -#### Storage options -You can pass storage options by configuring `destination.filesystem.deltalake_storage_options`: - -```toml -[destination.filesystem] -deltalake_storage_options = '{"AWS_S3_LOCKING_PROVIDER": "dynamodb", "DELTA_DYNAMO_TABLE_NAME": "custom_table_name"}' -``` - -`dlt` passes these options to the `storage_options` argument of the `write_deltalake` method in the `deltalake` library. Look at their [documentation](https://delta-io.github.io/delta-rs/api/delta_writer/#deltalake.write_deltalake) to see which options can be used. - -You don't need to specify credentials here. `dlt` merges the required credentials with the options you provided before passing it as `storage_options`. - ->❗When using `s3`, you need to specify storage options to [configure](https://delta-io.github.io/delta-rs/usage/writing/writing-to-s3-with-locking-provider/) locking behavior. - -#### `get_delta_tables` helper -You can use the `get_delta_tables` helper function to get `deltalake` [DeltaTable](https://delta-io.github.io/delta-rs/api/delta_table/) objects for your Delta tables: - -```py -from dlt.common.libs.deltalake import get_delta_tables - -... - -# get dictionary of DeltaTable objects -delta_tables = get_delta_tables(pipeline) - -# execute operations on DeltaTable objects -delta_tables["my_delta_table"].optimize.compact() -delta_tables["another_delta_table"].optimize.z_order(["col_a", "col_b"]) -# delta_tables["my_delta_table"].vacuum() -# etc. 
- -``` +You can choose the following [table formats](./delta-iceberg.md): +* Delta table +* Iceberg ## Syncing of dlt state This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). To this end, special folders and files will be created at your destination which hold information about your pipeline state, schemas, and completed loads. These folders DO NOT respect your settings in the layout section. When using filesystem as a staging destination, not all of these folders are created, as the state and schemas are managed in the regular way by the final destination you have configured. diff --git a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md index 233ae0ce21..edca521e52 100644 --- a/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md +++ b/docs/website/docs/dlt-ecosystem/table-formats/iceberg.md @@ -10,5 +10,5 @@ keywords: [iceberg, table formats] ## Supported destinations -Supported by: **Athena** +Supported by: **Athena**, **filesystem** diff --git a/docs/website/docs/general-usage/dataset-access/ibis-backend.md b/docs/website/docs/general-usage/dataset-access/ibis-backend.md index 8f4b0fb6b6..9f9b65e9c0 100644 --- a/docs/website/docs/general-usage/dataset-access/ibis-backend.md +++ b/docs/website/docs/general-usage/dataset-access/ibis-backend.md @@ -6,7 +6,7 @@ keywords: [data, dataset, ibis] # Ibis -Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). +Ibis is a powerful portable Python dataframe library. Learn more about what it is and how to use it in the [official documentation](https://ibis-project.org/). `dlt` provides an easy way to hand over your loaded dataset to an Ibis backend connection. 
@@ -46,4 +46,3 @@ print(table.limit(10).execute()) # Visit the ibis docs to learn more about the available methods ``` - diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 274f3e82b3..8e8c11fc09 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -167,6 +167,7 @@ const sidebars = { 'dlt-ecosystem/destinations/synapse', 'dlt-ecosystem/destinations/clickhouse', 'dlt-ecosystem/destinations/filesystem', + 'dlt-ecosystem/destinations/delta-iceberg', 'dlt-ecosystem/destinations/postgres', 'dlt-ecosystem/destinations/redshift', 'dlt-ecosystem/destinations/snowflake', diff --git a/mypy.ini b/mypy.ini index 769e84b13a..fdf0ceb1e6 100644 --- a/mypy.ini +++ b/mypy.ini @@ -135,3 +135,9 @@ ignore_missing_imports = True [mypy-time_machine.*] ignore_missing_imports = True + +[mypy-pyiceberg.*] +ignore_missing_imports = True + +[mypy-airflow.*] +ignore_missing_imports = True diff --git a/poetry.lock b/poetry.lock index 749979439d..83090360b0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1543,13 +1543,13 @@ files = [ [[package]] name = "cachetools" -version = "5.3.1" +version = "5.5.0" description = "Extensible memoizing collections and decorators" optional = false python-versions = ">=3.7" files = [ - {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, - {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, + {file = "cachetools-5.5.0-py3-none-any.whl", hash = "sha256:02134e8439cdc2ffb62023ce1debca2944c3f289d66bb17ead3ab3dede74b292"}, + {file = "cachetools-5.5.0.tar.gz", hash = "sha256:2cc24fb4cbe39633fb7badd9db9ca6295d766d9c2995f245725a46715d050f2a"}, ] [[package]] @@ -5872,44 +5872,49 @@ files = [ [[package]] name = "mypy" -version = "1.10.0" +version = "1.12.1" description = "Optional static typing for Python" optional = false python-versions = ">=3.8" files = [ - {file = 
"mypy-1.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:da1cbf08fb3b851ab3b9523a884c232774008267b1f83371ace57f412fe308c2"}, - {file = "mypy-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:12b6bfc1b1a66095ab413160a6e520e1dc076a28f3e22f7fb25ba3b000b4ef99"}, - {file = "mypy-1.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e36fb078cce9904c7989b9693e41cb9711e0600139ce3970c6ef814b6ebc2b2"}, - {file = "mypy-1.10.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2b0695d605ddcd3eb2f736cd8b4e388288c21e7de85001e9f85df9187f2b50f9"}, - {file = "mypy-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:cd777b780312ddb135bceb9bc8722a73ec95e042f911cc279e2ec3c667076051"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3be66771aa5c97602f382230165b856c231d1277c511c9a8dd058be4784472e1"}, - {file = "mypy-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8b2cbaca148d0754a54d44121b5825ae71868c7592a53b7292eeb0f3fdae95ee"}, - {file = "mypy-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ec404a7cbe9fc0e92cb0e67f55ce0c025014e26d33e54d9e506a0f2d07fe5de"}, - {file = "mypy-1.10.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e22e1527dc3d4aa94311d246b59e47f6455b8729f4968765ac1eacf9a4760bc7"}, - {file = "mypy-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:a87dbfa85971e8d59c9cc1fcf534efe664d8949e4c0b6b44e8ca548e746a8d53"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a781f6ad4bab20eef8b65174a57e5203f4be627b46291f4589879bf4e257b97b"}, - {file = "mypy-1.10.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b808e12113505b97d9023b0b5e0c0705a90571c6feefc6f215c1df9381256e30"}, - {file = "mypy-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f55583b12156c399dce2df7d16f8a5095291354f1e839c252ec6c0611e86e2e"}, - {file = "mypy-1.10.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:4cf18f9d0efa1b16478c4c129eabec36148032575391095f73cae2e722fcf9d5"}, - {file = "mypy-1.10.0-cp312-cp312-win_amd64.whl", hash = "sha256:bc6ac273b23c6b82da3bb25f4136c4fd42665f17f2cd850771cb600bdd2ebeda"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9fd50226364cd2737351c79807775136b0abe084433b55b2e29181a4c3c878c0"}, - {file = "mypy-1.10.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f90cff89eea89273727d8783fef5d4a934be2fdca11b47def50cf5d311aff727"}, - {file = "mypy-1.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fcfc70599efde5c67862a07a1aaf50e55bce629ace26bb19dc17cece5dd31ca4"}, - {file = "mypy-1.10.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:075cbf81f3e134eadaf247de187bd604748171d6b79736fa9b6c9685b4083061"}, - {file = "mypy-1.10.0-cp38-cp38-win_amd64.whl", hash = "sha256:3f298531bca95ff615b6e9f2fc0333aae27fa48052903a0ac90215021cdcfa4f"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa7ef5244615a2523b56c034becde4e9e3f9b034854c93639adb667ec9ec2976"}, - {file = "mypy-1.10.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3236a4c8f535a0631f85f5fcdffba71c7feeef76a6002fcba7c1a8e57c8be1ec"}, - {file = "mypy-1.10.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4a2b5cdbb5dd35aa08ea9114436e0d79aceb2f38e32c21684dcf8e24e1e92821"}, - {file = "mypy-1.10.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92f93b21c0fe73dc00abf91022234c79d793318b8a96faac147cd579c1671746"}, - {file = "mypy-1.10.0-cp39-cp39-win_amd64.whl", hash = "sha256:28d0e038361b45f099cc086d9dd99c15ff14d0188f44ac883010e172ce86c38a"}, - {file = "mypy-1.10.0-py3-none-any.whl", hash = "sha256:f8c083976eb530019175aabadb60921e73b4f45736760826aa1689dda8208aee"}, - {file = "mypy-1.10.0.tar.gz", hash = "sha256:3d087fcbec056c4ee34974da493a826ce316947485cef3901f511848e687c131"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:3d7d4371829184e22fda4015278fbfdef0327a4b955a483012bd2d423a788801"}, + {file = "mypy-1.12.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f59f1dfbf497d473201356966e353ef09d4daec48caeacc0254db8ef633a28a5"}, + {file = "mypy-1.12.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b947097fae68004b8328c55161ac9db7d3566abfef72d9d41b47a021c2fba6b1"}, + {file = "mypy-1.12.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:96af62050971c5241afb4701c15189ea9507db89ad07794a4ee7b4e092dc0627"}, + {file = "mypy-1.12.1-cp310-cp310-win_amd64.whl", hash = "sha256:d90da248f4c2dba6c44ddcfea94bb361e491962f05f41990ff24dbd09969ce20"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1230048fec1380faf240be6385e709c8570604d2d27ec6ca7e573e3bc09c3735"}, + {file = "mypy-1.12.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02dcfe270c6ea13338210908f8cadc8d31af0f04cee8ca996438fe6a97b4ec66"}, + {file = "mypy-1.12.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a437c9102a6a252d9e3a63edc191a3aed5f2fcb786d614722ee3f4472e33f6"}, + {file = "mypy-1.12.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:186e0c8346efc027ee1f9acf5ca734425fc4f7dc2b60144f0fbe27cc19dc7931"}, + {file = "mypy-1.12.1-cp311-cp311-win_amd64.whl", hash = "sha256:673ba1140a478b50e6d265c03391702fa11a5c5aff3f54d69a62a48da32cb811"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9fb83a7be97c498176fb7486cafbb81decccaef1ac339d837c377b0ce3743a7f"}, + {file = "mypy-1.12.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:389e307e333879c571029d5b93932cf838b811d3f5395ed1ad05086b52148fb0"}, + {file = "mypy-1.12.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:94b2048a95a21f7a9ebc9fbd075a4fcd310410d078aa0228dbbad7f71335e042"}, + {file = "mypy-1.12.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = 
"sha256:4ee5932370ccf7ebf83f79d1c157a5929d7ea36313027b0d70a488493dc1b179"}, + {file = "mypy-1.12.1-cp312-cp312-win_amd64.whl", hash = "sha256:19bf51f87a295e7ab2894f1d8167622b063492d754e69c3c2fed6563268cb42a"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d34167d43613ffb1d6c6cdc0cc043bb106cac0aa5d6a4171f77ab92a3c758bcc"}, + {file = "mypy-1.12.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:427878aa54f2e2c5d8db31fa9010c599ed9f994b3b49e64ae9cd9990c40bd635"}, + {file = "mypy-1.12.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fcde63ea2c9f69d6be859a1e6dd35955e87fa81de95bc240143cf00de1f7f81"}, + {file = "mypy-1.12.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d54d840f6c052929f4a3d2aab2066af0f45a020b085fe0e40d4583db52aab4e4"}, + {file = "mypy-1.12.1-cp313-cp313-win_amd64.whl", hash = "sha256:20db6eb1ca3d1de8ece00033b12f793f1ea9da767334b7e8c626a4872090cf02"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:b16fe09f9c741d85a2e3b14a5257a27a4f4886c171d562bc5a5e90d8591906b8"}, + {file = "mypy-1.12.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:0dcc1e843d58f444fce19da4cce5bd35c282d4bde232acdeca8279523087088a"}, + {file = "mypy-1.12.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e10ba7de5c616e44ad21005fa13450cd0de7caaa303a626147d45307492e4f2d"}, + {file = "mypy-1.12.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:0e6fe449223fa59fbee351db32283838a8fee8059e0028e9e6494a03802b4004"}, + {file = "mypy-1.12.1-cp38-cp38-win_amd64.whl", hash = "sha256:dc6e2a2195a290a7fd5bac3e60b586d77fc88e986eba7feced8b778c373f9afe"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de5b2a8988b4e1269a98beaf0e7cc71b510d050dce80c343b53b4955fff45f19"}, + {file = "mypy-1.12.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:843826966f1d65925e8b50d2b483065c51fc16dc5d72647e0236aae51dc8d77e"}, + {file = 
"mypy-1.12.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9fe20f89da41a95e14c34b1ddb09c80262edcc295ad891f22cc4b60013e8f78d"}, + {file = "mypy-1.12.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8135ffec02121a75f75dc97c81af7c14aa4ae0dda277132cfcd6abcd21551bfd"}, + {file = "mypy-1.12.1-cp39-cp39-win_amd64.whl", hash = "sha256:a7b76fa83260824300cc4834a3ab93180db19876bce59af921467fd03e692810"}, + {file = "mypy-1.12.1-py3-none-any.whl", hash = "sha256:ce561a09e3bb9863ab77edf29ae3a50e65685ad74bba1431278185b7e5d5486e"}, + {file = "mypy-1.12.1.tar.gz", hash = "sha256:f5b3936f7a6d0e8280c9bdef94c7ce4847f5cdfc258fbb2c29a8c1711e8bb96d"}, ] [package.dependencies] mypy-extensions = ">=1.0.0" tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.1.0" +typing-extensions = ">=4.6.0" [package.extras] dmypy = ["psutil (>=4.0)"] @@ -7521,6 +7526,74 @@ files = [ [package.extras] plugins = ["importlib-metadata"] +[[package]] +name = "pyiceberg" +version = "0.8.1" +description = "Apache Iceberg is an open table format for huge analytic datasets" +optional = true +python-versions = "!=2.7.*,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,!=3.7.*,!=3.8.*,>=3.9" +files = [ + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c121d1d3baf64510db94740ad870ae4b6eb9eb59a5ff7ecb4e96f7510666b2f"}, + {file = "pyiceberg-0.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a6f14aa588a3883fc7fddc136ca75b75660b4abb0b55b4c541619953f8971e7"}, + {file = "pyiceberg-0.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c720c2a191ac6faf01fe4c0f4c01c64b94bf064185b0292003d42939049277c"}, + {file = "pyiceberg-0.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d421d6e51ac1c581cba9fce96aa6b9118cf4a02270066a7fdc9490ab5d57ece9"}, + {file = "pyiceberg-0.8.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:ae11fb0515ea0a046370e09a7f6039a7e86622ab910360eaa732f0106b8f00c7"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:9488954c9eb5ce42ca6b816fc61873f219414cfdb9e9928d1c4a302702be1d89"}, + {file = "pyiceberg-0.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:44179e0fb844887b440c162279ba526dfe0e0f72d32945236528838518b55af0"}, + {file = "pyiceberg-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e121c6f5505d8ec711a1dd1690e07156cd54fb3d0844d5d991e02f1593f2708"}, + {file = "pyiceberg-0.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5961a288f2d4bbb2ab300c803da1bf0e70cea837e3f14b14108827cc821af252"}, + {file = "pyiceberg-0.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbe192324a6fb552c2fd29cab51086e21fa248ea2a0b95fbab921dede49e5a69"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:60430f0d8f6d650ed7d1893d038b847565a8e9ac135a1cc812e57d24f0482f6c"}, + {file = "pyiceberg-0.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f0f697977dac672d8b00e125836423585a97ebf59a28b865b1296a2b6ee81c51"}, + {file = "pyiceberg-0.8.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370de7c230970ff858f713d150164d492ba8450e771e59a0c520520b13ea6226"}, + {file = "pyiceberg-0.8.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3036ed226020d50e30648a71f968cf78bde5d6b609294508e60754e100e5ef36"}, + {file = "pyiceberg-0.8.1-cp312-cp312-win_amd64.whl", hash = "sha256:9ac9555f3bd25a31059229089ae639cf738a8e8286a175cea128561ac1ed9452"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:51da3a553d3a881042bf436e66a91cc2b6c4a3fea0e174cd73af2eb6ed255323"}, + {file = "pyiceberg-0.8.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:863f1dce7340e6ed870706a3fa4a73457178dae8529725bb80522ddcd4253afb"}, + {file = "pyiceberg-0.8.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:4dbf52b39080a6a2cda6a5126a74e3a88d5b206f609c128d001a728b36b81075"}, + {file = "pyiceberg-0.8.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:eb77d65e8efbb883c163817e4a9c373d907110ab6343c1b816b48f336955d4d7"}, + {file = "pyiceberg-0.8.1-cp39-cp39-win_amd64.whl", hash = "sha256:1fcd35b7de0eddc3fd8fd0c38b98741217ef6de4eeb0e72b798b4007692aa76c"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6f0f56f8fc61bcd795f6a3d03e8ce6bee09ebaa64425eb08327e975f906d98be"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:d7099c6631743ad29c451de2bebd9ed3c96c42bcb1fe5d5d5c93aec895858e3f"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d6436f5a782491115f64131882a737d77c9dc0040493e1b7f9b3081ea8cf6a26"}, + {file = "pyiceberg-0.8.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c1d75b40a98a327f7436eb0d6187c51834c44b79adf61c6945b33645f4afbf17"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8de988fa2363e6a51b40b85b5ff1e8261cda5bfc14ac54dd4ebe58391b95acae"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:dd06c5b606011155aa0b76e7b001e30f1c40ab2fb3eeb8a0652b88629259c2bb"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8142f0dbc12dda0e6d7aaf564a3fbb0f17fc934630e7cf866773c8caaebf666"}, + {file = "pyiceberg-0.8.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6126ee3a46ff975f15abf2085f184591d21643bffb96330907e003eea0b63005"}, + {file = "pyiceberg-0.8.1.tar.gz", hash = "sha256:4502f0cfddf6f7cd48b9cd54016bce0ab94052b0ab01efcfa515879074f4c8e3"}, +] + +[package.dependencies] +cachetools = ">=5.5.0,<6.0.0" +click = ">=7.1.1,<9.0.0" +fsspec = ">=2023.1.0" +mmh3 = ">=4.0.0,<6.0.0" +pydantic = ">=2.0,<2.4.0 || >2.4.0,<2.4.1 || >2.4.1,<3.0" +pyparsing = ">=3.1.0,<4.0.0" +requests = ">=2.20.0,<3.0.0" +rich = 
">=10.11.0,<14.0.0" +sortedcontainers = "2.4.0" +strictyaml = ">=1.7.0,<2.0.0" +tenacity = ">=8.2.3,<10.0.0" + +[package.extras] +adlfs = ["adlfs (>=2023.1.0)"] +daft = ["getdaft (>=0.2.12)"] +duckdb = ["duckdb (>=0.5.0,<2.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +dynamodb = ["boto3 (>=1.24.59)"] +gcsfs = ["gcsfs (>=2023.1.0)"] +glue = ["boto3 (>=1.24.59)", "mypy-boto3-glue (>=1.28.18)"] +hive = ["thrift (>=0.13.0,<1.0.0)"] +pandas = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)"] +pyarrow = ["pyarrow (>=14.0.0,<19.0.0)"] +ray = ["pandas (>=1.0.0,<3.0.0)", "pyarrow (>=14.0.0,<19.0.0)", "ray (==2.10.0)", "ray (>=2.10.0,<3.0.0)"] +s3fs = ["s3fs (>=2023.1.0)"] +snappy = ["python-snappy (>=0.6.0,<1.0.0)"] +sql-postgres = ["psycopg2-binary (>=2.9.6)", "sqlalchemy (>=2.0.18,<3.0.0)"] +sql-sqlite = ["sqlalchemy (>=2.0.18,<3.0.0)"] +zstandard = ["zstandard (>=0.13.0,<1.0.0)"] + [[package]] name = "pyjwt" version = "2.8.0" @@ -9327,6 +9400,20 @@ files = [ [package.dependencies] pbr = ">=2.0.0,<2.1.0 || >2.1.0" +[[package]] +name = "strictyaml" +version = "1.7.3" +description = "Strict, typed YAML parser" +optional = true +python-versions = ">=3.7.0" +files = [ + {file = "strictyaml-1.7.3-py3-none-any.whl", hash = "sha256:fb5c8a4edb43bebb765959e420f9b3978d7f1af88c80606c03fb420888f5d1c7"}, + {file = "strictyaml-1.7.3.tar.gz", hash = "sha256:22f854a5fcab42b5ddba8030a0e4be51ca89af0267961c8d6cfa86395586c407"}, +] + +[package.dependencies] +python-dateutil = ">=2.6.0" + [[package]] name = "sympy" version = "1.12" @@ -10606,6 +10693,7 @@ mssql = ["pyodbc"] parquet = ["pyarrow"] postgis = ["psycopg2-binary", "psycopg2cffi"] postgres = ["psycopg2-binary", "psycopg2cffi"] +pyiceberg = ["pyarrow", "pyiceberg", "sqlalchemy"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] @@ -10619,4 +10707,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = 
"a7cd6b599326d80b5beb8d4a3d3e3b4074eda6dc53daa5c296ef8d54002c5f78" +content-hash = "84e8b8eccd9b8ee104a2dc08f5b83987aeb06540d61330390ce849cc1ad6acb4" diff --git a/pyproject.toml b/pyproject.toml index 0fb7f94e36..bfa830cd06 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -adlfs = {version = ">=2022.4.0", optional = true} +adlfs = {version = ">=2024.7.0", optional = true} pyodbc = {version = ">=4.0.39", optional = true} qdrant-client = {version = ">=1.8", optional = true, extras = ["fastembed"]} databricks-sql-connector = {version = ">=2.9.3", optional = true} @@ -89,6 +89,12 @@ alembic = {version = ">1.10.0", optional = true} paramiko = {version = ">=3.3.0", optional = true} sqlglot = {version = ">=20.0.0", optional = true} db-dtypes = { version = ">=1.2.0", optional = true } +# `sql-sqlite` extra leads to dependency conflict with `apache-airflow` because `apache-airflow` +# requires `sqlalchemy<2.0.0` while the extra requires `sqlalchemy>=2.0.18` +# https://github.com/apache/airflow/issues/28723 +# pyiceberg = { version = ">=0.7.1", optional = true, extras = ["sql-sqlite"] } +# we will rely on manual installation of `sqlalchemy>=2.0.18` instead +pyiceberg = { version = ">=0.8.1", python = ">=3.9", optional = true } [tool.poetry.extras] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] @@ -118,6 +124,7 @@ lancedb = ["lancedb", "pyarrow", "tantivy"] deltalake = ["deltalake", "pyarrow"] sql_database = ["sqlalchemy"] sqlalchemy = ["sqlalchemy", "alembic"] +pyiceberg = ["pyiceberg", "pyarrow", "sqlalchemy"] postgis = ["psycopg2-binary", "psycopg2cffi"] [tool.poetry.scripts] @@ -134,7 +141,7 @@ sqlfluff = "^2.3.2" types-deprecated = "^1.2.9.2" pytest-console-scripts = "^1.4.1" pytest = "^7.0.0" -mypy = "^1.10.0" +mypy = 
">=1.11.0,<1.13.0" flake8 = "^5.0.0" bandit = "^1.7.0" black = "^23.7.0" diff --git a/tests/conftest.py b/tests/conftest.py index 6088fa976c..a5a349f8d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -120,6 +120,9 @@ def _create_pipeline_instance_id(self) -> str: # disable googleapiclient logging logging.getLogger("googleapiclient.discovery_cache").setLevel("WARNING") + # disable pyiceberg logging + logging.getLogger("pyiceberg").setLevel("WARNING") + # reset and init airflow db import warnings diff --git a/tests/libs/test_csv_writer.py b/tests/libs/test_csv_writer.py index 3c30123e1c..a120cd048e 100644 --- a/tests/libs/test_csv_writer.py +++ b/tests/libs/test_csv_writer.py @@ -178,7 +178,7 @@ def test_non_utf8_binary(item_type: TestDataItemFormat) -> None: table = pq.read_table(f) else: table = data - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with pytest.raises(InvalidDataItem) as inv_ex: with get_writer(writer_type, disable_compression=True) as writer: @@ -195,7 +195,7 @@ def test_arrow_struct() -> None: @pytest.mark.parametrize("item_type", ["object", "arrow-table"]) def test_csv_writer_empty(item_type: TestDataItemFormat) -> None: - writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter # type: ignore + writer_type: Type[DataWriter] = ArrowToCsvWriter if item_type == "arrow-table" else CsvWriter with get_writer(writer_type, disable_compression=True) as writer: writer.write_empty_file(TABLE_UPDATE_COLUMNS_SCHEMA) diff --git a/tests/load/filesystem/test_object_store_rs_credentials.py b/tests/load/filesystem/test_credentials_mixins.py similarity index 50% rename from tests/load/filesystem/test_object_store_rs_credentials.py rename to tests/load/filesystem/test_credentials_mixins.py index f23187a269..c1fb02c152 100644 --- 
a/tests/load/filesystem/test_object_store_rs_credentials.py +++ b/tests/load/filesystem/test_credentials_mixins.py @@ -1,12 +1,8 @@ -"""Tests translation of `dlt` credentials into `object_store` Rust crate credentials.""" - -from typing import Any, Dict +from typing import Any, Dict, Union, Type, get_args, cast import os import json # noqa: I251 import pytest -from deltalake import DeltaTable -from deltalake.exceptions import TableNotFoundError import dlt from dlt.common.configuration import resolve_configuration @@ -23,10 +19,15 @@ from dlt.common.utils import custom_environ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs.gcp_credentials import GcpDefaultCredentials -from dlt.common.configuration.specs.exceptions import ObjectStoreRsCredentialsException +from dlt.common.configuration.specs.exceptions import ( + ObjectStoreRsCredentialsException, + UnsupportedAuthenticationMethodException, +) +from dlt.common.configuration.specs.mixins import WithObjectStoreRsCredentials, WithPyicebergConfig from tests.load.utils import ( AZ_BUCKET, + ABFS_BUCKET, AWS_BUCKET, GCS_BUCKET, R2_BUCKET_CONFIG, @@ -34,6 +35,9 @@ ) +TCredentialsMixin = Union[WithObjectStoreRsCredentials, WithPyicebergConfig] +ALL_CREDENTIALS_MIXINS = get_args(TCredentialsMixin) + pytestmark = pytest.mark.essential if all(driver not in ALL_FILESYSTEM_DRIVERS for driver in ("az", "s3", "gs", "r2")): @@ -53,11 +57,27 @@ def fs_creds() -> Dict[str, Any]: return creds -def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> bool: - """Returns True if client can connect to object store, False otherwise. 
+def can_connect(bucket_url: str, credentials: TCredentialsMixin, mixin: Type[TCredentialsMixin]) -> bool: # type: ignore[return] + """Returns True if client can connect to object store, False otherwise.""" + if mixin == WithObjectStoreRsCredentials: + credentials = cast(WithObjectStoreRsCredentials, credentials) + return can_connect_object_store_rs_credentials( + bucket_url, credentials.to_object_store_rs_credentials() + ) + elif mixin == WithPyicebergConfig: + credentials = cast(WithPyicebergConfig, credentials) + return can_connect_pyiceberg_fileio_config( + bucket_url, credentials.to_pyiceberg_fileio_config() + ) + + +def can_connect_object_store_rs_credentials( + bucket_url: str, object_store_rs_credentials: Dict[str, str] +) -> bool: + # uses `deltatable` library as Python interface to `object_store` Rust crate + from deltalake import DeltaTable + from deltalake.exceptions import TableNotFoundError - Uses `deltatable` library as Python interface to `object_store` Rust crate. - """ try: DeltaTable( bucket_url, @@ -70,16 +90,40 @@ def can_connect(bucket_url: str, object_store_rs_credentials: Dict[str, str]) -> return False +def can_connect_pyiceberg_fileio_config( + bucket_url: str, pyiceberg_fileio_config: Dict[str, str] +) -> bool: + from pyiceberg.table import StaticTable + + try: + StaticTable.from_metadata( + f"{bucket_url}/non_existing_metadata_file.json", + properties=pyiceberg_fileio_config, + ) + except FileNotFoundError: + # this error implies the connection was successful + # there is no Iceberg metadata file at the specified path + return True + return False + + @pytest.mark.parametrize( - "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az")] + "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("az", "abfss")] ) -def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_azure_credentials_mixins( + driver: str, 
fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: + if mixin == WithPyicebergConfig and driver == "az": + pytest.skip("`pyiceberg` does not support `az` scheme") + + buckets = {"az": AZ_BUCKET, "abfss": ABFS_BUCKET} creds: AnyAzureCredentials creds = AzureServicePrincipalCredentialsWithoutDefaults( **dlt.secrets.get("destination.fsazureprincipal.credentials") ) - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # without SAS token creds = AzureCredentialsWithoutDefaults( @@ -87,18 +131,21 @@ def test_azure_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any] azure_storage_account_key=fs_creds["azure_storage_account_key"], ) assert creds.azure_storage_sas_token is None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) # with SAS token creds = resolve_configuration(creds) assert creds.azure_storage_sas_token is not None - assert can_connect(AZ_BUCKET, creds.to_object_store_rs_credentials()) + assert can_connect(buckets[driver], creds, mixin) @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("s3", "r2")] ) -def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_aws_credentials_mixins( + driver: str, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: AwsCredentialsWithoutDefaults if driver == "r2": @@ -112,9 +159,11 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) + if mixin == WithObjectStoreRsCredentials: 
+ assert ( + "aws_session_token" not in creds.to_object_store_rs_credentials() + ) # no auto-generated token + assert can_connect(AWS_BUCKET, creds, mixin) # AwsCredentials: no user-provided session token creds = AwsCredentials( @@ -124,24 +173,27 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) endpoint_url=fs_creds.get("endpoint_url"), ) assert creds.aws_session_token is None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert "aws_session_token" not in object_store_rs_creds # no auto-generated token - assert can_connect(AWS_BUCKET, object_store_rs_creds) - - # exception should be raised if both `endpoint_url` and `region_name` are - # not provided - with pytest.raises(ObjectStoreRsCredentialsException): - AwsCredentials( - aws_access_key_id=fs_creds["aws_access_key_id"], - aws_secret_access_key=fs_creds["aws_secret_access_key"], - ).to_object_store_rs_credentials() - - if "endpoint_url" in object_store_rs_creds: - # TODO: make sure this case is tested on GitHub CI, e.g. by adding - # a local MinIO bucket to the set of tested buckets - if object_store_rs_creds["endpoint_url"].startswith("http://"): + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert "aws_session_token" not in object_store_rs_creds # no auto-generated token + + # exception should be raised if both `endpoint_url` and `region_name` are + # not provided + with pytest.raises(ObjectStoreRsCredentialsException): + AwsCredentials( + aws_access_key_id=fs_creds["aws_access_key_id"], + aws_secret_access_key=fs_creds["aws_secret_access_key"], + ).to_object_store_rs_credentials() + + if "endpoint_url" in object_store_rs_creds and object_store_rs_creds[ + "endpoint_url" + ].startswith("http://"): + # TODO: make sure this case is tested on GitHub CI, e.g. 
by adding + # a local MinIO bucket to the set of tested buckets assert object_store_rs_creds["aws_allow_http"] == "true" + if creds.endpoint_url is not None: # remainder of tests use session tokens # we don't run them on S3 compatible storage because session tokens # may not be available @@ -158,9 +210,10 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None # AwsCredentialsWithoutDefaults: user-provided session token creds = AwsCredentialsWithoutDefaults( @@ -170,15 +223,19 @@ def test_aws_object_store_rs_credentials(driver: str, fs_creds: Dict[str, Any]) region_name=fs_creds["region_name"], ) assert creds.aws_session_token is not None - object_store_rs_creds = creds.to_object_store_rs_credentials() - assert object_store_rs_creds["aws_session_token"] is not None - assert can_connect(AWS_BUCKET, object_store_rs_creds) + assert can_connect(AWS_BUCKET, creds, mixin) + if mixin == WithObjectStoreRsCredentials: + object_store_rs_creds = creds.to_object_store_rs_credentials() + assert object_store_rs_creds["aws_session_token"] is not None @pytest.mark.parametrize( "driver", [driver for driver in ALL_FILESYSTEM_DRIVERS if driver in ("gs")] ) -def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> None: +@pytest.mark.parametrize("mixin", ALL_CREDENTIALS_MIXINS) +def test_gcp_credentials_mixins( + driver, fs_creds: Dict[str, Any], mixin: Type[TCredentialsMixin] +) -> None: creds: GcpCredentials # GcpServiceAccountCredentialsWithoutDefaults @@ 
-189,7 +246,11 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No private_key_id=fs_creds["private_key_id"], client_email=fs_creds["client_email"], ) - assert can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) # GcpDefaultCredentials @@ -197,7 +258,7 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No GcpDefaultCredentials._LAST_FAILED_DEFAULT = 0 # write service account key to JSON file - service_json = json.loads(creds.to_object_store_rs_credentials()["service_account_key"]) + service_json = json.loads(creds.to_native_representation()) path = "_secrets/service.json" os.makedirs(os.path.dirname(path), exist_ok=True) with open(path, "w", encoding="utf-8") as f: @@ -206,8 +267,18 @@ def test_gcp_object_store_rs_credentials(driver, fs_creds: Dict[str, Any]) -> No with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": path}): creds = GcpDefaultCredentials() resolve_configuration(creds) - can_connect(GCS_BUCKET, creds.to_object_store_rs_credentials()) - - # GcpOAuthCredentialsWithoutDefaults is currently not supported - with pytest.raises(NotImplementedError): - GcpOAuthCredentialsWithoutDefaults().to_object_store_rs_credentials() + if mixin == WithPyicebergConfig: + with pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + assert can_connect(GCS_BUCKET, creds, mixin) + + # GcpOAuthCredentialsWithoutDefaults + creds = resolve_configuration( + GcpOAuthCredentialsWithoutDefaults(), sections=("destination", "fsgcpoauth") + ) + if mixin == WithPyicebergConfig: + assert can_connect(GCS_BUCKET, creds, mixin) + elif mixin == WithObjectStoreRsCredentials: + with 
pytest.raises(UnsupportedAuthenticationMethodException): + assert can_connect(GCS_BUCKET, creds, mixin) diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index ac2ada2551..a73b0f7e31 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -1,17 +1,17 @@ """Test the duckdb supported sql client for special internal features""" -from typing import Any +from typing import Optional import pytest import dlt import os import shutil -import logging from dlt import Pipeline from dlt.common.utils import uniq_id +from dlt.common.schema.typing import TTableFormat from tests.load.utils import ( destinations_configs, @@ -19,7 +19,6 @@ GCS_BUCKET, SFTP_BUCKET, MEMORY_BUCKET, - AWS_BUCKET, ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT @@ -37,7 +36,7 @@ def _run_dataset_checks( pipeline: Pipeline, destination_config: DestinationTestConfiguration, secret_directory: str, - table_format: Any = None, + table_format: Optional[TTableFormat] = None, alternate_access_pipeline: Pipeline = None, ) -> None: total_records = 200 @@ -144,6 +143,8 @@ def _external_duckdb_connection() -> duckdb.DuckDBPyConnection: # the line below solves problems with certificate path lookup on linux, see duckdb docs external_db.sql("SET azure_transport_option_type = 'curl';") external_db.sql(f"SET secret_directory = '{secret_directory}';") + if table_format == "iceberg": + FilesystemSqlClient._setup_iceberg(external_db) return external_db def _fs_sql_client_for_external_db( @@ -283,13 +284,13 @@ def test_read_interfaces_filesystem( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=[SFTP_BUCKET, MEMORY_BUCKET], # NOTE: delta does not work on memory buckets ), ids=lambda x: x.name, ) -def test_delta_tables( +def test_table_formats( destination_config: 
DestinationTestConfiguration, secret_directory: str ) -> None: os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "700" @@ -302,8 +303,9 @@ def test_delta_tables( # in case of gcs we use the s3 compat layer for reading # for writing we still need to use the gc authentication, as delta_rs seems to use # methods on the s3 interface that are not implemented by gcs + # s3 compat layer does not work with `iceberg` table format access_pipeline = pipeline - if destination_config.bucket_url == GCS_BUCKET: + if destination_config.bucket_url == GCS_BUCKET and destination_config.table_format != "iceberg": gcp_bucket = filesystem( GCS_BUCKET.replace("gs://", "s3://"), destination_name="filesystem_s3_gcs_comp" ) @@ -315,7 +317,7 @@ def test_delta_tables( pipeline, destination_config, secret_directory=secret_directory, - table_format="delta", + table_format=destination_config.table_format, alternate_access_pipeline=access_pipeline, ) diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 8d890642ee..c70fa5ab5d 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -2,7 +2,7 @@ import os import posixpath from pathlib import Path -from typing import Any, Callable, List, Dict, cast +from typing import Any, Callable, List, Dict, cast, Tuple from importlib.metadata import version as pkg_version from packaging.version import Version @@ -15,7 +15,7 @@ from dlt.common.storages.configuration import FilesystemConfiguration from dlt.common.storages.load_package import ParsedLoadJobFileName from dlt.common.utils import uniq_id -from dlt.common.schema.typing import TWriteDisposition +from dlt.common.schema.typing import TWriteDisposition, TTableFormat from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.destinations import filesystem from dlt.destinations.impl.filesystem.filesystem import FilesystemClient @@ -223,6 +223,48 @@ def some_source(): 
assert table.column("value").to_pylist() == [1, 2, 3, 4, 5] +# here start the `table_format` tests + + +def get_expected_actual( + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + arrow_table: "pyarrow.Table", # type: ignore[name-defined] # noqa: F821 +) -> Tuple["pyarrow.Table", "pyarrow.Table"]: # type: ignore[name-defined] # noqa: F821 + from dlt.common.libs.pyarrow import pyarrow, cast_arrow_schema_types + + if table_format == "delta": + from dlt.common.libs.deltalake import ( + get_delta_tables, + ensure_delta_compatible_arrow_data, + ) + + dt = get_delta_tables(pipeline, table_name)[table_name] + expected = ensure_delta_compatible_arrow_data(arrow_table) + actual = dt.to_pyarrow_table() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import ( + get_iceberg_tables, + ensure_iceberg_compatible_arrow_data, + ) + + it = get_iceberg_tables(pipeline, table_name)[table_name] + expected = ensure_iceberg_compatible_arrow_data(arrow_table) + actual = it.scan().to_arrow() + + # work around pyiceberg bug https://github.com/apache/iceberg-python/issues/1128 + schema = cast_arrow_schema_types( + actual.schema, + { + pyarrow.types.is_large_string: pyarrow.string(), + pyarrow.types.is_large_binary: pyarrow.binary(), + }, + ) + actual = actual.cast(schema) + return (expected, actual) + + @pytest.mark.skip( reason="pyarrow version check not needed anymore, since we have 17 as a dependency" ) @@ -258,44 +300,44 @@ def foo(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_exclude=(MEMORY_BUCKET, SFTP_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_core( +def test_table_format_core( destination_config: DestinationTestConfiguration, ) -> None: - """Tests core functionality for `delta` table format. + """Tests core functionality for `delta` and `iceberg` table formats. Tests all data types, all filesystems. 
Tests `append` and `replace` write dispositions (`merge` is tested elsewhere). """ - - from dlt.common.libs.deltalake import get_delta_tables + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables # create resource that yields rows with all data types column_schemas, row = table_update_and_row() - @dlt.resource(columns=column_schemas, table_format="delta") + @dlt.resource(columns=column_schemas, table_format=destination_config.table_format) def data_types(): nonlocal row yield [row] * 10 pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - # run pipeline, this should create Delta table + # run pipeline, this should create table info = pipeline.run(data_types()) assert_load_info(info) - # `delta` table format should use `parquet` file format + # table formats should use `parquet` file format completed_jobs = info.load_packages[0].jobs["completed_jobs"] data_types_jobs = [ job for job in completed_jobs if job.job_file_info.table_name == "data_types" ] assert all([job.file_path.endswith((".parquet", ".reference")) for job in data_types_jobs]) - # 10 rows should be loaded to the Delta table and the content of the first + # 10 rows should be loaded to the table and the content of the first # row should match expected values rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -322,7 +364,8 @@ def data_types(): # should do logical replace, increasing the table version info = pipeline.run(data_types(), write_disposition="replace") assert_load_info(info) - assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 + if destination_config.table_format == "delta": + assert get_delta_tables(pipeline, "data_types")["data_types"].version() == 2 rows = load_tables_to_dicts(pipeline, "data_types", exclude_system_cols=True)["data_types"] assert len(rows) == 10 @@ -331,15 +374,16 @@ def data_types(): "destination_config", 
destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_does_not_contain_job_files( +def test_table_format_does_not_contain_job_files( destination_config: DestinationTestConfiguration, ) -> None: - """Asserts Parquet job files do not end up in Delta table.""" + """Asserts Parquet job files do not end up in table.""" pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -376,17 +420,18 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_multiple_files( +def test_table_format_multiple_files( destination_config: DestinationTestConfiguration, ) -> None: - """Tests loading multiple files into a Delta table. + """Tests loading multiple files into a table. - Files should be loaded into the Delta table in a single commit. + Files should be loaded into the table in a single commit. 
""" from dlt.common.libs.deltalake import get_delta_tables @@ -422,17 +467,17 @@ def delta_table(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_child_tables( +def test_table_format_child_tables( destination_config: DestinationTestConfiguration, ) -> None: - """Tests child table handling for `delta` table format.""" + """Tests child table handling for `delta` and `iceberg` table formats.""" - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def nested_table(): yield [ { @@ -494,49 +539,63 @@ def nested_table(): assert len(rows_dict["nested_table__child"]) == 3 assert len(rows_dict["nested_table__child__grandchild"]) == 5 - # now drop children and grandchildren, use merge write disposition to create and pass full table chain - # also for tables that do not have jobs - info = pipeline.run( - [{"foo": 3}] * 10000, - table_name="nested_table", - primary_key="foo", - write_disposition="merge", - ) - assert_load_info(info) + if destination_config.supports_merge: + # now drop children and grandchildren, use merge write disposition to create and pass full table chain + # also for tables that do not have jobs + info = pipeline.run( + [{"foo": 3}] * 10000, + table_name="nested_table", + primary_key="foo", + write_disposition="merge", + ) + assert_load_info(info) @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_partitioning( +def test_table_format_partitioning( destination_config: DestinationTestConfiguration, ) -> None: - """Tests partitioning for `delta` table format.""" + """Tests partitioning for `delta` and `iceberg` table formats.""" - 
from dlt.common.libs.deltalake import get_delta_tables from tests.pipeline.utils import users_materialize_table_schema + def assert_partition_columns( + table_name: str, table_format: TTableFormat, expected_partition_columns: List[str] + ) -> None: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + actual_partition_columns = dt.metadata().partition_columns + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + actual_partition_columns = [f.name for f in it.metadata.specs_struct().fields] + assert actual_partition_columns == expected_partition_columns + pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) # zero partition columns - @dlt.resource(table_format="delta") + @dlt.resource(table_format=destination_config.table_format) def zero_part(): yield {"foo": 1, "bar": 1} info = pipeline.run(zero_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) assert load_table_counts(pipeline, "zero_part")["zero_part"] == 1 # one partition column - @dlt.resource(table_format="delta", columns={"c1": {"partition": True}}) + @dlt.resource(table_format=destination_config.table_format, columns={"c1": {"partition": True}}) def one_part(): yield [ {"c1": "foo", "c2": 1}, @@ -547,13 +606,13 @@ def one_part(): info = pipeline.run(one_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "one_part")["one_part"] - assert dt.metadata().partition_columns == ["c1"] + assert_partition_columns("one_part", destination_config.table_format, ["c1"]) assert load_table_counts(pipeline, "one_part")["one_part"] == 4 # two partition columns @dlt.resource( - table_format="delta", columns={"c1": {"partition": True}, 
"c2": {"partition": True}} + table_format=destination_config.table_format, + columns={"c1": {"partition": True}, "c2": {"partition": True}}, ) def two_part(): yield [ @@ -565,29 +624,31 @@ def two_part(): info = pipeline.run(two_part()) assert_load_info(info) - dt = get_delta_tables(pipeline, "two_part")["two_part"] - assert dt.metadata().partition_columns == ["c1", "c2"] + assert_partition_columns("two_part", destination_config.table_format, ["c1", "c2"]) assert load_table_counts(pipeline, "two_part")["two_part"] == 4 # test partitioning with empty source users_materialize_table_schema.apply_hints( - table_format="delta", + table_format=destination_config.table_format, columns={"id": {"partition": True}}, ) info = pipeline.run(users_materialize_table_schema()) assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.metadata().partition_columns == ["id"] + assert_partition_columns("users", destination_config.table_format, ["id"]) assert load_table_counts(pipeline, "users")["users"] == 0 # changing partitioning after initial table creation is not supported zero_part.apply_hints(columns={"foo": {"partition": True}}) - with pytest.raises(PipelineStepFailed) as pip_ex: + if destination_config.table_format == "delta": + # Delta raises error when trying to change partitioning + with pytest.raises(PipelineStepFailed) as pip_ex: + pipeline.run(zero_part()) + assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) + assert "partitioning" in pip_ex.value.__context__.retry_message + elif destination_config.table_format == "iceberg": + # while Iceberg supports partition evolution, we don't apply it pipeline.run(zero_part()) - assert isinstance(pip_ex.value.__context__, LoadClientJobRetry) - assert "partitioning" in pip_ex.value.__context__.retry_message - dt = get_delta_tables(pipeline, "zero_part")["zero_part"] - assert dt.metadata().partition_columns == [] + assert_partition_columns("zero_part", destination_config.table_format, []) 
@pytest.mark.parametrize( @@ -646,7 +707,7 @@ def test_delta_table_partitioning_arrow_load_id( "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, @@ -659,20 +720,25 @@ def test_delta_table_partitioning_arrow_load_id( pytest.param({"disposition": "merge", "strategy": "upsert"}, id="upsert"), ), ) -def test_delta_table_schema_evolution( +def test_table_format_schema_evolution( destination_config: DestinationTestConfiguration, write_disposition: TWriteDisposition, ) -> None: - """Tests schema evolution (adding new columns) for `delta` table format.""" - from dlt.common.libs.deltalake import get_delta_tables, ensure_delta_compatible_arrow_data + """Tests schema evolution (adding new columns) for `delta` and `iceberg` table formats.""" + if destination_config.table_format == "iceberg" and write_disposition == { + "disposition": "merge", + "strategy": "upsert", + }: + pytest.skip("`upsert` currently not implemented for `iceberg`") + from dlt.common.libs.pyarrow import pyarrow @dlt.resource( write_disposition=write_disposition, primary_key="pk", - table_format="delta", + table_format=destination_config.table_format, ) - def delta_table(data): + def evolving_table(data): yield data pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) @@ -684,11 +750,11 @@ def delta_table(data): assert arrow_table.shape == (1, 1) # initial load - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - expected = ensure_delta_compatible_arrow_data(arrow_table) - actual = dt.to_pyarrow_table() + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) assert actual.equals(expected) # create Arrow table with many columns, two rows @@ 
-703,11 +769,11 @@ def delta_table(data): arrow_table = arrow_table.add_column(0, pk_field, [[1, 2]]) # second load — this should evolve the schema (i.e. add the new columns) - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(evolving_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected = ensure_delta_compatible_arrow_data(arrow_table) + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) if write_disposition == "append": # just check shape and schema for `append`, because table comparison is # more involved than with the other dispositions @@ -724,13 +790,21 @@ def delta_table(data): empty_arrow_table = arrow_table.schema.empty_table() # load 3 — this should evolve the schema without changing data - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(evolving_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - actual = dt.to_pyarrow_table() - expected_schema = ensure_delta_compatible_arrow_data(arrow_table).schema - assert actual.schema.equals(expected_schema) - expected_num_rows = 3 if write_disposition == "append" else 2 + expected, actual = get_expected_actual( + pipeline, "evolving_table", destination_config.table_format, arrow_table + ) + assert actual.schema.equals(expected.schema) + if write_disposition == "append": + expected_num_rows = 3 + elif write_disposition == "replace": + expected_num_rows = 0 + if destination_config.table_format == "delta": + # TODO: fix https://github.com/dlt-hub/dlt/issues/2092 and remove this if-clause + expected_num_rows = 2 + elif write_disposition == {"disposition": "merge", "strategy": "upsert"}: + expected_num_rows = 2 assert actual.num_rows == expected_num_rows # new column should have NULLs only assert ( @@ -743,23 +817,38 @@ def delta_table(data): 
"destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_empty_source( +def test_table_format_empty_source( destination_config: DestinationTestConfiguration, ) -> None: - """Tests empty source handling for `delta` table format. + """Tests empty source handling for `delta` and `iceberg` table formats. Tests both empty Arrow table and `dlt.mark.materialize_table_schema()`. """ - from dlt.common.libs.deltalake import ensure_delta_compatible_arrow_data, get_delta_tables from tests.pipeline.utils import users_materialize_table_schema - @dlt.resource(table_format="delta") - def delta_table(data): + def get_table_version( # type: ignore[return] + pipeline: dlt.Pipeline, + table_name: str, + table_format: TTableFormat, + ) -> int: + if table_format == "delta": + from dlt.common.libs.deltalake import get_delta_tables + + dt = get_delta_tables(pipeline, table_name)[table_name] + return dt.version() + elif table_format == "iceberg": + from dlt.common.libs.pyiceberg import get_iceberg_tables + + it = get_iceberg_tables(pipeline, table_name)[table_name] + return it.last_sequence_number - 1 # subtract 1 to match `delta` + + @dlt.resource(table_format=destination_config.table_format) + def a_table(data): yield data # create empty Arrow table with schema @@ -779,61 +868,62 @@ def delta_table(data): # run 1: empty Arrow table with schema # this should create empty Delta table with same schema as Arrow table - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (0, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - 
ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 0 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (0, expected.num_columns) + assert actual.schema.equals(expected.schema) # run 2: non-empty Arrow table with same schema as run 1 # this should load records into Delta table - info = pipeline.run(delta_table(arrow_table)) + info = pipeline.run(a_table(arrow_table)) assert_load_info(info) - dt = get_delta_tables(pipeline, "delta_table")["delta_table"] - assert dt.version() == 1 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.shape == (2, empty_arrow_table.num_columns) - assert dt_arrow_table.schema.equals( - ensure_delta_compatible_arrow_data(empty_arrow_table).schema + assert get_table_version(pipeline, "a_table", destination_config.table_format) == 1 + expected, actual = get_expected_actual( + pipeline, "a_table", destination_config.table_format, empty_arrow_table ) + assert actual.shape == (2, expected.num_columns) + assert actual.schema.equals(expected.schema) # now run the empty frame again - info = pipeline.run(delta_table(empty_arrow_table)) + info = pipeline.run(a_table(empty_arrow_table)) assert_load_info(info) - # use materialized list - # NOTE: this will create an empty parquet file with a schema takes from dlt schema. - # the original parquet file had a nested (struct) type in `json` field that is now - # in the delta table schema. the empty parquet file lost this information and had - # string type (converted from dlt `json`) - info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="delta_table") - assert_load_info(info) + if destination_config.table_format == "delta": + # use materialized list + # NOTE: this will create an empty parquet file with a schema takes from dlt schema. 
+ # the original parquet file had a nested (struct) type in `json` field that is now + # in the delta table schema. the empty parquet file lost this information and had + # string type (converted from dlt `json`) + info = pipeline.run([dlt.mark.materialize_table_schema()], table_name="a_table") + assert_load_info(info) # test `dlt.mark.materialize_table_schema()` - users_materialize_table_schema.apply_hints(table_format="delta") + users_materialize_table_schema.apply_hints(table_format=destination_config.table_format) info = pipeline.run(users_materialize_table_schema(), loader_file_format="parquet") assert_load_info(info) - dt = get_delta_tables(pipeline, "users")["users"] - assert dt.version() == 0 - dt_arrow_table = dt.to_pyarrow_table() - assert dt_arrow_table.num_rows == 0 - assert "id", "name" == dt_arrow_table.schema.names[:2] + assert get_table_version(pipeline, "users", destination_config.table_format) == 0 + _, actual = get_expected_actual( + pipeline, "users", destination_config.table_format, empty_arrow_table + ) + assert actual.num_rows == 0 + assert "id", "name" == actual.schema.names[:2] @pytest.mark.parametrize( "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_mixed_source( +def test_table_format_mixed_source( destination_config: DestinationTestConfiguration, ) -> None: """Tests file format handling in mixed source. 
@@ -877,12 +967,13 @@ def s(): "destination_config", destinations_configs( table_format_filesystem_configs=True, + # job orchestration is same across table formats—no need to test all formats with_table_format="delta", bucket_subset=(FILE_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_dynamic_dispatch( +def test_table_format_dynamic_dispatch( destination_config: DestinationTestConfiguration, ) -> None: @dlt.resource(primary_key="id", table_name=lambda i: i["type"], table_format="delta") @@ -905,80 +996,96 @@ def github_events(): "destination_config", destinations_configs( table_format_filesystem_configs=True, - with_table_format="delta", + with_table_format=("delta", "iceberg"), bucket_subset=(FILE_BUCKET, AZ_BUCKET), ), ids=lambda x: x.name, ) -def test_delta_table_get_delta_tables_helper( +def test_table_format_get_tables_helper( destination_config: DestinationTestConfiguration, ) -> None: - """Tests `get_delta_tables` helper function.""" - from dlt.common.libs.deltalake import DeltaTable, get_delta_tables + """Tests `get_delta_tables` / `get_iceberg_tables` helper functions.""" + get_tables: Any + if destination_config.table_format == "delta": + from dlt.common.libs.deltalake import DeltaTable, get_delta_tables - @dlt.resource(table_format="delta") - def foo_delta(): + get_tables = get_delta_tables + get_num_rows = lambda table: table.to_pyarrow_table().num_rows + elif destination_config.table_format == "iceberg": + from dlt.common.libs.pyiceberg import IcebergTable, get_iceberg_tables + + get_tables = get_iceberg_tables + get_num_rows = lambda table: table.scan().to_arrow().num_rows + + @dlt.resource(table_format=destination_config.table_format) + def foo_table_format(): yield [{"foo": 1}, {"foo": 2}] - @dlt.resource(table_format="delta") - def bar_delta(): + @dlt.resource(table_format=destination_config.table_format) + def bar_table_format(): yield [{"bar": 1}] @dlt.resource - def baz_not_delta(): + def baz_not_table_format(): yield [{"baz": 1}] 
pipeline = destination_config.setup_pipeline("fs_pipe", dev_mode=True) - info = pipeline.run(foo_delta()) + info = pipeline.run(foo_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta"} - assert isinstance(delta_tables["foo_delta"], DeltaTable) - assert delta_tables["foo_delta"].to_pyarrow_table().num_rows == 2 - - info = pipeline.run([foo_delta(), bar_delta(), baz_not_delta()]) + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format"} + if destination_config.table_format == "delta": + assert isinstance(tables["foo_table_format"], DeltaTable) + elif destination_config.table_format == "iceberg": + assert isinstance(tables["foo_table_format"], IcebergTable) + assert get_num_rows(tables["foo_table_format"]) == 2 + + info = pipeline.run([foo_table_format(), bar_table_format(), baz_not_table_format()]) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert delta_tables.keys() == {"foo_delta", "bar_delta"} - assert delta_tables["bar_delta"].to_pyarrow_table().num_rows == 1 - assert get_delta_tables(pipeline, "foo_delta").keys() == {"foo_delta"} - assert get_delta_tables(pipeline, "bar_delta").keys() == {"bar_delta"} - assert get_delta_tables(pipeline, "foo_delta", "bar_delta").keys() == {"foo_delta", "bar_delta"} + tables = get_tables(pipeline) + assert tables.keys() == {"foo_table_format", "bar_table_format"} + assert get_num_rows(tables["bar_table_format"]) == 1 + assert get_tables(pipeline, "foo_table_format").keys() == {"foo_table_format"} + assert get_tables(pipeline, "bar_table_format").keys() == {"bar_table_format"} + assert get_tables(pipeline, "foo_table_format", "bar_table_format").keys() == { + "foo_table_format", + "bar_table_format", + } # test with child table - @dlt.resource(table_format="delta") - def parent_delta(): + @dlt.resource(table_format=destination_config.table_format) + def parent_table_format(): yield [{"foo": 1, "child": [1, 2, 
3]}] - info = pipeline.run(parent_delta()) + info = pipeline.run(parent_table_format()) assert_load_info(info) - delta_tables = get_delta_tables(pipeline) - assert "parent_delta__child" in delta_tables.keys() - assert delta_tables["parent_delta__child"].to_pyarrow_table().num_rows == 3 + tables = get_tables(pipeline) + assert "parent_table_format__child" in tables.keys() + assert get_num_rows(tables["parent_table_format__child"]) == 3 # test invalid input with pytest.raises(ValueError): - get_delta_tables(pipeline, "baz_not_delta") + get_tables(pipeline, "baz_not_table_format") with pytest.raises(ValueError): - get_delta_tables(pipeline, "non_existing_table") + get_tables(pipeline, "non_existing_table") # test unknown schema with pytest.raises(FileNotFoundError): - get_delta_tables(pipeline, "non_existing_table", schema_name="aux_2") + get_tables(pipeline, "non_existing_table", schema_name="aux_2") # load to a new schema and under new name aux_schema = dlt.Schema("aux_2") # NOTE: you cannot have a file with name - info = pipeline.run(parent_delta().with_name("aux_delta"), schema=aux_schema) + info = pipeline.run(parent_table_format().with_name("aux_table"), schema=aux_schema) # also state in seprate package assert_load_info(info, expected_load_packages=2) - delta_tables = get_delta_tables(pipeline, schema_name="aux_2") - assert "aux_delta__child" in delta_tables.keys() - get_delta_tables(pipeline, "aux_delta", schema_name="aux_2") + tables = get_tables(pipeline, schema_name="aux_2") + assert "aux_table__child" in tables.keys() + get_tables(pipeline, "aux_table", schema_name="aux_2") with pytest.raises(ValueError): - get_delta_tables(pipeline, "aux_delta") + get_tables(pipeline, "aux_table") @pytest.mark.parametrize( diff --git a/tests/load/sources/sql_database/test_sql_database_source.py b/tests/load/sources/sql_database/test_sql_database_source.py index 00257471e0..2de923fe38 100644 --- a/tests/load/sources/sql_database/test_sql_database_source.py +++ 
b/tests/load/sources/sql_database/test_sql_database_source.py @@ -1286,10 +1286,7 @@ def assert_no_precision_columns( ) -> None: actual = list(columns.values()) # we always infer and emit nullability - expected = cast( - List[TColumnSchema], - deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS), - ) + expected = deepcopy(NULL_NO_PRECISION_COLUMNS if nullable else NOT_NULL_NO_PRECISION_COLUMNS) if backend == "pyarrow": expected = cast( List[TColumnSchema], diff --git a/tests/load/utils.py b/tests/load/utils.py index 5c24b2d1dc..5660202ec3 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -26,7 +26,10 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.configuration.specs import CredentialsConfiguration +from dlt.common.configuration.specs import ( + CredentialsConfiguration, + GcpOAuthCredentialsWithoutDefaults, +) from dlt.common.destination.reference import ( DestinationClientDwhConfiguration, JobClientBase, @@ -57,6 +60,7 @@ from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import ( ACTIVE_DESTINATIONS, + ACTIVE_TABLE_FORMATS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS, @@ -171,7 +175,9 @@ def destination_factory(self, **kwargs) -> Destination[Any, Any]: dest_type = kwargs.pop("destination", self.destination_type) dest_name = kwargs.pop("destination_name", self.destination_name) self.setup() - return Destination.from_reference(dest_type, destination_name=dest_name, **kwargs) + return Destination.from_reference( + dest_type, self.credentials, destination_name=dest_name, **kwargs + ) def raw_capabilities(self) -> DestinationCapabilitiesContext: dest = Destination.from_reference(self.destination_type) @@ -604,7 +610,7 @@ def destinations_configs( DestinationTestConfiguration( 
destination_type="filesystem", bucket_url=bucket, - extra_info=bucket + "-delta", + extra_info=bucket, table_format="delta", supports_merge=True, file_format="parquet", @@ -619,12 +625,33 @@ def destinations_configs( ), ) ] + if bucket == AZ_BUCKET: + # `pyiceberg` does not support `az` scheme + continue + destination_configs += [ + DestinationTestConfiguration( + destination_type="filesystem", + bucket_url=bucket, + extra_info=bucket, + table_format="iceberg", + supports_merge=False, + file_format="parquet", + destination_name="fsgcpoauth" if bucket == GCS_BUCKET else None, + ) + ] # filter out non active destinations destination_configs = [ conf for conf in destination_configs if conf.destination_type in ACTIVE_DESTINATIONS ] + # filter out non active table formats + destination_configs = [ + conf + for conf in destination_configs + if conf.table_format is None or conf.table_format in ACTIVE_TABLE_FORMATS + ] + # filter out destinations not in subset if subset: destination_configs = [ diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 0ae734f72e..e72a27c827 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -197,10 +197,23 @@ def _load_tables_to_dicts_fs( delta_tables = get_delta_tables(p, *table_names, schema_name=schema_name) + iceberg_table_names = [ + table_name + for table_name in table_names + if get_table_format(client.schema.tables, table_name) == "iceberg" + ] + if len(iceberg_table_names) > 0: + from dlt.common.libs.pyiceberg import get_iceberg_tables + + iceberg_tables = get_iceberg_tables(p, *table_names, schema_name=schema_name) + for table_name in table_names: if table_name in client.schema.data_table_names() and table_name in delta_table_names: dt = delta_tables[table_name] result[table_name] = dt.to_pyarrow_table().to_pylist() + elif table_name in client.schema.data_table_names() and table_name in iceberg_table_names: + it = iceberg_tables[table_name] + result[table_name] = it.scan().to_arrow().to_pylist() 
else: table_files = client.list_table_files(table_name) for file in table_files: diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py index 36fe009b93..e67ff9c70a 100644 --- a/tests/sources/helpers/rest_client/test_client.py +++ b/tests/sources/helpers/rest_client/test_client.py @@ -401,7 +401,7 @@ def test_paginate_json_body_without_params(self, rest_client) -> None: posts_skip = (DEFAULT_TOTAL_PAGES - 3) * DEFAULT_PAGE_SIZE class JSONBodyPageCursorPaginator(BaseReferencePaginator): - def update_state(self, response, data): + def update_state(self, response, data): # type: ignore[override] self._next_reference = response.json().get("next_page") def update_request(self, request): diff --git a/tests/utils.py b/tests/utils.py index 1aafa4bfe4..82d742ac65 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -32,6 +32,7 @@ from dlt.common.runtime.run_context import DOT_DLT, RunContext from dlt.common.runtime.telemetry import start_telemetry, stop_telemetry from dlt.common.schema import Schema +from dlt.common.schema.typing import TTableFormat from dlt.common.storages import FileStorage from dlt.common.storages.versioned_storage import VersionedStorage from dlt.common.typing import DictStrAny, StrAny, TDataItem @@ -88,6 +89,12 @@ ACTIVE_SQL_DESTINATIONS = SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) ACTIVE_NON_SQL_DESTINATIONS = NON_SQL_DESTINATIONS.intersection(ACTIVE_DESTINATIONS) +# filter out active table formats for current tests +IMPLEMENTED_TABLE_FORMATS = set(get_args(TTableFormat)) +ACTIVE_TABLE_FORMATS = set( + dlt.config.get("ACTIVE_TABLE_FORMATS", list) or IMPLEMENTED_TABLE_FORMATS +) + # sanity checks assert len(ACTIVE_DESTINATIONS) >= 0, "No active destinations selected"