From 865e0e974268654c98a60c7496a109e69868f7de Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 12 Dec 2024 11:13:58 +0400 Subject: [PATCH] make duckdb handle iceberg table with nested types --- dlt/destinations/impl/filesystem/sql_client.py | 4 +++- tests/load/filesystem/test_sql_client.py | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/dlt/destinations/impl/filesystem/sql_client.py b/dlt/destinations/impl/filesystem/sql_client.py index d39f4c3431..dd1fc6a330 100644 --- a/dlt/destinations/impl/filesystem/sql_client.py +++ b/dlt/destinations/impl/filesystem/sql_client.py @@ -266,7 +266,9 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None: self._setup_iceberg(self._conn) metadata_path = f"{resolved_folder}/metadata" last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client) - from_statement = f"iceberg_scan('{last_metadata_file}')" + # skip schema inference to make nested data types work + # https://github.com/duckdb/duckdb_iceberg/issues/47 + from_statement = f"iceberg_scan('{last_metadata_file}', skip_schema_inference=True)" elif first_file_type == "parquet": from_statement = f"read_parquet([{resolved_files_string}])" elif first_file_type == "jsonl": diff --git a/tests/load/filesystem/test_sql_client.py b/tests/load/filesystem/test_sql_client.py index a73b0f7e31..1d5eaa9984 100644 --- a/tests/load/filesystem/test_sql_client.py +++ b/tests/load/filesystem/test_sql_client.py @@ -22,6 +22,7 @@ ) from dlt.destinations import filesystem from tests.utils import TEST_STORAGE_ROOT +from tests.cases import arrow_table_all_data_types from dlt.destinations.exceptions import DatabaseUndefinedRelation @@ -81,7 +82,11 @@ def double_items(): for i in range(total_records) ] - return [items, double_items] + @dlt.resource(table_format=table_format) + def arrow_all_types(): + yield arrow_table_all_data_types("arrow-table")[0] + + return [items, double_items, arrow_all_types] # run source pipeline.run(source(), loader_file_format=destination_config.file_format) @@ -98,6 +103,9 @@ def double_items(): # check we can create new tables from the views with pipeline.sql_client() as c: + # check if all data types are handled properly + c.execute_sql("SELECT * FROM arrow_all_types;") + c.execute_sql( "CREATE TABLE items_joined AS (SELECT i.id, di.double_id FROM items as i JOIN" " double_items as di ON (i.id = di.id));"