Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make duckdb handle Iceberg table with nested types #2141

Merged
merged 6 commits into from
Dec 15, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion dlt/destinations/impl/filesystem/sql_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,9 @@ def create_views_for_tables(self, tables: Dict[str, str]) -> None:
self._setup_iceberg(self._conn)
metadata_path = f"{resolved_folder}/metadata"
last_metadata_file = _get_last_metadata_file(metadata_path, self.fs_client)
from_statement = f"iceberg_scan('{last_metadata_file}')"
# skip schema inference to make nested data types work
# https://github.com/duckdb/duckdb_iceberg/issues/47
from_statement = f"iceberg_scan('{last_metadata_file}', skip_schema_inference=True)"
elif first_file_type == "parquet":
from_statement = f"read_parquet([{resolved_files_string}])"
elif first_file_type == "jsonl":
Expand Down
10 changes: 9 additions & 1 deletion tests/load/filesystem/test_sql_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
)
from dlt.destinations import filesystem
from tests.utils import TEST_STORAGE_ROOT
from tests.cases import arrow_table_all_data_types
from dlt.destinations.exceptions import DatabaseUndefinedRelation


Expand Down Expand Up @@ -81,7 +82,11 @@ def double_items():
for i in range(total_records)
]

return [items, double_items]
@dlt.resource(table_format=table_format)
def arrow_all_types():
yield arrow_table_all_data_types("arrow-table")[0]

return [items, double_items, arrow_all_types]

# run source
pipeline.run(source(), loader_file_format=destination_config.file_format)
Expand All @@ -98,6 +103,9 @@ def double_items():

# check we can create new tables from the views
with pipeline.sql_client() as c:
# check if all data types are handled properly
c.execute_sql("SELECT * FROM arrow_all_types;")

c.execute_sql(
"CREATE TABLE items_joined AS (SELECT i.id, di.double_id FROM items as i JOIN"
" double_items as di ON (i.id = di.id));"
Expand Down
Loading