From 5691b650257da3d12d2c5f02b5677a4e51eb7322 Mon Sep 17 00:00:00 2001
From: Dave
Date: Tue, 23 Apr 2024 17:59:23 +0200
Subject: [PATCH] use text for json in clickhouse

---
 dlt/destinations/impl/clickhouse/clickhouse.py |  4 ++--
 tests/cases.py                                 |  4 ++++
 tests/load/pipeline/test_merge_disposition.py  | 10 +++-------
 tests/load/pipeline/test_pipelines.py          |  3 ++-
 tests/load/pipeline/test_stage_loading.py      |  5 +----
 tests/load/test_job_client.py                  |  4 ----
 6 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/dlt/destinations/impl/clickhouse/clickhouse.py b/dlt/destinations/impl/clickhouse/clickhouse.py
index 3f95bae5ea..7f6e5b2429 100644
--- a/dlt/destinations/impl/clickhouse/clickhouse.py
+++ b/dlt/destinations/impl/clickhouse/clickhouse.py
@@ -72,7 +72,7 @@ class ClickHouseTypeMapper(TypeMapper):
     sct_to_unbound_dbt = {
-        "complex": "JSON",
+        "complex": "text",
         "text": "String",
         "double": "Float64",
         "bool": "Boolean",
@@ -182,7 +182,7 @@ def __init__(
             fmt=clickhouse_format,
             settings={
                 "allow_experimental_lightweight_delete": 1,
-                "allow_experimental_object_type": 1,
+                # "allow_experimental_object_type": 1,
                 "enable_http_compression": 1,
             },
             compression=compression,
diff --git a/tests/cases.py b/tests/cases.py
index 15e3fef091..0ba2c05f40 100644
--- a/tests/cases.py
+++ b/tests/cases.py
@@ -198,6 +198,7 @@ def assert_all_data_types_row(
     schema: TTableSchemaColumns = None,
     expect_filtered_null_columns=False,
     allow_string_binary: bool = False,
+    expect_empty_string_for_null_complex: bool = False,
 ) -> None:
     # content must equal
     # print(db_row)
@@ -237,6 +238,9 @@ def assert_all_data_types_row(
         ensure_pendulum_time(expected_rows["col11_precision"]), 3  # type: ignore[arg-type]
     )
 
+    if "col9_null" in expected_rows and expect_empty_string_for_null_complex:
+        expected_rows["col9_null"] = ""
+
     # redshift and bigquery return strings from structured fields
     for binary_col in ["col7", "col7_precision"]:
         if binary_col in db_mapping:
diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py
index d2978e105a..aaa8a73571 100644
--- a/tests/load/pipeline/test_merge_disposition.py
+++ b/tests/load/pipeline/test_merge_disposition.py
@@ -139,7 +139,7 @@ def test_merge_on_ad_hoc_primary_key(destination_config: DestinationTestConfigur
 
 
 @dlt.source(root_key=True)
-def github(remove_lists: bool = False):
+def github():
     @dlt.resource(
         table_name="issues",
         write_disposition="merge",
@@ -151,10 +151,6 @@ def load_issues():
             "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8"
         ) as f:
             for item in json.load(f):
-                # for clickhouse we cannot have lists in json fields
-                if remove_lists:
-                    item.pop("assignees")
-                    item.pop("labels")
                 yield item
 
     return load_issues
@@ -217,7 +213,7 @@ def test_merge_source_compound_keys_and_changes(
 )
 def test_merge_no_child_tables(destination_config: DestinationTestConfiguration) -> None:
     p = destination_config.setup_pipeline("github_3", full_refresh=True)
-    github_data = github(True)
+    github_data = github()
     assert github_data.max_table_nesting is None
     assert github_data.root_key is True
     # set max nesting to 0 so no child tables are generated
@@ -236,7 +232,7 @@ def test_merge_no_child_tables(destination_config: DestinationTestConfiguration)
     assert github_1_counts["issues"] == 15
 
     # load all
-    github_data = github(True)
+    github_data = github()
     github_data.max_table_nesting = 0
     info = p.run(github_data, loader_file_format=destination_config.file_format)
     assert_load_info(info)
diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py
index 05dae6db61..818b6fa6e1 100644
--- a/tests/load/pipeline/test_pipelines.py
+++ b/tests/load/pipeline/test_pipelines.py
@@ -795,7 +795,7 @@ def other_data():
     column_schemas = deepcopy(TABLE_UPDATE_COLUMNS_SCHEMA)
 
     # parquet on bigquery and clickhouse does not support JSON but we still want to run the test
-    if destination_config.destination in ["bigquery", "clickhouse"]:
+    if destination_config.destination in ["bigquery"]:
         column_schemas["col9_null"]["data_type"] = column_schemas["col9"]["data_type"] = "text"
 
     # duckdb 0.9.1 does not support TIME other than 6
@@ -873,6 +873,7 @@ def some_source():
         in ["snowflake", "bigquery", "redshift"],
         allow_string_binary=destination_config.destination == "clickhouse",
         timestamp_precision=3 if destination_config.destination in ("athena", "dremio") else 6,
+        expect_empty_string_for_null_complex=destination_config.destination == "clickhouse",
     )
 
 
diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py
index 60a7be259b..f6c47f5ecd 100644
--- a/tests/load/pipeline/test_stage_loading.py
+++ b/tests/load/pipeline/test_stage_loading.py
@@ -204,10 +204,6 @@ def test_all_data_types(destination_config: DestinationTestConfiguration) -> Non
         exclude_types=exclude_types, exclude_columns=exclude_columns
     )
 
-    # clickhouse json is experimental, will not work for parquet and makes strange changes for jsonl
-    if destination_config.destination in ["clickhouse"]:
-        column_schemas["col9_null"]["data_type"] = column_schemas["col9"]["data_type"] = "text"
-
     # bigquery and clickhouse cannot load into JSON fields from parquet
     if destination_config.file_format == "parquet":
         if destination_config.destination in ["bigquery"]:
@@ -260,4 +256,5 @@ def my_source():
         allow_string_binary=allow_string_binary,
         timestamp_precision=sql_client.capabilities.timestamp_precision,
         schema=column_schemas,
+        expect_empty_string_for_null_complex=destination_config.destination == "clickhouse",
     )
diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py
index bd9f246aff..4c442d1ff9 100644
--- a/tests/load/test_job_client.py
+++ b/tests/load/test_job_client.py
@@ -521,10 +521,6 @@ def test_load_with_all_types(
         ),
     )
 
-    # switch complex to string for clickhouse
-    if client.config.destination_type in ["clickhouse"]:
-        column_schemas["col9_null"]["data_type"] = column_schemas["col9"]["data_type"] = "text"
-
     # we should have identical content with all disposition types
     client.schema.update_table(
         new_table(
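--

Reviewer note (not part of the patch): with "allow_experimental_object_type" no
longer enabled and "complex" mapped to a plain string type, nested values now
land in ClickHouse as JSON-encoded text instead of native JSON objects, which
is also why the remove_lists workaround in the github test source can go away.
Below is a minimal sketch of the downstream effect. It assumes a configured
ClickHouse destination; the resource name "items" and the column "payload" are
hypothetical and only serve as illustration:

    import json
    import dlt

    # after this patch, "complex" columns are stored as String, so nested
    # Python values (including lists) are serialized to JSON text on load
    @dlt.resource(columns={"payload": {"data_type": "complex"}})
    def items():
        yield {"id": 1, "payload": {"labels": ["bug", "p1"], "assignees": []}}

    pipeline = dlt.pipeline(destination="clickhouse", dataset_name="example")
    pipeline.run(items())

    with pipeline.sql_client() as client:
        with client.execute_query("SELECT payload FROM items WHERE id = 1") as cur:
            (raw,) = cur.fetchone()

    # the value comes back as a string and must be decoded by the consumer;
    # this matches expect_empty_string_for_null_complex in the tests, where a
    # NULL complex value round-trips as an empty string rather than None
    payload = json.loads(raw)
    assert payload["labels"] == ["bug", "p1"]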