use text for json in clickhouse
sh-rp committed Apr 23, 2024
1 parent 0ca6c36 commit 5691b65
Showing 6 changed files with 12 additions and 18 deletions.
dlt/destinations/impl/clickhouse/clickhouse.py (2 additions, 2 deletions)
@@ -72,7 +72,7 @@

 class ClickHouseTypeMapper(TypeMapper):
     sct_to_unbound_dbt = {
-        "complex": "JSON",
+        "complex": "text",
         "text": "String",
         "double": "Float64",
         "bool": "Boolean",
@@ -182,7 +182,7 @@ def __init__(
             fmt=clickhouse_format,
             settings={
                 "allow_experimental_lightweight_delete": 1,
-                "allow_experimental_object_type": 1,
+                # "allow_experimental_object_type": 1,
                 "enable_http_compression": 1,
             },
             compression=compression,
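
A minimal sketch (not part of the commit) of what the remapping means in practice: with "complex" bound to the text type, nested values land in an ordinary ClickHouse String column as serialized JSON, so the experimental Object type, and the allow_experimental_object_type setting that enables it, are no longer needed. The row below is illustrative.

import json

# Illustrative only: a complex (nested) value becomes plain JSON text
# in a String column instead of a typed JSON/Object value.
row = {"id": 1, "payload": {"labels": ["bug", "docs"], "assignee": None}}

payload_as_text = json.dumps(row["payload"])
print(payload_as_text)  # {"labels": ["bug", "docs"], "assignee": null}
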
tests/cases.py (4 additions, 0 deletions)
@@ -198,6 +198,7 @@ def assert_all_data_types_row(
     schema: TTableSchemaColumns = None,
     expect_filtered_null_columns=False,
     allow_string_binary: bool = False,
+    expect_empty_string_for_null_complex: bool = False,
 ) -> None:
     # content must equal
     # print(db_row)
@@ -237,6 +238,9 @@
             ensure_pendulum_time(expected_rows["col11_precision"]), 3  # type: ignore[arg-type]
         )

+    if "col9_null" in expected_rows and expect_empty_string_for_null_complex:
+        expected_rows["col9_null"] = ""
+
     # redshift and bigquery return strings from structured fields
     for binary_col in ["col7", "col7_precision"]:
         if binary_col in db_mapping:
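
Why the expected value flips to an empty string (an editorial gloss on assumed ClickHouse semantics, not text from the commit): complex columns are now created as non-Nullable String, and ClickHouse materializes a NULL written to a non-Nullable column as the type's default value, which for String is "". A minimal sketch of the resulting check:

# Illustrative only: adjust the expectation for a null complex column
# before comparing against what ClickHouse returns.
expected_rows = {"col9": '{"a": 1}', "col9_null": None}
expect_empty_string_for_null_complex = True  # what the ClickHouse tests pass

if "col9_null" in expected_rows and expect_empty_string_for_null_complex:
    expected_rows["col9_null"] = ""

assert expected_rows["col9_null"] == ""
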
tests/load/pipeline/test_merge_disposition.py (3 additions, 7 deletions)
@@ -139,7 +139,7 @@ def test_merge_on_ad_hoc_primary_key(destination_config: DestinationTestConfiguration):


 @dlt.source(root_key=True)
-def github(remove_lists: bool = False):
+def github():
     @dlt.resource(
         table_name="issues",
         write_disposition="merge",
@@ -151,10+151,6 @@ def load_issues():
             "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8"
         ) as f:
             for item in json.load(f):
-                # for clickhouse we cannot have lists in json fields
-                if remove_lists:
-                    item.pop("assignees")
-                    item.pop("labels")
                 yield item

     return load_issues
@@ -217,7 +213,7 @@ def test_merge_source_compound_keys_and_changes(
 )
 def test_merge_no_child_tables(destination_config: DestinationTestConfiguration) -> None:
     p = destination_config.setup_pipeline("github_3", full_refresh=True)
-    github_data = github(True)
+    github_data = github()
     assert github_data.max_table_nesting is None
     assert github_data.root_key is True
     # set max nesting to 0 so no child tables are generated
@@ -236,7 +232,7 @@ def test_merge_no_child_tables(destination_config: DestinationTestConfiguration) -> None:
     assert github_1_counts["issues"] == 15

     # load all
-    github_data = github(True)
+    github_data = github()
     github_data.max_table_nesting = 0
     info = p.run(github_data, loader_file_format=destination_config.file_format)
     assert_load_info(info)
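
An illustrative round trip (not from the commit) of why the remove_lists workaround can go: now that complex fields are stored as serialized text, list-valued fields such as assignees and labels survive the load like any other JSON value, so the github source no longer needs to strip them for ClickHouse.

import json

# Illustrative only: list fields round-trip cleanly through JSON text.
item = {"id": 7, "assignees": ["alice", "bob"], "labels": [{"name": "bug"}]}
assert json.loads(json.dumps(item)) == item
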
tests/load/pipeline/test_pipelines.py (2 additions, 1 deletion)
@@ -795,7 +795,7 @@ def other_data():
     column_schemas = deepcopy(TABLE_UPDATE_COLUMNS_SCHEMA)

     # parquet on bigquery and clickhouse does not support JSON but we still want to run the test
-    if destination_config.destination in ["bigquery", "clickhouse"]:
+    if destination_config.destination in ["bigquery"]:
         column_schemas["col9_null"]["data_type"] = column_schemas["col9"]["data_type"] = "text"

     # duckdb 0.9.1 does not support TIME other than 6
@@ -873,6 +873,7 @@ def some_source():
         in ["snowflake", "bigquery", "redshift"],
         allow_string_binary=destination_config.destination == "clickhouse",
         timestamp_precision=3 if destination_config.destination in ("athena", "dremio") else 6,
+        expect_empty_string_for_null_complex=destination_config.destination == "clickhouse",
     )

tests/load/pipeline/test_stage_loading.py (1 addition, 4 deletions)
@@ -204,10 +204,6 @@ def test_all_data_types(destination_config: DestinationTestConfiguration) -> None:
         exclude_types=exclude_types, exclude_columns=exclude_columns
     )

-    # clickhouse json is experimental, will not work for parquet and makes strange changes for jsonl
-    if destination_config.destination in ["clickhouse"]:
-        column_schemas["col9_null"]["data_type"] = column_schemas["col9"]["data_type"] = "text"
-
     # bigquery and clickhouse cannot load into JSON fields from parquet
     if destination_config.file_format == "parquet":
         if destination_config.destination in ["bigquery"]:
@@ -260,4 +256,5 @@ def my_source():
         allow_string_binary=allow_string_binary,
         timestamp_precision=sql_client.capabilities.timestamp_precision,
         schema=column_schemas,
+        expect_empty_string_for_null_complex=destination_config.destination == "clickhouse",
     )
tests/load/test_job_client.py (0 additions, 4 deletions)
@@ -521,10 +521,6 @@ def test_load_with_all_types(
         ),
     )

-    # switch complex to string for clickhouse
-    if client.config.destination_type in ["clickhouse"]:
-        column_schemas["col9_null"]["data_type"] = column_schemas["col9"]["data_type"] = "text"
-
     # we should have identical content with all disposition types
     client.schema.update_table(
         new_table(
