
Commit b87dd1b

Author: Jorrit Sandbrink (committed)
refine staging table indexing
1 parent 6d14d57 commit b87dd1b

File tree

3 files changed: +37 -9 lines changed

dlt/destinations/impl/synapse/synapse.py (+25 -7)

@@ -70,12 +70,22 @@ def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None:
     def _get_table_update_sql(
         self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool
     ) -> List[str]:
-        table = self.get_load_table(table_name)
+        table = self.get_load_table(table_name, staging=self.in_staging_mode)
         if table is None:
             table_index_type = self.config.default_table_index_type
         else:
             table_index_type = cast(TTableIndexType, table.get(TABLE_INDEX_TYPE_HINT))
-            if table_index_type == "clustered_columnstore_index":
+            if self.in_staging_mode:
+                final_table = self.get_load_table(table_name, staging=False)
+                final_table_index_type = cast(
+                    TTableIndexType, final_table.get(TABLE_INDEX_TYPE_HINT)
+                )
+            else:
+                final_table_index_type = table_index_type
+            if final_table_index_type == "clustered_columnstore_index":
+                # Even if the staging table has index type "heap", we still adjust
+                # the column data types to prevent errors when writing into the
+                # final table that has index type "clustered_columnstore_index".
                 new_columns = self._get_columstore_valid_columns(new_columns)

         _sql_result = SqlJobClientBase._get_table_update_sql(

@@ -129,12 +139,20 @@ def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema
         table = super().get_load_table(table_name, staging)
         if table is None:
             return None
-        if table_name in self.schema.dlt_table_names():
-            # dlt tables should always be heap tables, regardless of the user
-            # configuration. Why? "For small lookup tables, less than 60 million rows,
-            # consider using HEAP or clustered index for faster query performance."
-            # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables
+        if staging and self.config.replace_strategy == "insert-from-staging":
+            # Staging tables should always be heap tables, because "when you are
+            # temporarily landing data in dedicated SQL pool, you may find that
+            # using a heap table makes the overall process faster."
+            # "staging-optimized" is not included, because in that strategy the
+            # staging table becomes the final table, so we should already create
+            # it with the desired index type.
+            table[TABLE_INDEX_TYPE_HINT] = "heap"  # type: ignore[typeddict-unknown-key]
+        elif table_name in self.schema.dlt_table_names():
+            # dlt tables should always be heap tables, because "for small lookup
+            # tables, less than 60 million rows, consider using HEAP or clustered
+            # index for faster query performance."
             table[TABLE_INDEX_TYPE_HINT] = "heap"  # type: ignore[typeddict-unknown-key]
+            # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables
         elif table_name in self.schema.data_table_names():
             if TABLE_INDEX_TYPE_HINT not in table:
                 # If present in parent table, fetch hint from there.
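The comment in the first hunk refers to adjusting column data types for columnstore compatibility. Below is a hypothetical sketch of that kind of adjustment; it is not the actual _get_columstore_valid_columns implementation. It assumes dlt-style column schemas (dicts with "data_type" and an optional "precision") and uses the default lengths mentioned in the Synapse docs changed below (varchar(4000), nvarchar(4000), varbinary(8000)).

# Hypothetical sketch only -- not the actual _get_columstore_valid_columns implementation.
from typing import Any, Dict, List, Sequence

# assumed default lengths, taken from the Synapse destination docs
COLUMNSTORE_DEFAULT_PRECISION = {"text": 4000, "binary": 8000}

def columnstore_valid_columns(new_columns: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
    adjusted = []
    for column in new_columns:
        column = dict(column)  # do not mutate the caller's schema
        data_type = column.get("data_type")
        if data_type in COLUMNSTORE_DEFAULT_PRECISION and "precision" not in column:
            # give the column an explicit length so it does not map to a (max) type,
            # which CLUSTERED COLUMNSTORE INDEX tables do not support
            column["precision"] = COLUMNSTORE_DEFAULT_PRECISION[data_type]
        adjusted.append(column)
    return adjusted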

docs/website/docs/dlt-ecosystem/destinations/synapse.md (+2 -1)

@@ -135,8 +135,9 @@ Possible values:
 >* **Set `default_table_index_type` to `"clustered_columnstore_index"` if you want to change the default** (see [additional destination options](#additional-destination-options)).
 >* **CLUSTERED COLUMNSTORE INDEX tables do not support the `varchar(max)`, `nvarchar(max)`, and `varbinary(max)` data types.** If you don't specify the `precision` for columns that map to any of these types, `dlt` will use the maximum lengths `varchar(4000)`, `nvarchar(4000)`, and `varbinary(8000)`.
 >* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice, because it supports all data types and doesn't require conversions.
->* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense
+>* **When using the `insert-from-staging` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense
 for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables).
+>* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are already created with the configured table index type**, because the staging table becomes the final table.
 >* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance."
 >* Child tables, if any, inherent the table index type of their parent table.
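For reference, the two options discussed in this docs section can be combined as in the minimal sketch below. The DESTINATION__REPLACE_STRATEGY environment variable is taken from the test change further down; the exact environment path for default_table_index_type is an assumption based on the Synapse destination's config section.

# Minimal configuration sketch (environment-variable form; paths partly assumed).
import os

# replace strategy used for full loads
os.environ["DESTINATION__REPLACE_STRATEGY"] = "insert-from-staging"
# assumed env path for the Synapse destination's default_table_index_type option
os.environ["DESTINATION__SYNAPSE__DEFAULT_TABLE_INDEX_TYPE"] = "clustered_columnstore_index"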

tests/load/synapse/test_synapse_table_indexing.py (+10 -1)

@@ -98,13 +98,22 @@ def items_with_table_index_type_specified() -> Iterator[Any]:
 @pytest.mark.parametrize(
     "table_index_type,column_schema", TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID
 )
+@pytest.mark.parametrize(
+    # Also test staging replace strategies, to make sure the final table index
+    # type is not affected by staging table index type adjustments.
+    "replace_strategy",
+    ["insert-from-staging", "staging-optimized"],
+)
 def test_resource_table_index_type_configuration(
     table_index_type: TTableIndexType,
     column_schema: Union[List[TColumnSchema], None],
+    replace_strategy: str,
 ) -> None:
+    os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy
+
     @dlt.resource(
         name="items_with_table_index_type_specified",
-        write_disposition="append",
+        write_disposition="replace",
         columns=column_schema,
     )
     def items_with_table_index_type_specified() -> Iterator[Any]:
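A hypothetical end-to-end usage mirroring what this test exercises is sketched below; the resource, pipeline, and dataset names are made up, and running it requires Synapse credentials.

# Hypothetical usage sketch based on the test above; names are illustrative.
import os
from typing import Any, Iterator

import dlt

os.environ["DESTINATION__REPLACE_STRATEGY"] = "staging-optimized"

@dlt.resource(name="items", write_disposition="replace")
def items() -> Iterator[Any]:
    yield from ({"id": i, "value": f"item {i}"} for i in range(10))

pipeline = dlt.pipeline(
    pipeline_name="synapse_indexing_example",
    destination="synapse",
    dataset_name="indexing_example",
)
pipeline.run(items())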
