From 7bc2163ff001b9a3299827e1d3ddf0da021f36d6 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 18 Jan 2024 01:18:56 +0100 Subject: [PATCH 01/23] Synapse destination initial commit --- .../workflows/test_destination_synapse.yml | 22 ++- dlt/common/data_writers/escape.py | 10 +- dlt/common/data_writers/writers.py | 24 +++- dlt/common/destination/capabilities.py | 1 + dlt/destinations/__init__.py | 2 + dlt/destinations/impl/mssql/configuration.py | 31 +++-- dlt/destinations/impl/mssql/sql_client.py | 7 +- dlt/destinations/impl/synapse/README.md | 58 ++++++++ dlt/destinations/impl/synapse/__init__.py | 46 +++++++ .../impl/synapse/configuration.py | 38 +++++ dlt/destinations/impl/synapse/factory.py | 51 +++++++ dlt/destinations/impl/synapse/sql_client.py | 28 ++++ dlt/destinations/impl/synapse/synapse.py | 99 +++++++++++++ dlt/destinations/insert_job_client.py | 18 ++- dlt/helpers/dbt/profiles.yml | 18 ++- poetry.lock | 3 +- pyproject.toml | 1 + tests/load/mssql/test_mssql_credentials.py | 69 +++++++--- tests/load/mssql/test_mssql_table_builder.py | 3 +- tests/load/pipeline/test_dbt_helper.py | 7 +- .../load/pipeline/test_replace_disposition.py | 6 + tests/load/synapse/__init__.py | 3 + .../synapse/test_synapse_configuration.py | 46 +++++++ .../synapse/test_synapse_table_builder.py | 130 ++++++++++++++++++ tests/load/test_job_client.py | 3 +- tests/load/test_sql_client.py | 12 +- tests/load/utils.py | 7 +- tests/utils.py | 1 + 28 files changed, 672 insertions(+), 72 deletions(-) create mode 100644 dlt/destinations/impl/synapse/README.md create mode 100644 dlt/destinations/impl/synapse/__init__.py create mode 100644 dlt/destinations/impl/synapse/configuration.py create mode 100644 dlt/destinations/impl/synapse/factory.py create mode 100644 dlt/destinations/impl/synapse/sql_client.py create mode 100644 dlt/destinations/impl/synapse/synapse.py create mode 100644 tests/load/synapse/__init__.py create mode 100644 tests/load/synapse/test_synapse_configuration.py create mode 100644 tests/load/synapse/test_synapse_table_builder.py diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index 83800fa789..ecd890d32a 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -5,7 +5,6 @@ on: branches: - master - devel - workflow_dispatch: env: @@ -18,19 +17,14 @@ env: ALL_FILESYSTEM_DRIVERS: "[\"memory\"]" jobs: - - build: - runs-on: ubuntu-latest - - steps: - - name: Check source branch name - run: | - if [[ "${{ github.head_ref }}" != "synapse" ]]; then - exit 1 - fi + get_docs_changes: + uses: ./.github/workflows/get_docs_changes.yml + if: ${{ !github.event.pull_request.head.repo.fork }} run_loader: name: Tests Synapse loader + needs: get_docs_changes + if: needs.get_docs_changes.outputs.changes_outside_docs == 'true' strategy: fail-fast: false matrix: @@ -69,17 +63,17 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E s3 -E gs -E az --with sentry-sdk --with pipeline + run: poetry install --no-interaction -E synapse -E parquet --with sentry-sdk --with pipeline - name: create secrets.toml run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - run: | - poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os != 'Windows' name: Run tests Linux/MAC - run: | - 
poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os == 'Windows' name: Run tests Windows shell: cmd diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py index 5bf8f29ccb..b56a0d8f19 100644 --- a/dlt/common/data_writers/escape.py +++ b/dlt/common/data_writers/escape.py @@ -98,8 +98,14 @@ def escape_mssql_literal(v: Any) -> Any: json.dumps(v), prefix="N'", escape_dict=MS_SQL_ESCAPE_DICT, escape_re=MS_SQL_ESCAPE_RE ) if isinstance(v, bytes): - base_64_string = base64.b64encode(v).decode("ascii") - return f"""CAST('' AS XML).value('xs:base64Binary("{base_64_string}")', 'VARBINARY(MAX)')""" + # 8000 is the max value for n in VARBINARY(n) + # https://learn.microsoft.com/en-us/sql/t-sql/data-types/binary-and-varbinary-transact-sql + if len(v) <= 8000: + n = len(v) + else: + n = "MAX" + return f"CONVERT(VARBINARY({n}), '{v.hex()}', 2)" + if isinstance(v, bool): return str(int(v)) if v is None: diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 0f9ff09259..0f3640da1e 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -175,18 +175,29 @@ def write_header(self, columns_schema: TTableSchemaColumns) -> None: # do not write INSERT INTO command, this must be added together with table name by the loader self._f.write("INSERT INTO {}(") self._f.write(",".join(map(self._caps.escape_identifier, headers))) - self._f.write(")\nVALUES\n") + if self._caps.insert_values_writer_type == "default": + self._f.write(")\nVALUES\n") + elif self._caps.insert_values_writer_type == "select_union": + self._f.write(")\n") def write_data(self, rows: Sequence[Any]) -> None: super().write_data(rows) - def write_row(row: StrAny) -> None: + def write_row(row: StrAny, last_row: bool = False) -> None: output = ["NULL"] * len(self._headers_lookup) for n, v in row.items(): output[self._headers_lookup[n]] = self._caps.escape_literal(v) - self._f.write("(") - self._f.write(",".join(output)) - self._f.write(")") + if self._caps.insert_values_writer_type == "default": + self._f.write("(") + self._f.write(",".join(output)) + self._f.write(")") + if not last_row: + self._f.write(",\n") + elif self._caps.insert_values_writer_type == "select_union": + self._f.write("SELECT ") + self._f.write(",".join(output)) + if not last_row: + self._f.write("\nUNION ALL\n") # if next chunk add separator if self._chunks_written > 0: @@ -195,10 +206,9 @@ def write_row(row: StrAny) -> None: # write rows for row in rows[:-1]: write_row(row) - self._f.write(",\n") # write last row without separator so we can write footer eventually - write_row(rows[-1]) + write_row(rows[-1], last_row=True) self._chunks_written += 1 def write_footer(self) -> None: diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index 2596b2bf99..08c7a31388 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -52,6 +52,7 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): schema_supports_numeric_precision: bool = True timestamp_precision: int = 6 max_rows_per_insert: Optional[int] = None + insert_values_writer_type: str = "default" # do not allow to create default value, destination caps must be always explicitly inserted into container can_create_default: ClassVar[bool] = False diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index 980c4ce7f2..775778cd4a 100644 --- 
a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -10,6 +10,7 @@ from dlt.destinations.impl.qdrant.factory import qdrant from dlt.destinations.impl.motherduck.factory import motherduck from dlt.destinations.impl.weaviate.factory import weaviate +from dlt.destinations.impl.synapse.factory import synapse __all__ = [ @@ -25,4 +26,5 @@ "qdrant", "motherduck", "weaviate", + "synapse", ] diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index f33aca4b82..f00998cfb2 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -1,4 +1,4 @@ -from typing import Final, ClassVar, Any, List, Optional, TYPE_CHECKING +from typing import Final, ClassVar, Any, List, Dict, Optional, TYPE_CHECKING from sqlalchemy.engine import URL from dlt.common.configuration import configspec @@ -10,9 +10,6 @@ from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration -SUPPORTED_DRIVERS = ["ODBC Driver 18 for SQL Server", "ODBC Driver 17 for SQL Server"] - - @configspec class MsSqlCredentials(ConnectionStringCredentials): drivername: Final[str] = "mssql" # type: ignore @@ -24,22 +21,27 @@ class MsSqlCredentials(ConnectionStringCredentials): __config_gen_annotations__: ClassVar[List[str]] = ["port", "connect_timeout"] + SUPPORTED_DRIVERS: ClassVar[List[str]] = [ + "ODBC Driver 18 for SQL Server", + "ODBC Driver 17 for SQL Server", + ] + def parse_native_representation(self, native_value: Any) -> None: # TODO: Support ODBC connection string or sqlalchemy URL super().parse_native_representation(native_value) if self.query is not None: self.query = {k.lower(): v for k, v in self.query.items()} # Make case-insensitive. - if "driver" in self.query and self.query.get("driver") not in SUPPORTED_DRIVERS: - raise SystemConfigurationException( - f"""The specified driver "{self.query.get('driver')}" is not supported.""" - f" Choose one of the supported drivers: {', '.join(SUPPORTED_DRIVERS)}." - ) self.driver = self.query.get("driver", self.driver) self.connect_timeout = int(self.query.get("connect_timeout", self.connect_timeout)) if not self.is_partial(): self.resolve() def on_resolved(self) -> None: + if self.driver not in self.SUPPORTED_DRIVERS: + raise SystemConfigurationException( + f"""The specified driver "{self.driver}" is not supported.""" + f" Choose one of the supported drivers: {', '.join(self.SUPPORTED_DRIVERS)}." + ) self.database = self.database.lower() def to_url(self) -> URL: @@ -55,20 +57,21 @@ def on_partial(self) -> None: def _get_driver(self) -> str: if self.driver: return self.driver + # Pick a default driver if available import pyodbc available_drivers = pyodbc.drivers() - for d in SUPPORTED_DRIVERS: + for d in self.SUPPORTED_DRIVERS: if d in available_drivers: return d docs_url = "https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16" raise SystemConfigurationException( f"No supported ODBC driver found for MS SQL Server. See {docs_url} for information on" - f" how to install the '{SUPPORTED_DRIVERS[0]}' on your platform." + f" how to install the '{self.SUPPORTED_DRIVERS[0]}' on your platform." 
) - def to_odbc_dsn(self) -> str: + def _get_odbc_dsn_dict(self) -> Dict[str, Any]: params = { "DRIVER": self.driver, "SERVER": f"{self.host},{self.port}", @@ -78,6 +81,10 @@ def to_odbc_dsn(self) -> str: } if self.query is not None: params.update({k.upper(): v for k, v in self.query.items()}) + return params + + def to_odbc_dsn(self) -> str: + params = self._get_odbc_dsn_dict() return ";".join([f"{k}={v}" for k, v in params.items()]) diff --git a/dlt/destinations/impl/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py index 427518feeb..2ddd56350e 100644 --- a/dlt/destinations/impl/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -106,8 +106,8 @@ def drop_dataset(self) -> None: ) table_names = [row[0] for row in rows] self.drop_tables(*table_names) - - self.execute_sql("DROP SCHEMA IF EXISTS %s;" % self.fully_qualified_dataset_name()) + # Drop schema + self._drop_schema() def _drop_views(self, *tables: str) -> None: if not tables: @@ -117,6 +117,9 @@ def _drop_views(self, *tables: str) -> None: ] self.execute_fragments(statements) + def _drop_schema(self) -> None: + self.execute_sql("DROP SCHEMA IF EXISTS %s;" % self.fully_qualified_dataset_name()) + def execute_sql( self, sql: AnyStr, *args: Any, **kwargs: Any ) -> Optional[Sequence[Sequence[Any]]]: diff --git a/dlt/destinations/impl/synapse/README.md b/dlt/destinations/impl/synapse/README.md new file mode 100644 index 0000000000..b133faf67a --- /dev/null +++ b/dlt/destinations/impl/synapse/README.md @@ -0,0 +1,58 @@ +# Set up loader user +Execute the following SQL statements to set up the [loader](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) user: +```sql +-- on master database + +CREATE LOGIN loader WITH PASSWORD = 'YOUR_LOADER_PASSWORD_HERE'; +``` + +```sql +-- on minipool database + +CREATE USER loader FOR LOGIN loader; + +-- DDL permissions +GRANT CREATE TABLE ON DATABASE :: minipool TO loader; +GRANT CREATE VIEW ON DATABASE :: minipool TO loader; + +-- DML permissions +GRANT SELECT ON DATABASE :: minipool TO loader; +GRANT INSERT ON DATABASE :: minipool TO loader; +GRANT ADMINISTER DATABASE BULK OPERATIONS TO loader; +``` + +```sql +-- https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-workload-isolation + +CREATE WORKLOAD GROUP DataLoads +WITH ( + MIN_PERCENTAGE_RESOURCE = 0 + ,CAP_PERCENTAGE_RESOURCE = 50 + ,REQUEST_MIN_RESOURCE_GRANT_PERCENT = 25 +); + +CREATE WORKLOAD CLASSIFIER [wgcELTLogin] +WITH ( + WORKLOAD_GROUP = 'DataLoads' + ,MEMBERNAME = 'loader' +); +``` + +# config.toml +```toml +[destination.synapse.credentials] +database = "minipool" +username = "loader" +host = "dlt-synapse-ci.sql.azuresynapse.net" +port = 1433 +driver = "ODBC Driver 18 for SQL Server" + +[destination.synapse] +create_indexes = false +``` + +# secrets.toml +```toml +[destination.synapse.credentials] +password = "YOUR_LOADER_PASSWORD_HERE" +``` \ No newline at end of file diff --git a/dlt/destinations/impl/synapse/__init__.py b/dlt/destinations/impl/synapse/__init__.py new file mode 100644 index 0000000000..175b011186 --- /dev/null +++ b/dlt/destinations/impl/synapse/__init__.py @@ -0,0 +1,46 @@ +from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE +from dlt.common.wei import EVM_DECIMAL_PRECISION + + +def 
capabilities() -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + + caps.preferred_loader_file_format = "insert_values" + caps.supported_loader_file_formats = ["insert_values"] + caps.preferred_staging_file_format = None + caps.supported_staging_file_formats = [] + + caps.insert_values_writer_type = "select_union" # https://stackoverflow.com/a/77014299 + + caps.escape_identifier = escape_postgres_identifier + caps.escape_literal = escape_mssql_literal + + # Synapse has a max precision of 38 + # https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#DataTypes + caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) + caps.wei_precision = (DEFAULT_NUMERIC_PRECISION, 0) + + # https://learn.microsoft.com/en-us/sql/t-sql/statements/create-table-azure-sql-data-warehouse?view=aps-pdw-2016-au7#LimitationsRestrictions + caps.max_identifier_length = 128 + caps.max_column_identifier_length = 128 + + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-service-capacity-limits#queries + caps.max_query_length = 65536 * 4096 + caps.is_max_query_length_in_bytes = True + + # nvarchar(max) can store 2 GB + # https://learn.microsoft.com/en-us/sql/t-sql/data-types/nchar-and-nvarchar-transact-sql?view=sql-server-ver16#nvarchar---n--max-- + caps.max_text_data_type_length = 2 * 1024 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = True + + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-develop-transactions + caps.supports_transactions = True + caps.supports_ddl_transactions = False + + # datetimeoffset can store 7 digits for fractional seconds + # https://learn.microsoft.com/en-us/sql/t-sql/data-types/datetimeoffset-transact-sql?view=sql-server-ver16 + caps.timestamp_precision = 7 + + return caps diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py new file mode 100644 index 0000000000..0596cc2c46 --- /dev/null +++ b/dlt/destinations/impl/synapse/configuration.py @@ -0,0 +1,38 @@ +from typing import Final, Any, List, Dict, Optional, ClassVar + +from dlt.common.configuration import configspec + +from dlt.destinations.impl.mssql.configuration import ( + MsSqlCredentials, + MsSqlClientConfiguration, +) +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials + + +@configspec +class SynapseCredentials(MsSqlCredentials): + drivername: Final[str] = "synapse" # type: ignore + + # LongAsMax keyword got introduced in ODBC Driver 18 for SQL Server. + SUPPORTED_DRIVERS: ClassVar[List[str]] = ["ODBC Driver 18 for SQL Server"] + + def _get_odbc_dsn_dict(self) -> Dict[str, Any]: + params = super()._get_odbc_dsn_dict() + # Long types (text, ntext, image) are not supported on Synapse. + # Convert to max types using LongAsMax keyword. + # https://stackoverflow.com/a/57926224 + params["LONGASMAX"] = "yes" + return params + + +@configspec +class SynapseClientConfiguration(MsSqlClientConfiguration): + destination_type: Final[str] = "synapse" # type: ignore + credentials: SynapseCredentials + + # Determines if `primary_key` and `unique` column hints are applied. + # Set to False by default because the PRIMARY KEY and UNIQUE constraints + # are tricky in Synapse: they are NOT ENFORCED and can lead to innacurate + # results if the user does not ensure all column values are unique. 
+ # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints + create_indexes: bool = False diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py new file mode 100644 index 0000000000..fa7facc0ca --- /dev/null +++ b/dlt/destinations/impl/synapse/factory.py @@ -0,0 +1,51 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.synapse import capabilities + +from dlt.destinations.impl.synapse.configuration import ( + SynapseCredentials, + SynapseClientConfiguration, +) + +if t.TYPE_CHECKING: + from dlt.destinations.impl.synapse.synapse import SynapseClient + + +class synapse(Destination[SynapseClientConfiguration, "SynapseClient"]): + spec = SynapseClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["SynapseClient"]: + from dlt.destinations.impl.synapse.synapse import SynapseClient + + return SynapseClient + + def __init__( + self, + credentials: t.Union[SynapseCredentials, t.Dict[str, t.Any], str] = None, + create_indexes: bool = False, + destination_name: t.Optional[str] = None, + environment: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + """Configure the Synapse destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the Synapse dedicated pool. Can be an instance of `SynapseCredentials` or + a connection string in the format `synapse://user:password@host:port/database` + create_indexes: Should unique indexes be created, defaults to False + **kwargs: Additional arguments passed to the destination config + """ + super().__init__( + credentials=credentials, + create_indexes=create_indexes, + destination_name=destination_name, + environment=environment, + **kwargs, + ) diff --git a/dlt/destinations/impl/synapse/sql_client.py b/dlt/destinations/impl/synapse/sql_client.py new file mode 100644 index 0000000000..089c58e57c --- /dev/null +++ b/dlt/destinations/impl/synapse/sql_client.py @@ -0,0 +1,28 @@ +from typing import ClassVar +from contextlib import suppress + +from dlt.common.destination import DestinationCapabilitiesContext + +from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials +from dlt.destinations.impl.synapse import capabilities +from dlt.destinations.impl.synapse.configuration import SynapseCredentials + +from dlt.destinations.exceptions import DatabaseUndefinedRelation + + +class SynapseSqlClient(PyOdbcMsSqlClient): + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def drop_tables(self, *tables: str) -> None: + if not tables: + return + # Synapse does not support DROP TABLE IF EXISTS. + # Workaround: use DROP TABLE and suppress non-existence errors. + statements = [f"DROP TABLE {self.make_qualified_table_name(table)};" for table in tables] + with suppress(DatabaseUndefinedRelation): + self.execute_fragments(statements) + + def _drop_schema(self) -> None: + # Synapse does not support DROP SCHEMA IF EXISTS. 
+ self.execute_sql("DROP SCHEMA %s;" % self.fully_qualified_dataset_name()) diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py new file mode 100644 index 0000000000..18d1fa81d4 --- /dev/null +++ b/dlt/destinations/impl/synapse/synapse.py @@ -0,0 +1,99 @@ +from typing import ClassVar, Sequence, List, Dict, Any, Optional +from copy import deepcopy + +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.reference import SupportsStagingDestination, NewLoadJob + +from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint +from dlt.common.schema.typing import TTableSchemaColumns + +from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams +from dlt.destinations.sql_client import SqlClientBase +from dlt.destinations.insert_job_client import InsertValuesJobClient +from dlt.destinations.job_client_impl import SqlJobClientBase + +from dlt.destinations.impl.mssql.mssql import MsSqlTypeMapper, MsSqlClient, HINT_TO_MSSQL_ATTR + +from dlt.destinations.impl.synapse import capabilities +from dlt.destinations.impl.synapse.sql_client import SynapseSqlClient +from dlt.destinations.impl.synapse.configuration import SynapseClientConfiguration + + +HINT_TO_SYNAPSE_ATTR: Dict[TColumnHint, str] = { + "primary_key": "PRIMARY KEY NONCLUSTERED NOT ENFORCED", + "unique": "UNIQUE NOT ENFORCED", +} + + +class SynapseClient(MsSqlClient, SupportsStagingDestination): + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() + + def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: + sql_client = SynapseSqlClient(config.normalize_dataset_name(schema), config.credentials) + InsertValuesJobClient.__init__(self, schema, config, sql_client) + self.config: SynapseClientConfiguration = config + self.sql_client = sql_client + self.type_mapper = MsSqlTypeMapper(self.capabilities) + + self.active_hints = deepcopy(HINT_TO_SYNAPSE_ATTR) + if not self.config.create_indexes: + self.active_hints.pop("primary_key", None) + self.active_hints.pop("unique", None) + + def _get_table_update_sql( + self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool + ) -> List[str]: + _sql_result = SqlJobClientBase._get_table_update_sql( + self, table_name, new_columns, generate_alter + ) + if not generate_alter: + # Append WITH clause to create heap table instead of default + # columnstore table. Heap tables are a more robust choice, because + # columnstore tables do not support varchar(max), nvarchar(max), + # and varbinary(max). 
+ # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index + sql_result = [_sql_result[0] + "\n WITH ( HEAP );"] + else: + sql_result = _sql_result + return sql_result + + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: + if self.config.replace_strategy == "staging-optimized": + return [SynapseStagingCopyJob.from_table_chain(table_chain, self.sql_client)] + return super()._create_replace_followup_jobs(table_chain) + + +class SynapseStagingCopyJob(SqlStagingCopyJob): + @classmethod + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: + sql: List[str] = [] + for table in table_chain: + with sql_client.with_staging_dataset(staging=True): + staging_table_name = sql_client.make_qualified_table_name(table["name"]) + table_name = sql_client.make_qualified_table_name(table["name"]) + # drop destination table + sql.append(f"DROP TABLE {table_name};") + # moving staging table to destination schema + sql.append( + f"ALTER SCHEMA {sql_client.fully_qualified_dataset_name()} TRANSFER" + f" {staging_table_name};" + ) + # recreate staging table + # In some cases, when multiple instances of this CTAS query are + # executed concurrently, Synapse suspends the queries and hangs. + # This can be prevented by setting the env var LOAD__WORKERS = "1". + sql.append( + f"CREATE TABLE {staging_table_name}" + " WITH ( DISTRIBUTION = ROUND_ROBIN, HEAP )" # distribution must be explicitly specified with CTAS + f" AS SELECT * FROM {table_name}" + " WHERE 1 = 0;" # no data, table structure only + ) + + return sql diff --git a/dlt/destinations/insert_job_client.py b/dlt/destinations/insert_job_client.py index 678ba43bcc..776176078e 100644 --- a/dlt/destinations/insert_job_client.py +++ b/dlt/destinations/insert_job_client.py @@ -36,9 +36,10 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st # the procedure below will split the inserts into max_query_length // 2 packs with FileStorage.open_zipsafe_ro(file_path, "r", encoding="utf-8") as f: header = f.readline() - values_mark = f.readline() - # properly formatted file has a values marker at the beginning - assert values_mark == "VALUES\n" + if self._sql_client.capabilities.insert_values_writer_type == "default": + # properly formatted file has a values marker at the beginning + values_mark = f.readline() + assert values_mark == "VALUES\n" max_rows = self._sql_client.capabilities.max_rows_per_insert @@ -67,7 +68,9 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st # Chunk by max_rows - 1 for simplicity because one more row may be added for chunk in chunks(values_rows, max_rows - 1): processed += len(chunk) - insert_sql.extend([header.format(qualified_table_name), values_mark]) + insert_sql.append(header.format(qualified_table_name)) + if self._sql_client.capabilities.insert_values_writer_type == "default": + insert_sql.append(values_mark) if processed == len_rows: # On the last chunk we need to add the extra row read insert_sql.append("".join(chunk) + until_nl) @@ -76,7 +79,12 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st insert_sql.append("".join(chunk).strip()[:-1] + ";\n") else: # otherwise write all content in a single INSERT INTO - insert_sql.extend([header.format(qualified_table_name), values_mark, content]) + if 
self._sql_client.capabilities.insert_values_writer_type == "default": + insert_sql.extend( + [header.format(qualified_table_name), values_mark, content] + ) + elif self._sql_client.capabilities.insert_values_writer_type == "select_union": + insert_sql.extend([header.format(qualified_table_name), content]) if until_nl: insert_sql.append(until_nl) diff --git a/dlt/helpers/dbt/profiles.yml b/dlt/helpers/dbt/profiles.yml index 2414222cbd..7031f5de2c 100644 --- a/dlt/helpers/dbt/profiles.yml +++ b/dlt/helpers/dbt/profiles.yml @@ -141,4 +141,20 @@ athena: schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" database: "{{ env_var('DLT__AWS_DATA_CATALOG') }}" # aws_profile_name: "{{ env_var('DLT__CREDENTIALS__PROFILE_NAME', '') }}" - work_group: "{{ env_var('DLT__ATHENA_WORK_GROUP', '') }}" \ No newline at end of file + work_group: "{{ env_var('DLT__ATHENA_WORK_GROUP', '') }}" + + +# commented out because dbt for Synapse isn't currently properly supported. +# Leave config here for potential future use. +# synapse: +# target: analytics +# outputs: +# analytics: +# type: synapse +# driver: "{{ env_var('DLT__CREDENTIALS__DRIVER') }}" +# server: "{{ env_var('DLT__CREDENTIALS__HOST') }}" +# port: "{{ env_var('DLT__CREDENTIALS__PORT') | as_number }}" +# database: "{{ env_var('DLT__CREDENTIALS__DATABASE') }}" +# schema: "{{ var('destination_dataset_name', var('source_dataset_name')) }}" +# user: "{{ env_var('DLT__CREDENTIALS__USERNAME') }}" +# password: "{{ env_var('DLT__CREDENTIALS__PASSWORD') }}" \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index c5da40c604..4d079fc44d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8466,9 +8466,10 @@ qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] snowflake = ["snowflake-connector-python"] +synapse = ["pyodbc"] weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "cf751b2e1e9c66efde0a11774b5204e3206a14fd04ba4c79b2d37e38db5367ad" +content-hash = "26c595a857f17a5cbdb348f165c267d8910412325be4e522d0e91224c7fec588" diff --git a/pyproject.toml b/pyproject.toml index 6436ec23a7..d9d5858674 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,6 +96,7 @@ cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] weaviate = ["weaviate-client"] mssql = ["pyodbc"] +synapse = ["pyodbc"] qdrant = ["qdrant-client"] [tool.poetry.scripts] diff --git a/tests/load/mssql/test_mssql_credentials.py b/tests/load/mssql/test_mssql_credentials.py index 0098d228f1..0e38791f22 100644 --- a/tests/load/mssql/test_mssql_credentials.py +++ b/tests/load/mssql/test_mssql_credentials.py @@ -1,18 +1,35 @@ import pyodbc import pytest -from dlt.common.configuration import resolve_configuration +from dlt.common.configuration import resolve_configuration, ConfigFieldMissingException from dlt.common.exceptions import SystemConfigurationException -from dlt.destinations.impl.mssql.configuration import MsSqlCredentials, SUPPORTED_DRIVERS +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials -def test_parse_native_representation_unsupported_driver_specified() -> None: +def test_mssql_credentials_defaults() -> None: + creds = MsSqlCredentials() + assert creds.port == 1433 + assert creds.connect_timeout == 15 + assert MsSqlCredentials.__config_gen_annotations__ == ["port", "connect_timeout"] + # port should be optional + resolve_configuration(creds, explicit_value="mssql://loader:loader@localhost/dlt_data") 
+ assert creds.port == 1433 + + +def test_parse_native_representation() -> None: # Case: unsupported driver specified. with pytest.raises(SystemConfigurationException): resolve_configuration( MsSqlCredentials( - "mssql://test_user:test_password@sql.example.com:12345/test_db?DRIVER=foo" + "mssql://test_user:test_pwd@sql.example.com/test_db?DRIVER=ODBC+Driver+13+for+SQL+Server" + ) + ) + # Case: password not specified. + with pytest.raises(ConfigFieldMissingException): + resolve_configuration( + MsSqlCredentials( + "mssql://test_user@sql.example.com/test_db?DRIVER=ODBC+Driver+18+for+SQL+Server" ) ) @@ -21,33 +38,49 @@ def test_to_odbc_dsn_supported_driver_specified() -> None: # Case: supported driver specified — ODBC Driver 18 for SQL Server. creds = resolve_configuration( MsSqlCredentials( - "mssql://test_user:test_password@sql.example.com:12345/test_db?DRIVER=ODBC+Driver+18+for+SQL+Server" + "mssql://test_user:test_pwd@sql.example.com/test_db?DRIVER=ODBC+Driver+18+for+SQL+Server" ) ) dsn = creds.to_odbc_dsn() result = {k: v for k, v in (param.split("=") for param in dsn.split(";"))} assert result == { "DRIVER": "ODBC Driver 18 for SQL Server", - "SERVER": "sql.example.com,12345", + "SERVER": "sql.example.com,1433", "DATABASE": "test_db", "UID": "test_user", - "PWD": "test_password", + "PWD": "test_pwd", } # Case: supported driver specified — ODBC Driver 17 for SQL Server. creds = resolve_configuration( MsSqlCredentials( - "mssql://test_user:test_password@sql.example.com:12345/test_db?DRIVER=ODBC+Driver+17+for+SQL+Server" + "mssql://test_user:test_pwd@sql.example.com/test_db?DRIVER=ODBC+Driver+17+for+SQL+Server" ) ) dsn = creds.to_odbc_dsn() result = {k: v for k, v in (param.split("=") for param in dsn.split(";"))} assert result == { "DRIVER": "ODBC Driver 17 for SQL Server", + "SERVER": "sql.example.com,1433", + "DATABASE": "test_db", + "UID": "test_user", + "PWD": "test_pwd", + } + + # Case: port and supported driver specified. + creds = resolve_configuration( + MsSqlCredentials( + "mssql://test_user:test_pwd@sql.example.com:12345/test_db?DRIVER=ODBC+Driver+18+for+SQL+Server" + ) + ) + dsn = creds.to_odbc_dsn() + result = {k: v for k, v in (param.split("=") for param in dsn.split(";"))} + assert result == { + "DRIVER": "ODBC Driver 18 for SQL Server", "SERVER": "sql.example.com,12345", "DATABASE": "test_db", "UID": "test_user", - "PWD": "test_password", + "PWD": "test_pwd", } @@ -55,7 +88,7 @@ def test_to_odbc_dsn_arbitrary_keys_specified() -> None: # Case: arbitrary query keys (and supported driver) specified. creds = resolve_configuration( MsSqlCredentials( - "mssql://test_user:test_password@sql.example.com:12345/test_db?FOO=a&BAR=b&DRIVER=ODBC+Driver+18+for+SQL+Server" + "mssql://test_user:test_pwd@sql.example.com:12345/test_db?FOO=a&BAR=b&DRIVER=ODBC+Driver+18+for+SQL+Server" ) ) dsn = creds.to_odbc_dsn() @@ -65,7 +98,7 @@ def test_to_odbc_dsn_arbitrary_keys_specified() -> None: "SERVER": "sql.example.com,12345", "DATABASE": "test_db", "UID": "test_user", - "PWD": "test_password", + "PWD": "test_pwd", "FOO": "a", "BAR": "b", } @@ -73,7 +106,7 @@ def test_to_odbc_dsn_arbitrary_keys_specified() -> None: # Case: arbitrary capitalization. 
creds = resolve_configuration( MsSqlCredentials( - "mssql://test_user:test_password@sql.example.com:12345/test_db?FOO=a&bar=b&Driver=ODBC+Driver+18+for+SQL+Server" + "mssql://test_user:test_pwd@sql.example.com:12345/test_db?FOO=a&bar=b&Driver=ODBC+Driver+18+for+SQL+Server" ) ) dsn = creds.to_odbc_dsn() @@ -83,30 +116,30 @@ def test_to_odbc_dsn_arbitrary_keys_specified() -> None: "SERVER": "sql.example.com,12345", "DATABASE": "test_db", "UID": "test_user", - "PWD": "test_password", + "PWD": "test_pwd", "FOO": "a", "BAR": "b", } -available_drivers = [d for d in pyodbc.drivers() if d in SUPPORTED_DRIVERS] +available_drivers = [d for d in pyodbc.drivers() if d in MsSqlCredentials.SUPPORTED_DRIVERS] @pytest.mark.skipif(not available_drivers, reason="no supported driver available") def test_to_odbc_dsn_driver_not_specified() -> None: # Case: driver not specified, but supported driver is available. creds = resolve_configuration( - MsSqlCredentials("mssql://test_user:test_password@sql.example.com:12345/test_db") + MsSqlCredentials("mssql://test_user:test_pwd@sql.example.com/test_db") ) dsn = creds.to_odbc_dsn() result = {k: v for k, v in (param.split("=") for param in dsn.split(";"))} assert result in [ { "DRIVER": d, - "SERVER": "sql.example.com,12345", + "SERVER": "sql.example.com,1433", "DATABASE": "test_db", "UID": "test_user", - "PWD": "test_password", + "PWD": "test_pwd", } - for d in SUPPORTED_DRIVERS + for d in MsSqlCredentials.SUPPORTED_DRIVERS ] diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index f7e0ce53ff..039ce99113 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -1,11 +1,10 @@ import pytest -from copy import deepcopy import sqlfluff from dlt.common.utils import uniq_id from dlt.common.schema import Schema -pytest.importorskip("dlt.destinations.mssql.mssql", reason="MSSQL ODBC driver not installed") +pytest.importorskip("dlt.destinations.impl.mssql.mssql", reason="MSSQL ODBC driver not installed") from dlt.destinations.impl.mssql.mssql import MsSqlClient from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration, MsSqlCredentials diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index 11f59d5276..e919409311 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -37,6 +37,8 @@ def test_run_jaffle_package( pytest.skip( "dbt-athena requires database to be created and we don't do it in case of Jaffle" ) + if not destination_config.supports_dbt: + pytest.skip("dbt is not supported for this destination configuration") pipeline = destination_config.setup_pipeline("jaffle_jaffle", full_refresh=True) # get runner, pass the env from fixture dbt = dlt.dbt.package(pipeline, "https://github.com/dbt-labs/jaffle_shop.git", venv=dbt_venv) @@ -55,9 +57,10 @@ def test_run_jaffle_package( assert all(r.status == "pass" for r in tests) # get and display dataframe with customers - customers = select_data(pipeline, "SELECT * FROM customers") + qual_name = pipeline.sql_client().make_qualified_table_name + customers = select_data(pipeline, f"SELECT * FROM {qual_name('customers')}") assert len(customers) == 100 - orders = select_data(pipeline, "SELECT * FROM orders") + orders = select_data(pipeline, f"SELECT * FROM {qual_name('orders')}") assert len(orders) == 99 diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index 
c6db91efff..1dde56a6b1 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -264,6 +264,12 @@ def test_replace_table_clearing( # use staging tables for replace os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy + if destination_config.destination == "synapse" and replace_strategy == "staging-optimized": + # The "staging-optimized" replace strategy makes Synapse suspend the CTAS + # queries used to recreate the staging table, and hang, when the number + # of load workers is greater than 1. + os.environ["LOAD__WORKERS"] = "1" + pipeline = destination_config.setup_pipeline( "test_replace_table_clearing", dataset_name="test_replace_table_clearing", full_refresh=True ) diff --git a/tests/load/synapse/__init__.py b/tests/load/synapse/__init__.py new file mode 100644 index 0000000000..34119d38cb --- /dev/null +++ b/tests/load/synapse/__init__.py @@ -0,0 +1,3 @@ +from tests.utils import skip_if_not_active + +skip_if_not_active("synapse") diff --git a/tests/load/synapse/test_synapse_configuration.py b/tests/load/synapse/test_synapse_configuration.py new file mode 100644 index 0000000000..4055cbab38 --- /dev/null +++ b/tests/load/synapse/test_synapse_configuration.py @@ -0,0 +1,46 @@ +import pytest + +from dlt.common.configuration import resolve_configuration +from dlt.common.exceptions import SystemConfigurationException + +from dlt.destinations.impl.synapse.configuration import ( + SynapseClientConfiguration, + SynapseCredentials, +) + + +def test_synapse_configuration() -> None: + # By default, unique indexes should not be created. + assert SynapseClientConfiguration().create_indexes is False + + +def test_parse_native_representation() -> None: + # Case: unsupported driver specified. + with pytest.raises(SystemConfigurationException): + resolve_configuration( + SynapseCredentials( + "synapse://test_user:test_pwd@test.sql.azuresynapse.net/test_db?DRIVER=ODBC+Driver+17+for+SQL+Server" + ) + ) + + +def test_to_odbc_dsn_longasmax() -> None: + # Case: LONGASMAX not specified in query (this is the expected scenario). + creds = resolve_configuration( + SynapseCredentials( + "synapse://test_user:test_pwd@test.sql.azuresynapse.net/test_db?DRIVER=ODBC+Driver+18+for+SQL+Server" + ) + ) + dsn = creds.to_odbc_dsn() + result = {k: v for k, v in (param.split("=") for param in dsn.split(";"))} + assert result["LONGASMAX"] == "yes" + + # Case: LONGASMAX specified in query; specified value should be overridden. 
+ creds = resolve_configuration( + SynapseCredentials( + "synapse://test_user:test_pwd@test.sql.azuresynapse.net/test_db?DRIVER=ODBC+Driver+18+for+SQL+Server&LONGASMAX=no" + ) + ) + dsn = creds.to_odbc_dsn() + result = {k: v for k, v in (param.split("=") for param in dsn.split(";"))} + assert result["LONGASMAX"] == "yes" diff --git a/tests/load/synapse/test_synapse_table_builder.py b/tests/load/synapse/test_synapse_table_builder.py new file mode 100644 index 0000000000..f58a7d5883 --- /dev/null +++ b/tests/load/synapse/test_synapse_table_builder.py @@ -0,0 +1,130 @@ +import os +import pytest +import sqlfluff +from copy import deepcopy +from sqlfluff.api.simple import APIParsingError + +from dlt.common.utils import uniq_id +from dlt.common.schema import Schema, TColumnHint + +from dlt.destinations.impl.synapse.synapse import SynapseClient +from dlt.destinations.impl.synapse.configuration import ( + SynapseClientConfiguration, + SynapseCredentials, +) + +from tests.load.utils import TABLE_UPDATE +from dlt.destinations.impl.synapse.synapse import HINT_TO_SYNAPSE_ATTR + + +@pytest.fixture +def schema() -> Schema: + return Schema("event") + + +@pytest.fixture +def client(schema: Schema) -> SynapseClient: + # return client without opening connection + client = SynapseClient( + schema, + SynapseClientConfiguration( + dataset_name="test_" + uniq_id(), credentials=SynapseCredentials() + ), + ) + assert client.config.create_indexes is False + return client + + +@pytest.fixture +def client_with_indexes_enabled(schema: Schema) -> SynapseClient: + # return client without opening connection + client = SynapseClient( + schema, + SynapseClientConfiguration( + dataset_name="test_" + uniq_id(), credentials=SynapseCredentials(), create_indexes=True + ), + ) + assert client.config.create_indexes is True + return client + + +def test_create_table(client: SynapseClient) -> None: + # non existing table + sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="tsql") + assert "event_test_table" in sql + assert '"col1" bigint NOT NULL' in sql + assert '"col2" float NOT NULL' in sql + assert '"col3" bit NOT NULL' in sql + assert '"col4" datetimeoffset NOT NULL' in sql + assert '"col5" nvarchar(max) NOT NULL' in sql + assert '"col6" decimal(38,9) NOT NULL' in sql + assert '"col7" varbinary(max) NOT NULL' in sql + assert '"col8" decimal(38,0)' in sql + assert '"col9" nvarchar(max) NOT NULL' in sql + assert '"col10" date NOT NULL' in sql + assert '"col11" time NOT NULL' in sql + assert '"col1_precision" smallint NOT NULL' in sql + assert '"col4_precision" datetimeoffset(3) NOT NULL' in sql + assert '"col5_precision" nvarchar(25)' in sql + assert '"col6_precision" decimal(6,2) NOT NULL' in sql + assert '"col7_precision" varbinary(19)' in sql + assert '"col11_precision" time(3) NOT NULL' in sql + assert "WITH ( HEAP )" in sql + + +def test_alter_table(client: SynapseClient) -> None: + # existing table has no columns + sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)[0] + sqlfluff.parse(sql, dialect="tsql") + canonical_name = client.sql_client.make_qualified_table_name("event_test_table") + assert sql.count(f"ALTER TABLE {canonical_name}\nADD") == 1 + assert "event_test_table" in sql + assert '"col1" bigint NOT NULL' in sql + assert '"col2" float NOT NULL' in sql + assert '"col3" bit NOT NULL' in sql + assert '"col4" datetimeoffset NOT NULL' in sql + assert '"col5" nvarchar(max) NOT NULL' in sql + assert '"col6" decimal(38,9) NOT NULL' in sql 
+ assert '"col7" varbinary(max) NOT NULL' in sql + assert '"col8" decimal(38,0)' in sql + assert '"col9" nvarchar(max) NOT NULL' in sql + assert '"col10" date NOT NULL' in sql + assert '"col11" time NOT NULL' in sql + assert '"col1_precision" smallint NOT NULL' in sql + assert '"col4_precision" datetimeoffset(3) NOT NULL' in sql + assert '"col5_precision" nvarchar(25)' in sql + assert '"col6_precision" decimal(6,2) NOT NULL' in sql + assert '"col7_precision" varbinary(19)' in sql + assert '"col11_precision" time(3) NOT NULL' in sql + assert "WITH ( HEAP )" not in sql + + +@pytest.mark.parametrize("hint", ["primary_key", "unique"]) +def test_create_table_with_column_hint( + client: SynapseClient, client_with_indexes_enabled: SynapseClient, hint: TColumnHint +) -> None: + attr = HINT_TO_SYNAPSE_ATTR[hint] + + # Case: table without hint. + sql = client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)[0] + sqlfluff.parse(sql, dialect="tsql") + assert f" {attr} " not in sql + + # Case: table with hint, but client does not have indexes enabled. + mod_update = deepcopy(TABLE_UPDATE) + mod_update[0][hint] = True # type: ignore[typeddict-unknown-key] + sql = client._get_table_update_sql("event_test_table", mod_update, False)[0] + sqlfluff.parse(sql, dialect="tsql") + assert f" {attr} " not in sql + + # Case: table with hint, client has indexes enabled. + sql = client_with_indexes_enabled._get_table_update_sql("event_test_table", mod_update, False)[ + 0 + ] + # We expect an error because "PRIMARY KEY NONCLUSTERED NOT ENFORCED" and + # "UNIQUE NOT ENFORCED" are invalid in the generic "tsql" dialect. + # They are however valid in the Synapse variant of the dialect. + with pytest.raises(APIParsingError): + sqlfluff.parse(sql, dialect="tsql") + assert f'"col1" bigint {attr} NOT NULL' in sql diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 153504bf4a..b8d2e31e3f 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -387,7 +387,8 @@ def test_get_storage_table_with_all_types(client: SqlJobClientBase) -> None: "time", ): continue - if client.config.destination_type == "mssql" and c["data_type"] in ("complex"): + # mssql and synapse have no native data type for the complex type. 
+ if client.config.destination_type in ("mssql", "synapse") and c["data_type"] in ("complex"): continue assert c["data_type"] == expected_c["data_type"] diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 96f0db09bb..4bdf08e23c 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ -38,7 +38,7 @@ def client(request) -> Iterator[SqlJobClientBase]: @pytest.mark.parametrize( "client", - destinations_configs(default_sql_configs=True, exclude=["mssql"]), + destinations_configs(default_sql_configs=True, exclude=["mssql", "synapse"]), indirect=True, ids=lambda x: x.name, ) @@ -263,9 +263,15 @@ def test_execute_df(client: SqlJobClientBase) -> None: client.update_stored_schema() table_name = prepare_temp_table(client) f_q_table_name = client.sql_client.make_qualified_table_name(table_name) - insert_query = ",".join([f"({idx})" for idx in range(0, total_records)]) - client.sql_client.execute_sql(f"INSERT INTO {f_q_table_name} VALUES {insert_query};") + if client.capabilities.insert_values_writer_type == "default": + insert_query = ",".join([f"({idx})" for idx in range(0, total_records)]) + sql_stmt = f"INSERT INTO {f_q_table_name} VALUES {insert_query};" + elif client.capabilities.insert_values_writer_type == "select_union": + insert_query = " UNION ALL ".join([f"SELECT {idx}" for idx in range(0, total_records)]) + sql_stmt = f"INSERT INTO {f_q_table_name} {insert_query};" + + client.sql_client.execute_sql(sql_stmt) with client.sql_client.execute_query( f"SELECT * FROM {f_q_table_name} ORDER BY col ASC" ) as curr: diff --git a/tests/load/utils.py b/tests/load/utils.py index 6811ca59a6..55445e0b95 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -163,7 +163,7 @@ def destinations_configs( destination_configs += [ DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS - if destination != "athena" + if destination not in ("athena", "synapse") ] destination_configs += [ DestinationTestConfiguration(destination="duckdb", file_format="parquet") @@ -190,6 +190,10 @@ def destinations_configs( extra_info="iceberg", ) ] + # dbt for Synapse has some complications and I couldn't get it to pass all tests. 
+ destination_configs += [ + DestinationTestConfiguration(destination="synapse", supports_dbt=False) + ] if default_vector_configs: # for now only weaviate @@ -465,7 +469,6 @@ def yield_client_with_storage( ) as client: client.initialize_storage() yield client - # print(dataset_name) client.sql_client.drop_dataset() if isinstance(client, WithStagingDataset): with client.with_staging_dataset(): diff --git a/tests/utils.py b/tests/utils.py index cf172f9733..211f87874d 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -45,6 +45,7 @@ "motherduck", "mssql", "qdrant", + "synapse", } NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck", "qdrant"} SQL_DESTINATIONS = IMPLEMENTED_DESTINATIONS - NON_SQL_DESTINATIONS From 05b05305ee46108261789ed25442aec518b1cca6 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 18 Jan 2024 16:44:34 +0100 Subject: [PATCH 02/23] make var type consistent --- dlt/common/data_writers/escape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py index b56a0d8f19..20932fec6c 100644 --- a/dlt/common/data_writers/escape.py +++ b/dlt/common/data_writers/escape.py @@ -101,7 +101,7 @@ def escape_mssql_literal(v: Any) -> Any: # 8000 is the max value for n in VARBINARY(n) # https://learn.microsoft.com/en-us/sql/t-sql/data-types/binary-and-varbinary-transact-sql if len(v) <= 8000: - n = len(v) + n = str(len(v)) else: n = "MAX" return f"CONVERT(VARBINARY({n}), '{v.hex()}', 2)" From dc7619ad6f778b55cefaa09a3d3ef194ae5bc07a Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 18 Jan 2024 17:12:32 +0100 Subject: [PATCH 03/23] simplify client init logic --- dlt/destinations/impl/synapse/synapse.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index 18d1fa81d4..0ad959f7ab 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -12,7 +12,7 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient from dlt.destinations.job_client_impl import SqlJobClientBase -from dlt.destinations.impl.mssql.mssql import MsSqlTypeMapper, MsSqlClient, HINT_TO_MSSQL_ATTR +from dlt.destinations.impl.mssql.mssql import MsSqlClient from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.sql_client import SynapseSqlClient @@ -29,11 +29,11 @@ class SynapseClient(MsSqlClient, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: - sql_client = SynapseSqlClient(config.normalize_dataset_name(schema), config.credentials) - InsertValuesJobClient.__init__(self, schema, config, sql_client) + super().__init__(schema, config) self.config: SynapseClientConfiguration = config - self.sql_client = sql_client - self.type_mapper = MsSqlTypeMapper(self.capabilities) + self.sql_client = SynapseSqlClient( + config.normalize_dataset_name(schema), config.credentials + ) self.active_hints = deepcopy(HINT_TO_SYNAPSE_ATTR) if not self.config.create_indexes: From 702dd28032fd6a1e36214d34131373afbbed03ba Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 21 Jan 2024 01:34:48 +0100 Subject: [PATCH 04/23] add support for table index type configuration --- dlt/common/data_writers/escape.py | 6 +- dlt/common/destination/reference.py | 4 +- dlt/common/schema/schema.py | 8 + 
dlt/common/schema/typing.py | 3 + dlt/common/schema/utils.py | 12 ++ dlt/destinations/impl/mssql/mssql.py | 2 + .../impl/synapse/configuration.py | 9 +- dlt/destinations/impl/synapse/factory.py | 7 + dlt/destinations/impl/synapse/synapse.py | 96 ++++++++++-- dlt/extract/decorators.py | 7 + dlt/extract/hints.py | 3 + tests/load/pipeline/test_table_indexing.py | 140 ++++++++++++++++++ .../synapse/test_synapse_table_builder.py | 13 +- 13 files changed, 292 insertions(+), 18 deletions(-) create mode 100644 tests/load/pipeline/test_table_indexing.py diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py index 20932fec6c..1de584de2e 100644 --- a/dlt/common/data_writers/escape.py +++ b/dlt/common/data_writers/escape.py @@ -98,9 +98,9 @@ def escape_mssql_literal(v: Any) -> Any: json.dumps(v), prefix="N'", escape_dict=MS_SQL_ESCAPE_DICT, escape_re=MS_SQL_ESCAPE_RE ) if isinstance(v, bytes): - # 8000 is the max value for n in VARBINARY(n) - # https://learn.microsoft.com/en-us/sql/t-sql/data-types/binary-and-varbinary-transact-sql - if len(v) <= 8000: + from dlt.destinations.impl.mssql.mssql import VARBINARY_MAX_N + + if len(v) <= VARBINARY_MAX_N: n = str(len(v)) else: n = "MAX" diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 1c28dffa8c..59f13b30b9 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -34,7 +34,7 @@ from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.typing import TWriteDisposition from dlt.common.schema.exceptions import InvalidDatasetName -from dlt.common.schema.utils import get_write_disposition, get_table_format +from dlt.common.schema.utils import get_write_disposition, get_table_format, get_table_index_type from dlt.common.configuration import configspec, with_config, resolve_configuration, known_sections from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config @@ -372,6 +372,8 @@ def get_load_table(self, table_name: str, prepare_for_staging: bool = False) -> table["write_disposition"] = get_write_disposition(self.schema.tables, table_name) if "table_format" not in table: table["table_format"] = get_table_format(self.schema.tables, table_name) + if "table_index_type" not in table: + table["table_index_type"] = get_table_index_type(self.schema.tables, table_name) return table except KeyError: raise UnknownTableException(table_name) diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index e95699b91e..ccfc038085 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -546,12 +546,20 @@ def data_tables(self, include_incomplete: bool = False) -> List[TTableSchema]: ) ] + def data_table_names(self) -> List[str]: + """Returns list of table table names. 
Excludes dlt table names.""" + return [t["name"] for t in self.data_tables()] + def dlt_tables(self) -> List[TTableSchema]: """Gets dlt tables""" return [ t for t in self._schema_tables.values() if t["name"].startswith(self._dlt_tables_prefix) ] + def dlt_table_names(self) -> List[str]: + """Returns list of dlt table names.""" + return [t["name"] for t in self.dlt_tables()] + def get_preferred_type(self, col_name: str) -> Optional[TDataType]: return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 9a27cbe4bb..351d666553 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -62,6 +62,8 @@ """Known hints of a column used to declare hint regexes.""" TWriteDisposition = Literal["skip", "append", "replace", "merge"] TTableFormat = Literal["iceberg"] +TTableIndexType = Literal["heap", "clustered_columnstore_index"] +"Table index type. Currently only used for Synapse destination." TTypeDetections = Literal[ "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double" ] @@ -165,6 +167,7 @@ class TTableSchema(TypedDict, total=False): columns: TTableSchemaColumns resource: Optional[str] table_format: Optional[TTableFormat] + table_index_type: Optional[TTableIndexType] class TPartialTableSchema(TTableSchema): diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index dc243f50dd..5ea244148e 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -32,6 +32,7 @@ TColumnSchema, TColumnProp, TTableFormat, + TTableIndexType, TColumnHint, TTypeDetectionFunc, TTypeDetections, @@ -618,6 +619,14 @@ def get_table_format(tables: TSchemaTables, table_name: str) -> TTableFormat: ) +def get_table_index_type(tables: TSchemaTables, table_name: str) -> TTableIndexType: + """Returns table index type of a table if present. 
If not, looks up into parent table.""" + return cast( + TTableIndexType, + get_inherited_table_hint(tables, table_name, "table_index_type", allow_none=True), + ) + + def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool: """Checks if `table` schema contains column with type _typ""" return any(c.get("data_type") == _typ for c in table["columns"].values()) @@ -724,6 +733,7 @@ def new_table( resource: str = None, schema_contract: TSchemaContract = None, table_format: TTableFormat = None, + table_index_type: TTableIndexType = None, ) -> TTableSchema: table: TTableSchema = { "name": table_name, @@ -742,6 +752,8 @@ def new_table( table["schema_contract"] = schema_contract if table_format: table["table_format"] = table_format + if table_index_type is not None: + table["table_index_type"] = table_index_type if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index e97389f185..b6af345e36 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -20,6 +20,8 @@ HINT_TO_MSSQL_ATTR: Dict[TColumnHint, str] = {"unique": "UNIQUE"} +VARCHAR_MAX_N: int = 4000 +VARBINARY_MAX_N: int = 8000 class MsSqlTypeMapper(TypeMapper): diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py index 0596cc2c46..966997b5a2 100644 --- a/dlt/destinations/impl/synapse/configuration.py +++ b/dlt/destinations/impl/synapse/configuration.py @@ -1,6 +1,7 @@ from typing import Final, Any, List, Dict, Optional, ClassVar from dlt.common.configuration import configspec +from dlt.common.schema.typing import TTableIndexType from dlt.destinations.impl.mssql.configuration import ( MsSqlCredentials, @@ -30,9 +31,15 @@ class SynapseClientConfiguration(MsSqlClientConfiguration): destination_type: Final[str] = "synapse" # type: ignore credentials: SynapseCredentials + # While Synapse uses CLUSTERED COLUMNSTORE INDEX tables by default, we use + # HEAP tables (no indexing) by default. HEAP is a more robust choice, because + # columnstore tables do not support varchar(max), nvarchar(max), and varbinary(max). + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index + default_table_index_type: Optional[TTableIndexType] = "heap" + # Determines if `primary_key` and `unique` column hints are applied. # Set to False by default because the PRIMARY KEY and UNIQUE constraints # are tricky in Synapse: they are NOT ENFORCED and can lead to innacurate # results if the user does not ensure all column values are unique. 
# https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints - create_indexes: bool = False + create_indexes: Optional[bool] = False diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index fa7facc0ca..6bdf2946b6 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -1,6 +1,7 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.schema.typing import TTableIndexType from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.configuration import ( @@ -27,6 +28,7 @@ def client_class(self) -> t.Type["SynapseClient"]: def __init__( self, credentials: t.Union[SynapseCredentials, t.Dict[str, t.Any], str] = None, + default_table_index_type: t.Optional[TTableIndexType] = "heap", create_indexes: bool = False, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, @@ -39,11 +41,16 @@ def __init__( Args: credentials: Credentials to connect to the Synapse dedicated pool. Can be an instance of `SynapseCredentials` or a connection string in the format `synapse://user:password@host:port/database` + default_table_index_type: Table index type that is used if no + table index type is specified on the resource. This setting only + applies to data tables, dlt system tables are not affected + (they always have "heap" as table index type). create_indexes: Should unique indexes be created, defaults to False **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, + default_table_index_type=default_table_index_type, create_indexes=create_indexes, destination_name=destination_name, environment=environment, diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index 0ad959f7ab..e01e851d83 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -1,18 +1,24 @@ -from typing import ClassVar, Sequence, List, Dict, Any, Optional +from typing import ClassVar, Sequence, List, Dict, Any, Optional, cast from copy import deepcopy +from textwrap import dedent from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import SupportsStagingDestination, NewLoadJob from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint -from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.schema.typing import TTableSchemaColumns, TTableIndexType from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.insert_job_client import InsertValuesJobClient from dlt.destinations.job_client_impl import SqlJobClientBase -from dlt.destinations.impl.mssql.mssql import MsSqlClient +from dlt.destinations.impl.mssql.mssql import ( + MsSqlTypeMapper, + MsSqlClient, + VARCHAR_MAX_N, + VARBINARY_MAX_N, +) from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.sql_client import SynapseSqlClient @@ -23,9 +29,13 @@ "primary_key": "PRIMARY KEY NONCLUSTERED NOT ENFORCED", "unique": "UNIQUE NOT ENFORCED", } +TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR: Dict[TTableIndexType, str] = { + "heap": "HEAP", + "clustered_columnstore_index": "CLUSTERED COLUMNSTORE INDEX", +} -class SynapseClient(MsSqlClient, SupportsStagingDestination): +class SynapseClient(MsSqlClient): 
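For context on how the factory arguments above surface to users, here is a minimal sketch of creating a pipeline with the Synapse factory. The pipeline name and connection string are placeholders, and it assumes the factory is exported from `dlt.destinations` as the change to `dlt/destinations/__init__.py` suggests:

```python
# Sketch only; credentials and names are made up.
import dlt
from dlt.destinations import synapse

pipeline = dlt.pipeline(
    pipeline_name="synapse_demo",  # hypothetical name
    destination=synapse(
        credentials="synapse://user:password@host:port/database",  # placeholder
        default_table_index_type="heap",  # dlt system tables are always "heap"
        create_indexes=False,             # PRIMARY KEY / UNIQUE hints are not applied
    ),
)
```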
capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: @@ -43,20 +53,54 @@ def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: + table = self.get_load_table(table_name) + if table is None: + table_index_type = self.config.default_table_index_type + else: + table_index_type = table.get("table_index_type") + if table_index_type == "clustered_columnstore_index": + new_columns = self._get_columstore_valid_columns(new_columns) + _sql_result = SqlJobClientBase._get_table_update_sql( self, table_name, new_columns, generate_alter ) if not generate_alter: - # Append WITH clause to create heap table instead of default - # columnstore table. Heap tables are a more robust choice, because - # columnstore tables do not support varchar(max), nvarchar(max), - # and varbinary(max). - # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index - sql_result = [_sql_result[0] + "\n WITH ( HEAP );"] + table_index_type_attr = TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR[table_index_type] + sql_result = [_sql_result[0] + f"\n WITH ( {table_index_type_attr} );"] else: sql_result = _sql_result return sql_result + def _get_columstore_valid_columns( + self, columns: Sequence[TColumnSchema] + ) -> Sequence[TColumnSchema]: + return [self._get_columstore_valid_column(c) for c in columns] + + def _get_columstore_valid_column(self, c: TColumnSchema) -> TColumnSchema: + """ + Returns TColumnSchema that maps to a Synapse data type that can participate in a columnstore index. + + varchar(max), nvarchar(max), and varbinary(max) are replaced with + varchar(n), nvarchar(n), and varbinary(n), respectively, where + n equals the user-specified precision, or the maximum allowed + value if the user did not specify a precision. + """ + varchar_source_types = [ + sct + for sct, dbt in MsSqlTypeMapper.sct_to_unbound_dbt.items() + if dbt in ("varchar(max)", "nvarchar(max)") + ] + varbinary_source_types = [ + sct + for sct, dbt in MsSqlTypeMapper.sct_to_unbound_dbt.items() + if dbt == "varbinary(max)" + ] + if c["data_type"] in varchar_source_types and "precision" not in c: + return {**c, **{"precision": VARCHAR_MAX_N}} + elif c["data_type"] in varbinary_source_types and "precision" not in c: + return {**c, **{"precision": VARBINARY_MAX_N}} + return c + def _create_replace_followup_jobs( self, table_chain: Sequence[TTableSchema] ) -> List[NewLoadJob]: @@ -64,6 +108,38 @@ def _create_replace_followup_jobs( return [SynapseStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) + def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: + table = super().get_load_table(table_name, staging) + if table is None: + return None + if table_name in self.schema.dlt_table_names(): + # dlt tables should always be heap tables, regardless of the user + # configuration. Why? "For small lookup tables, less than 60 million rows, + # consider using HEAP or clustered index for faster query performance." 
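To make the effect of `_get_table_update_sql` above concrete, here is an illustrative snippet showing how the index-type attribute is appended to the generated CREATE TABLE statement. The table name and columns are made up and the base DDL is simplified:

```python
# Simplified illustration of the WITH clause appended for non-ALTER statements.
TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR = {
    "heap": "HEAP",
    "clustered_columnstore_index": "CLUSTERED COLUMNSTORE INDEX",
}

base_ddl = 'CREATE TABLE "my_dataset"."items" ("id" bigint, "value" nvarchar(4000))'
table_index_type = "clustered_columnstore_index"
print(base_ddl + f"\n WITH ( {TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR[table_index_type]} );")
# CREATE TABLE "my_dataset"."items" ("id" bigint, "value" nvarchar(4000))
#  WITH ( CLUSTERED COLUMNSTORE INDEX );
```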
+ # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables + table["table_index_type"] = "heap" + if table["table_index_type"] is None: + table["table_index_type"] = self.config.default_table_index_type + return table + + def get_storage_table_index_type(self, table_name: str) -> TTableIndexType: + """Returns table index type of table in storage destination.""" + with self.sql_client as sql_client: + schema_name = sql_client.fully_qualified_dataset_name(escape=False) + sql = dedent(f""" + SELECT + CASE i.type_desc + WHEN 'HEAP' THEN 'heap' + WHEN 'CLUSTERED COLUMNSTORE' THEN 'clustered_columnstore_index' + END AS table_index_type + FROM sys.indexes i + INNER JOIN sys.tables t ON t.object_id = i.object_id + INNER JOIN sys.schemas s ON s.schema_id = t.schema_id + WHERE s.name = '{schema_name}' AND t.name = '{table_name}' + """) + table_index_type = sql_client.execute_sql(sql)[0][0] + return cast(TTableIndexType, table_index_type) + class SynapseStagingCopyJob(SqlStagingCopyJob): @classmethod diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index cf7426e683..573d3d3ad0 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -36,6 +36,7 @@ TAnySchemaColumns, TSchemaContract, TTableFormat, + TTableIndexType, ) from dlt.extract.utils import ( ensure_table_schema_columns_hint, @@ -256,6 +257,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, ) -> DltResource: ... @@ -273,6 +275,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, ) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: ... @@ -290,6 +293,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True, @@ -308,6 +312,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, ) -> DltResource: ... 
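With the `table_index_type` argument added to `dlt.resource` above, the hint can be declared directly on a resource. A minimal sketch with made-up data follows; note that a later commit in this series replaces this argument with a Synapse-specific destination adapter:

```python
# Sketch of the resource-level hint as introduced in this commit.
import dlt

@dlt.resource(write_disposition="append", table_index_type="clustered_columnstore_index")
def items():
    yield {"id": 1, "value": "foo"}
```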
@@ -324,6 +329,7 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: bool = False, @@ -403,6 +409,7 @@ def make_resource( merge_key=merge_key, schema_contract=schema_contract, table_format=table_format, + table_index_type=table_index_type, ) return DltResource.from_data( _data, diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 437dbbc6bd..36354eb0da 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -12,6 +12,7 @@ TWriteDisposition, TAnySchemaColumns, TTableFormat, + TTableIndexType, TSchemaContract, ) from dlt.common.typing import TDataItem @@ -274,6 +275,7 @@ def new_table_template( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, + table_index_type: TTableHintTemplate[TTableIndexType] = None, ) -> TResourceHints: validator, schema_contract = create_item_validator(columns, schema_contract) clean_columns = columns @@ -289,6 +291,7 @@ def new_table_template( columns=clean_columns, # type: ignore schema_contract=schema_contract, # type: ignore table_format=table_format, # type: ignore + table_index_type=table_index_type, # type: ignore ) if not table_name: new_template.pop("name") diff --git a/tests/load/pipeline/test_table_indexing.py b/tests/load/pipeline/test_table_indexing.py new file mode 100644 index 0000000000..5f62cddfee --- /dev/null +++ b/tests/load/pipeline/test_table_indexing.py @@ -0,0 +1,140 @@ +import os +import pytest +from typing import Iterator, List, Any, Union +from textwrap import dedent + +import dlt +from dlt.common.schema import TColumnSchema +from dlt.common.schema.typing import TTableIndexType, TSchemaTables +from dlt.common.schema.utils import get_table_index_type + +from dlt.destinations.sql_client import SqlClientBase + +from tests.load.utils import TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES +from tests.load.pipeline.utils import ( + destinations_configs, + DestinationTestConfiguration, +) + + +TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID = [ + ("heap", None), + # For "clustered_columnstore_index" tables, different code paths exist + # when no column schema is specified versus when a column schema is + # specified, so we test both. + ("clustered_columnstore_index", None), + ("clustered_columnstore_index", TABLE_UPDATE), +] + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["synapse"]), + ids=lambda x: x.name, +) +@pytest.mark.parametrize( + "table_index_type,column_schema", TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID +) +def test_default_table_index_type_configuration( + destination_config: DestinationTestConfiguration, + table_index_type: TTableIndexType, + column_schema: Union[List[TColumnSchema], None], +) -> None: + # Configure default_table_index_type. 
+ os.environ["DESTINATION__SYNAPSE__DEFAULT_TABLE_INDEX_TYPE"] = table_index_type + + @dlt.resource( + name="items_without_table_index_type_specified", + write_disposition="append", + columns=column_schema, + ) + def items_without_table_index_type_specified() -> Iterator[Any]: + yield TABLE_ROW_ALL_DATA_TYPES + + pipeline = destination_config.setup_pipeline( + f"test_default_table_index_type_{table_index_type}", + full_refresh=True, + ) + job_client = pipeline.destination_client() + # Assert configuration value gets properly propagated to job client configuration. + assert job_client.config.default_table_index_type == table_index_type # type: ignore[attr-defined] + + # Run the pipeline and create the tables. + pipeline.run(items_without_table_index_type_specified) + + # For all tables, assert the applied index type equals the expected index type. + # Child tables, if any, inherit the index type of their parent. + tables = pipeline.default_schema.tables + for table_name in tables: + applied_table_index_type = job_client.get_storage_table_index_type(table_name) # type: ignore[attr-defined] + if table_name in pipeline.default_schema.data_table_names(): + # For data tables, the applied table index type should be the default value. + assert applied_table_index_type == job_client.config.default_table_index_type # type: ignore[attr-defined] + elif table_name in pipeline.default_schema.dlt_table_names(): + # For dlt tables, the applied table index type should always be "heap". + assert applied_table_index_type == "heap" + + # Test overriding the default_table_index_type from a resource configuration. + if job_client.config.default_table_index_type == "heap": # type: ignore[attr-defined] + + @dlt.resource( + name="items_with_table_index_type_specified", + write_disposition="append", + table_index_type="clustered_columnstore_index", + columns=column_schema, + ) + def items_with_table_index_type_specified() -> Iterator[Any]: + yield TABLE_ROW_ALL_DATA_TYPES + + pipeline.run(items_with_table_index_type_specified) + applied_table_index_type = job_client.get_storage_table_index_type( # type: ignore[attr-defined] + "items_with_table_index_type_specified" + ) + # While the default is "heap", the applied index type should be "clustered_columnstore_index" + # because it was provided as argument to the resource. + assert applied_table_index_type == "clustered_columnstore_index" + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["synapse"]), + ids=lambda x: x.name, +) +@pytest.mark.parametrize( + "table_index_type,column_schema", TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID +) +def test_resource_table_index_type_configuration( + destination_config: DestinationTestConfiguration, + table_index_type: TTableIndexType, + column_schema: Union[List[TColumnSchema], None], +) -> None: + @dlt.resource( + name="items_with_table_index_type_specified", + write_disposition="append", + table_index_type=table_index_type, + columns=column_schema, + ) + def items_with_table_index_type_specified() -> Iterator[Any]: + yield TABLE_ROW_ALL_DATA_TYPES + + pipeline = destination_config.setup_pipeline( + f"test_table_index_type_{table_index_type}", + full_refresh=True, + ) + + # Run the pipeline and create the tables. + pipeline.run(items_with_table_index_type_specified) + + # For all tables, assert the applied index type equals the expected index type. + # Child tables, if any, inherit the index type of their parent. 
+ job_client = pipeline.destination_client() + tables = pipeline.default_schema.tables + for table_name in tables: + applied_table_index_type = job_client.get_storage_table_index_type(table_name) # type: ignore[attr-defined] + if table_name in pipeline.default_schema.data_table_names(): + # For data tables, the applied table index type should be the type + # configured in the resource. + assert applied_table_index_type == table_index_type + elif table_name in pipeline.default_schema.dlt_table_names(): + # For dlt tables, the applied table index type should always be "heap". + assert applied_table_index_type == "heap" diff --git a/tests/load/synapse/test_synapse_table_builder.py b/tests/load/synapse/test_synapse_table_builder.py index f58a7d5883..4719a8d003 100644 --- a/tests/load/synapse/test_synapse_table_builder.py +++ b/tests/load/synapse/test_synapse_table_builder.py @@ -14,7 +14,10 @@ ) from tests.load.utils import TABLE_UPDATE -from dlt.destinations.impl.synapse.synapse import HINT_TO_SYNAPSE_ATTR +from dlt.destinations.impl.synapse.synapse import ( + HINT_TO_SYNAPSE_ATTR, + TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR, +) @pytest.fixture @@ -70,7 +73,9 @@ def test_create_table(client: SynapseClient) -> None: assert '"col6_precision" decimal(6,2) NOT NULL' in sql assert '"col7_precision" varbinary(19)' in sql assert '"col11_precision" time(3) NOT NULL' in sql - assert "WITH ( HEAP )" in sql + table_index_type = client.config.default_table_index_type + table_index_type_attr = TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR[table_index_type] + assert f"WITH ( {table_index_type_attr} )" in sql def test_alter_table(client: SynapseClient) -> None: @@ -97,7 +102,9 @@ def test_alter_table(client: SynapseClient) -> None: assert '"col6_precision" decimal(6,2) NOT NULL' in sql assert '"col7_precision" varbinary(19)' in sql assert '"col11_precision" time(3) NOT NULL' in sql - assert "WITH ( HEAP )" not in sql + table_index_type = client.config.default_table_index_type + table_index_type_attr = TABLE_INDEX_TYPE_TO_SYNAPSE_ATTR[table_index_type] + assert f"WITH ( {table_index_type_attr} )" not in sql @pytest.mark.parametrize("hint", ["primary_key", "unique"]) From db73162fef46c98c73ea00daba686d53211c6f81 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 23 Jan 2024 14:59:10 +0100 Subject: [PATCH 05/23] add load concurrency handling and warning --- .../impl/synapse/configuration.py | 61 ++++++++++++++++++- dlt/destinations/impl/synapse/factory.py | 10 +-- dlt/pipeline/pipeline.py | 9 ++- .../load/pipeline/test_replace_disposition.py | 10 ++- 4 files changed, 76 insertions(+), 14 deletions(-) diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py index 966997b5a2..b5eec82e9e 100644 --- a/dlt/destinations/impl/synapse/configuration.py +++ b/dlt/destinations/impl/synapse/configuration.py @@ -1,7 +1,8 @@ from typing import Final, Any, List, Dict, Optional, ClassVar from dlt.common.configuration import configspec -from dlt.common.schema.typing import TTableIndexType +from dlt.common.schema.typing import TTableIndexType, TWriteDisposition +from dlt.common import logger from dlt.destinations.impl.mssql.configuration import ( MsSqlCredentials, @@ -36,10 +37,66 @@ class SynapseClientConfiguration(MsSqlClientConfiguration): # columnstore tables do not support varchar(max), nvarchar(max), and varbinary(max). 
# https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index default_table_index_type: Optional[TTableIndexType] = "heap" + """ + Table index type that is used if no table index type is specified on the resource. + This only affects data tables, dlt system tables ignore this setting and + are always created as "heap" tables. + """ - # Determines if `primary_key` and `unique` column hints are applied. # Set to False by default because the PRIMARY KEY and UNIQUE constraints # are tricky in Synapse: they are NOT ENFORCED and can lead to innacurate # results if the user does not ensure all column values are unique. # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints create_indexes: Optional[bool] = False + """Whether `primary_key` and `unique` column hints are applied.""" + + # Concurrency is disabled by overriding the configured number of workers to 1 at runtime. + auto_disable_concurrency: Optional[bool] = True + """Whether concurrency is automatically disabled in cases where it might cause issues.""" + + __config_gen_annotations__: ClassVar[List[str]] = [ + "default_table_index_type", + "create_indexes", + "auto_disable_concurrency", + ] + + def get_load_workers(self, write_disposition: TWriteDisposition, workers: int) -> int: + if ( + write_disposition == "replace" + and self.replace_strategy == "staging-optimized" + and workers > 1 + ): + print("auto_disable_concurrency:", self.auto_disable_concurrency) + warning_msg_shared = ( + 'Data is being loaded into Synapse with write disposition "replace"' + ' and replace strategy "staging-optimized", while the number of' + f" load workers ({workers}) > 1. This configuration is problematic" + " in some cases, because Synapse does not always handle concurrency well" + " with the CTAS queries that are used behind the scenes to implement" + ' the "staging-optimized" strategy.' + ) + if self.auto_disable_concurrency: + logger.warning( + warning_msg_shared + + " The number of load workers will be automatically adjusted" + " and set to 1 to eliminate concurrency and prevent potential" + " issues. 
If you don't want this to happen, set the" + " DESTINATION__SYNAPSE__AUTO_DISABLE_CONCURRENCY environment" + ' variable to "false", or add the following to your config TOML:' + "\n\n[destination.synapse]\nauto_disable_concurrency = false\n" + ) + workers = 1 # adjust workers + else: + logger.warning( + warning_msg_shared + + " If you experience your pipeline gets stuck and doesn't finish," + " try reducing the number of load workers by exporting the LOAD__WORKERS" + " environment variable or by setting it in your config TOML:" + "\n\n[load]\nworkers = 1 # a value of 1 disables all concurrency," + " but perhaps a higher value also works\n\n" + "Alternatively, you can set the DESTINATION__SYNAPSE__AUTO_DISABLE_CONCURRENCY" + ' environment variable to "true", or add the following to your config TOML' + " to automatically disable concurrency where needed:" + "\n\n[destination.synapse]\nauto_disable_concurrency = true\n" + ) + return workers diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index 6bdf2946b6..f77d8c11c2 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -30,6 +30,7 @@ def __init__( credentials: t.Union[SynapseCredentials, t.Dict[str, t.Any], str] = None, default_table_index_type: t.Optional[TTableIndexType] = "heap", create_indexes: bool = False, + auto_disable_concurrency: t.Optional[bool] = True, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -41,17 +42,16 @@ def __init__( Args: credentials: Credentials to connect to the Synapse dedicated pool. Can be an instance of `SynapseCredentials` or a connection string in the format `synapse://user:password@host:port/database` - default_table_index_type: Table index type that is used if no - table index type is specified on the resource. This setting only - applies to data tables, dlt system tables are not affected - (they always have "heap" as table index type). - create_indexes: Should unique indexes be created, defaults to False + default_table_index_type: Maps directly to the default_table_index_type attribute of the SynapseClientConfiguration object. + create_indexes: Maps directly to the create_indexes attribute of the SynapseClientConfiguration object. + auto_disable_concurrency: Maps directly to the auto_disable_concurrency attribute of the SynapseClientConfiguration object. 
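The warning messages above already quote the relevant settings; for reference, a small sketch of the two knobs set via environment variables, mirroring how the tests configure them (the values shown are illustrative):

```python
import os

# Keep concurrency and accept the trade-off described in the warning above...
os.environ["DESTINATION__SYNAPSE__AUTO_DISABLE_CONCURRENCY"] = "false"

# ...or explicitly disable load concurrency instead.
os.environ["LOAD__WORKERS"] = "1"
```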
**kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, default_table_index_type=default_table_index_type, create_indexes=create_indexes, + auto_disable_concurrency=auto_disable_concurrency, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 73c8f076d1..44a2cbdfdb 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -45,7 +45,7 @@ TAnySchemaColumns, TSchemaContract, ) -from dlt.common.schema.utils import normalize_schema_name +from dlt.common.schema.utils import normalize_schema_name, get_write_disposition from dlt.common.storages.exceptions import LoadPackageNotFound from dlt.common.typing import DictStrStr, TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -483,6 +483,13 @@ def load( # make sure that destination is set and client is importable and can be instantiated client, staging_client = self._get_destination_clients(self.default_schema) + # for synapse we might need to adjust the number of load workers + if self.destination.destination_name == "synapse": + write_disposition = get_write_disposition( + self.default_schema.tables, self.default_schema.data_table_names()[0] + ) + workers = client.config.get_load_workers(write_disposition, workers) # type: ignore[attr-defined] + # create default loader config and the loader load_config = LoaderConfiguration( workers=workers, diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index 1dde56a6b1..65d3646f2d 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -264,16 +264,14 @@ def test_replace_table_clearing( # use staging tables for replace os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy - if destination_config.destination == "synapse" and replace_strategy == "staging-optimized": - # The "staging-optimized" replace strategy makes Synapse suspend the CTAS - # queries used to recreate the staging table, and hang, when the number - # of load workers is greater than 1. 
- os.environ["LOAD__WORKERS"] = "1" - pipeline = destination_config.setup_pipeline( "test_replace_table_clearing", dataset_name="test_replace_table_clearing", full_refresh=True ) + if destination_config.destination == "synapse" and replace_strategy == "staging-optimized": + # this case requires load concurrency to be disabled (else the test gets stuck) + assert pipeline.destination_client().config.auto_disable_concurrency is True # type: ignore[attr-defined] + @dlt.resource(name="main_resource", write_disposition="replace", primary_key="id") def items_with_subitems(): data = { From 75be2ce54ccb486679ca1b177551c3097a2f3908 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 23 Jan 2024 20:26:06 +0100 Subject: [PATCH 06/23] rewrite naive code to prevent IndexError --- dlt/destinations/impl/synapse/configuration.py | 14 +++++++++----- dlt/pipeline/pipeline.py | 7 ++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py index b5eec82e9e..119c55ad7a 100644 --- a/dlt/destinations/impl/synapse/configuration.py +++ b/dlt/destinations/impl/synapse/configuration.py @@ -1,8 +1,9 @@ from typing import Final, Any, List, Dict, Optional, ClassVar -from dlt.common.configuration import configspec -from dlt.common.schema.typing import TTableIndexType, TWriteDisposition from dlt.common import logger +from dlt.common.configuration import configspec +from dlt.common.schema.typing import TTableIndexType, TSchemaTables +from dlt.common.schema.utils import get_write_disposition from dlt.destinations.impl.mssql.configuration import ( MsSqlCredentials, @@ -60,13 +61,16 @@ class SynapseClientConfiguration(MsSqlClientConfiguration): "auto_disable_concurrency", ] - def get_load_workers(self, write_disposition: TWriteDisposition, workers: int) -> int: + def get_load_workers(self, tables: TSchemaTables, workers: int) -> int: + """Returns the adjusted number of load workers to prevent concurrency issues.""" + + write_dispositions = [get_write_disposition(tables, table_name) for table_name in tables] + n_replace_dispositions = len([d for d in write_dispositions if d == "replace"]) if ( - write_disposition == "replace" + n_replace_dispositions > 1 and self.replace_strategy == "staging-optimized" and workers > 1 ): - print("auto_disable_concurrency:", self.auto_disable_concurrency) warning_msg_shared = ( 'Data is being loaded into Synapse with write disposition "replace"' ' and replace strategy "staging-optimized", while the number of' diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 44a2cbdfdb..3a0a8f3931 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -45,7 +45,7 @@ TAnySchemaColumns, TSchemaContract, ) -from dlt.common.schema.utils import normalize_schema_name, get_write_disposition +from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.exceptions import LoadPackageNotFound from dlt.common.typing import DictStrStr, TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -485,10 +485,7 @@ def load( # for synapse we might need to adjust the number of load workers if self.destination.destination_name == "synapse": - write_disposition = get_write_disposition( - self.default_schema.tables, self.default_schema.data_table_names()[0] - ) - workers = client.config.get_load_workers(write_disposition, workers) # type: ignore[attr-defined] + workers = 
client.config.get_load_workers(self.default_schema.tables, workers) # type: ignore[attr-defined] # create default loader config and the loader load_config = LoaderConfiguration( From 014543aa5adb7669adead1cbda39cb21268c9070 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 25 Jan 2024 19:56:21 +0100 Subject: [PATCH 07/23] add support for staged Parquet loading --- dlt/destinations/impl/synapse/__init__.py | 4 +- .../impl/synapse/configuration.py | 8 +- dlt/destinations/impl/synapse/factory.py | 5 +- dlt/destinations/impl/synapse/synapse.py | 115 +++++++++++++++++- poetry.lock | 4 +- pyproject.toml | 2 +- tests/load/pipeline/test_pipelines.py | 17 +-- tests/load/pipeline/test_stage_loading.py | 35 +++++- tests/load/utils.py | 17 +++ 9 files changed, 182 insertions(+), 25 deletions(-) diff --git a/dlt/destinations/impl/synapse/__init__.py b/dlt/destinations/impl/synapse/__init__.py index 175b011186..639d8a598f 100644 --- a/dlt/destinations/impl/synapse/__init__.py +++ b/dlt/destinations/impl/synapse/__init__.py @@ -9,8 +9,8 @@ def capabilities() -> DestinationCapabilitiesContext: caps.preferred_loader_file_format = "insert_values" caps.supported_loader_file_formats = ["insert_values"] - caps.preferred_staging_file_format = None - caps.supported_staging_file_formats = [] + caps.preferred_staging_file_format = "parquet" + caps.supported_staging_file_formats = ["parquet"] caps.insert_values_writer_type = "select_union" # https://stackoverflow.com/a/77014299 diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py index 119c55ad7a..34b227a2ac 100644 --- a/dlt/destinations/impl/synapse/configuration.py +++ b/dlt/destinations/impl/synapse/configuration.py @@ -48,17 +48,21 @@ class SynapseClientConfiguration(MsSqlClientConfiguration): # are tricky in Synapse: they are NOT ENFORCED and can lead to innacurate # results if the user does not ensure all column values are unique. # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints - create_indexes: Optional[bool] = False + create_indexes: bool = False """Whether `primary_key` and `unique` column hints are applied.""" # Concurrency is disabled by overriding the configured number of workers to 1 at runtime. 
- auto_disable_concurrency: Optional[bool] = True + auto_disable_concurrency: bool = True """Whether concurrency is automatically disabled in cases where it might cause issues.""" + staging_use_msi: bool = False + """Whether the managed identity of the Synapse workspace is used to authorize access to the staging Storage Account.""" + __config_gen_annotations__: ClassVar[List[str]] = [ "default_table_index_type", "create_indexes", "auto_disable_concurrency", + "staging_use_msi", ] def get_load_workers(self, tables: TSchemaTables, workers: int) -> int: diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index f77d8c11c2..3d951f3d4a 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -30,7 +30,8 @@ def __init__( credentials: t.Union[SynapseCredentials, t.Dict[str, t.Any], str] = None, default_table_index_type: t.Optional[TTableIndexType] = "heap", create_indexes: bool = False, - auto_disable_concurrency: t.Optional[bool] = True, + auto_disable_concurrency: bool = True, + staging_use_msi: bool = False, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, **kwargs: t.Any, @@ -45,6 +46,7 @@ def __init__( default_table_index_type: Maps directly to the default_table_index_type attribute of the SynapseClientConfiguration object. create_indexes: Maps directly to the create_indexes attribute of the SynapseClientConfiguration object. auto_disable_concurrency: Maps directly to the auto_disable_concurrency attribute of the SynapseClientConfiguration object. + auto_disable_concurrency: Maps directly to the staging_use_msi attribute of the SynapseClientConfiguration object. **kwargs: Additional arguments passed to the destination config """ super().__init__( @@ -52,6 +54,7 @@ def __init__( default_table_index_type=default_table_index_type, create_indexes=create_indexes, auto_disable_concurrency=auto_disable_concurrency, + staging_use_msi=staging_use_msi, destination_name=destination_name, environment=environment, **kwargs, diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index e01e851d83..c29c0df3f5 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -1,17 +1,28 @@ +import os from typing import ClassVar, Sequence, List, Dict, Any, Optional, cast from copy import deepcopy from textwrap import dedent +from urllib.parse import urlparse, urlunparse from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import SupportsStagingDestination, NewLoadJob +from dlt.common.destination.reference import ( + SupportsStagingDestination, + NewLoadJob, + CredentialsConfiguration, +) from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint +from dlt.common.schema.utils import table_schema_has_type from dlt.common.schema.typing import TTableSchemaColumns, TTableIndexType +from dlt.common.configuration.specs import AzureCredentialsWithoutDefaults + +from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.job_client_impl import SqlJobClientBase +from dlt.destinations.job_client_impl import SqlJobClientBase, LoadJob, CopyRemoteFileLoadJob +from dlt.destinations.exceptions import LoadJobTerminalException from 
dlt.destinations.impl.mssql.mssql import ( MsSqlTypeMapper, @@ -35,7 +46,7 @@ } -class SynapseClient(MsSqlClient): +class SynapseClient(MsSqlClient, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: @@ -140,6 +151,21 @@ def get_storage_table_index_type(self, table_name: str) -> TTableIndexType: table_index_type = sql_client.execute_sql(sql)[0][0] return cast(TTableIndexType, table_index_type) + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: + job = super().start_file_load(table, file_path, load_id) + if not job: + assert NewReferenceJob.is_reference_job( + file_path + ), "Synapse must use staging to load files" + job = SynapseCopyFileLoadJob( + table, + file_path, + self.sql_client, + cast(AzureCredentialsWithoutDefaults, self.config.staging_config.credentials), + self.config.staging_use_msi, + ) + return job + class SynapseStagingCopyJob(SqlStagingCopyJob): @classmethod @@ -173,3 +199,86 @@ def generate_sql( ) return sql + + +class SynapseCopyFileLoadJob(CopyRemoteFileLoadJob): + def __init__( + self, + table: TTableSchema, + file_path: str, + sql_client: SqlClientBase[Any], + staging_credentials: Optional[AzureCredentialsWithoutDefaults] = None, + staging_use_msi: bool = False, + ) -> None: + self.staging_use_msi = staging_use_msi + super().__init__(table, file_path, sql_client, staging_credentials) + + def execute(self, table: TTableSchema, bucket_path: str) -> None: + # get format + ext = os.path.splitext(bucket_path)[1][1:] + if ext == "parquet": + if table_schema_has_type(table, "time"): + # Synapse interprets Parquet TIME columns as bigint, resulting in + # an incompatibility error. + raise LoadJobTerminalException( + self.file_name(), + "Synapse cannot load TIME columns from Parquet files. Switch to direct INSERT" + " file format or convert `datetime.time` objects in your data to `str` or" + " `datetime.datetime`", + ) + file_type = "PARQUET" + + # dlt-generated DDL statements will still create the table, but + # enabling AUTO_CREATE_TABLE prevents a MalformedInputException. + auto_create_table = "ON" + else: + raise ValueError(f"Unsupported file type {ext} for Synapse.") + + staging_credentials = self._staging_credentials + assert staging_credentials is not None + assert isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + azure_storage_account_name = staging_credentials.azure_storage_account_name + https_path = self._get_https_path(bucket_path, azure_storage_account_name) + table_name = table["name"] + + if self.staging_use_msi: + credential = "IDENTITY = 'Managed Identity'" + else: + sas_token = staging_credentials.azure_storage_sas_token + credential = f"IDENTITY = 'Shared Access Signature', SECRET = '{sas_token}'" + + # Copy data from staging file into Synapse table. + with self._sql_client.begin_transaction(): + dataset_name = self._sql_client.dataset_name + sql = dedent(f""" + COPY INTO [{dataset_name}].[{table_name}] + FROM '{https_path}' + WITH ( + FILE_TYPE = '{file_type}', + CREDENTIAL = ({credential}), + AUTO_CREATE_TABLE = '{auto_create_table}' + ) + """) + self._sql_client.execute_sql(sql) + + def exception(self) -> str: + # this part of code should be never reached + raise NotImplementedError() + + def _get_https_path(self, bucket_path: str, storage_account_name: str) -> str: + """ + Converts a path in the form of az:/// to + https://.blob.core.windows.net// + as required by Synapse. 
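The path conversion described above can be reproduced with the standard library alone. An illustrative example with a made-up container, load path, and storage account name:

```python
# Illustrative only: how an az:// staging path maps to the https:// form that
# Synapse COPY INTO expects; all names here are hypothetical.
from urllib.parse import urlparse, urlunparse

bucket_path = "az://my-container/load_id/file.parquet"
storage_account_name = "mystorageaccount"

url = urlparse(bucket_path)
https_url = url._replace(
    scheme="https",
    netloc=f"{storage_account_name}.blob.core.windows.net",
    path="/" + url.netloc + url.path,
)
print(urlunparse(https_url))
# https://mystorageaccount.blob.core.windows.net/my-container/load_id/file.parquet
```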
+ """ + bucket_url = urlparse(bucket_path) + # "blob" endpoint has better performance than "dfs" endoint + # https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql?view=azure-sqldw-latest#external-locations + endpoint = "blob" + _path = "/" + bucket_url.netloc + bucket_url.path + https_url = bucket_url._replace( + scheme="https", + netloc=f"{storage_account_name}.{endpoint}.core.windows.net", + path=_path, + ) + return urlunparse(https_url) diff --git a/poetry.lock b/poetry.lock index 4d079fc44d..400bcb61e2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8466,10 +8466,10 @@ qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] snowflake = ["snowflake-connector-python"] -synapse = ["pyodbc"] +synapse = ["adlfs", "pyodbc"] weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "26c595a857f17a5cbdb348f165c267d8910412325be4e522d0e91224c7fec588" +content-hash = "75a5f533e9456898ad0157b699d76d9c5a1abf8f4cd04ed7be2235ae3198e16c" diff --git a/pyproject.toml b/pyproject.toml index d9d5858674..f6ae77b593 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,7 @@ cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] weaviate = ["weaviate-client"] mssql = ["pyodbc"] -synapse = ["pyodbc"] +synapse = ["pyodbc", "adlfs"] qdrant = ["qdrant-client"] [tool.poetry.scripts] diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index d170fd553b..304f1a0d2f 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -788,7 +788,7 @@ def other_data(): column_schemas["col11_precision"]["precision"] = 0 # drop TIME from databases not supporting it via parquet - if destination_config.destination in ["redshift", "athena"]: + if destination_config.destination in ["redshift", "athena", "synapse"]: data_types.pop("col11") data_types.pop("col11_null") data_types.pop("col11_precision") @@ -827,15 +827,16 @@ def some_source(): assert len(package_info.jobs["completed_jobs"]) == expected_completed_jobs with pipeline.sql_client() as sql_client: + qual_name = sql_client.make_qualified_table_name assert [ - row[0] for row in sql_client.execute_sql("SELECT * FROM other_data ORDER BY 1") + row[0] + for row in sql_client.execute_sql(f"SELECT * FROM {qual_name('other_data')} ORDER BY 1") ] == [1, 2, 3, 4, 5] - assert [row[0] for row in sql_client.execute_sql("SELECT * FROM some_data ORDER BY 1")] == [ - 1, - 2, - 3, - ] - db_rows = sql_client.execute_sql("SELECT * FROM data_types") + assert [ + row[0] + for row in sql_client.execute_sql(f"SELECT * FROM {qual_name('some_data')} ORDER BY 1") + ] == [1, 2, 3] + db_rows = sql_client.execute_sql(f"SELECT * FROM {qual_name('data_types')}") assert len(db_rows) == 10 db_row = list(db_rows[0]) # "snowflake" and "bigquery" do not parse JSON form parquet string so double parse diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index de4a7f4c3b..ca27cf4b05 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -94,7 +94,13 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: # check item of first row in db with pipeline.sql_client() as sql_client: - rows = sql_client.execute_sql("SELECT url FROM issues WHERE id = 388089021 LIMIT 1") + if destination_config.destination in ["mssql", "synapse"]: + qual_name = 
sql_client.make_qualified_table_name + rows = sql_client.execute_sql( + f"SELECT TOP 1 url FROM {qual_name('issues')} WHERE id = 388089021" + ) + else: + rows = sql_client.execute_sql("SELECT url FROM issues WHERE id = 388089021 LIMIT 1") assert rows[0][0] == "https://api.github.com/repos/duckdb/duckdb/issues/71" if destination_config.supports_merge: @@ -109,10 +115,23 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: # check changes where merged in with pipeline.sql_client() as sql_client: - rows = sql_client.execute_sql("SELECT number FROM issues WHERE id = 1232152492 LIMIT 1") - assert rows[0][0] == 105 - rows = sql_client.execute_sql("SELECT number FROM issues WHERE id = 1142699354 LIMIT 1") - assert rows[0][0] == 300 + if destination_config.destination in ["mssql", "synapse"]: + qual_name = sql_client.make_qualified_table_name + rows_1 = sql_client.execute_sql( + f"SELECT TOP 1 number FROM {qual_name('issues')} WHERE id = 1232152492" + ) + rows_2 = sql_client.execute_sql( + f"SELECT TOP 1 number FROM {qual_name('issues')} WHERE id = 1142699354" + ) + else: + rows_1 = sql_client.execute_sql( + "SELECT number FROM issues WHERE id = 1232152492 LIMIT 1" + ) + rows_2 = sql_client.execute_sql( + "SELECT number FROM issues WHERE id = 1142699354 LIMIT 1" + ) + assert rows_1[0][0] == 105 + assert rows_2[0][0] == 300 # test append info = pipeline.run( @@ -161,6 +180,9 @@ def test_all_data_types(destination_config: DestinationTestConfiguration) -> Non ) and destination_config.file_format in ("parquet", "jsonl"): # Redshift copy doesn't support TIME column exclude_types.append("time") + if destination_config.destination == "synapse" and destination_config.file_format == "parquet": + # TIME columns are not supported for staged parquet loads into Synapse + exclude_types.append("time") if destination_config.destination == "redshift" and destination_config.file_format in ( "parquet", "jsonl", @@ -199,7 +221,8 @@ def my_source(): assert_load_info(info) with pipeline.sql_client() as sql_client: - db_rows = sql_client.execute_sql("SELECT * FROM data_types") + qual_name = sql_client.make_qualified_table_name + db_rows = sql_client.execute_sql(f"SELECT * FROM {qual_name('data_types')}") assert len(db_rows) == 10 db_row = list(db_rows[0]) # parquet is not really good at inserting json, best we get are strings in JSON columns diff --git a/tests/load/utils.py b/tests/load/utils.py index 55445e0b95..207e32209f 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -95,6 +95,7 @@ class DestinationTestConfiguration: bucket_url: Optional[str] = None stage_name: Optional[str] = None staging_iam_role: Optional[str] = None + staging_use_msi: bool = False extra_info: Optional[str] = None supports_merge: bool = True # TODO: take it from client base class force_iceberg: bool = False @@ -118,6 +119,7 @@ def setup(self) -> None: os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = self.bucket_url or "" os.environ["DESTINATION__STAGE_NAME"] = self.stage_name or "" os.environ["DESTINATION__STAGING_IAM_ROLE"] = self.staging_iam_role or "" + os.environ["DESTINATION__STAGING_USE_MSI"] = str(self.staging_use_msi) or "" os.environ["DESTINATION__FORCE_ICEBERG"] = str(self.force_iceberg) or "" """For the filesystem destinations we disable compression to make analyzing the result easier""" @@ -254,6 +256,21 @@ def destinations_configs( bucket_url=AZ_BUCKET, extra_info="az-authorization", ), + DestinationTestConfiguration( + destination="synapse", + staging="filesystem", + 
file_format="parquet", + bucket_url=AZ_BUCKET, + extra_info="az-authorization", + ), + DestinationTestConfiguration( + destination="synapse", + staging="filesystem", + file_format="parquet", + bucket_url=AZ_BUCKET, + staging_use_msi=True, + extra_info="az-managed-identity", + ), ] if all_staging_configs: From 7868ca6bfd54ff691e8e84384a65c7b9c55a00f4 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 26 Jan 2024 21:36:42 +0100 Subject: [PATCH 08/23] made table index type logic Synapse specific through destination adapter --- dlt/common/destination/reference.py | 4 +- dlt/common/schema/typing.py | 3 -- dlt/common/schema/utils.py | 12 ----- dlt/destinations/adapters.py | 3 +- .../impl/qdrant/qdrant_adapter.py | 11 +--- dlt/destinations/impl/synapse/__init__.py | 2 + .../impl/synapse/configuration.py | 4 +- dlt/destinations/impl/synapse/factory.py | 4 +- dlt/destinations/impl/synapse/synapse.py | 23 ++++++--- .../impl/synapse/synapse_adapter.py | 50 +++++++++++++++++++ .../impl/weaviate/weaviate_adapter.py | 11 +--- dlt/destinations/utils.py | 16 ++++++ dlt/extract/decorators.py | 7 --- dlt/extract/hints.py | 3 -- .../test_table_indexing.py | 46 ++++++++--------- 15 files changed, 117 insertions(+), 82 deletions(-) create mode 100644 dlt/destinations/impl/synapse/synapse_adapter.py create mode 100644 dlt/destinations/utils.py rename tests/load/{pipeline => synapse}/test_table_indexing.py (81%) diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 59f13b30b9..1c28dffa8c 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -34,7 +34,7 @@ from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.typing import TWriteDisposition from dlt.common.schema.exceptions import InvalidDatasetName -from dlt.common.schema.utils import get_write_disposition, get_table_format, get_table_index_type +from dlt.common.schema.utils import get_write_disposition, get_table_format from dlt.common.configuration import configspec, with_config, resolve_configuration, known_sections from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config @@ -372,8 +372,6 @@ def get_load_table(self, table_name: str, prepare_for_staging: bool = False) -> table["write_disposition"] = get_write_disposition(self.schema.tables, table_name) if "table_format" not in table: table["table_format"] = get_table_format(self.schema.tables, table_name) - if "table_index_type" not in table: - table["table_index_type"] = get_table_index_type(self.schema.tables, table_name) return table except KeyError: raise UnknownTableException(table_name) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 351d666553..9a27cbe4bb 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -62,8 +62,6 @@ """Known hints of a column used to declare hint regexes.""" TWriteDisposition = Literal["skip", "append", "replace", "merge"] TTableFormat = Literal["iceberg"] -TTableIndexType = Literal["heap", "clustered_columnstore_index"] -"Table index type. Currently only used for Synapse destination." 
TTypeDetections = Literal[ "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double" ] @@ -167,7 +165,6 @@ class TTableSchema(TypedDict, total=False): columns: TTableSchemaColumns resource: Optional[str] table_format: Optional[TTableFormat] - table_index_type: Optional[TTableIndexType] class TPartialTableSchema(TTableSchema): diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 5ea244148e..dc243f50dd 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -32,7 +32,6 @@ TColumnSchema, TColumnProp, TTableFormat, - TTableIndexType, TColumnHint, TTypeDetectionFunc, TTypeDetections, @@ -619,14 +618,6 @@ def get_table_format(tables: TSchemaTables, table_name: str) -> TTableFormat: ) -def get_table_index_type(tables: TSchemaTables, table_name: str) -> TTableIndexType: - """Returns table index type of a table if present. If not, looks up into parent table.""" - return cast( - TTableIndexType, - get_inherited_table_hint(tables, table_name, "table_index_type", allow_none=True), - ) - - def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool: """Checks if `table` schema contains column with type _typ""" return any(c.get("data_type") == _typ for c in table["columns"].values()) @@ -733,7 +724,6 @@ def new_table( resource: str = None, schema_contract: TSchemaContract = None, table_format: TTableFormat = None, - table_index_type: TTableIndexType = None, ) -> TTableSchema: table: TTableSchema = { "name": table_name, @@ -752,8 +742,6 @@ def new_table( table["schema_contract"] = schema_contract if table_format: table["table_format"] = table_format - if table_index_type is not None: - table["table_index_type"] = table_index_type if validate_schema: validate_dict_ignoring_xkeys( spec=TColumnSchema, diff --git a/dlt/destinations/adapters.py b/dlt/destinations/adapters.py index b8f12599dc..22c98d4f5a 100644 --- a/dlt/destinations/adapters.py +++ b/dlt/destinations/adapters.py @@ -2,5 +2,6 @@ from dlt.destinations.impl.weaviate import weaviate_adapter from dlt.destinations.impl.qdrant import qdrant_adapter +from dlt.destinations.impl.synapse import synapse_adapter -__all__ = ["weaviate_adapter", "qdrant_adapter"] +__all__ = ["weaviate_adapter", "qdrant_adapter", "synapse_adapter"] diff --git a/dlt/destinations/impl/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py index 243cbd6c5b..215d87a920 100644 --- a/dlt/destinations/impl/qdrant/qdrant_adapter.py +++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py @@ -2,6 +2,7 @@ from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns from dlt.extract import DltResource, resource as make_resource +from dlt.destinations.utils import ensure_resource VECTORIZE_HINT = "x-qdrant-embed" @@ -31,15 +32,7 @@ def qdrant_adapter( >>> qdrant_adapter(data, embed="description") [DltResource with hints applied] """ - # wrap `data` in a resource if not an instance already - resource: DltResource - if not isinstance(data, DltResource): - resource_name: str = None - if not hasattr(data, "__name__"): - resource_name = "content" - resource = make_resource(data, name=resource_name) - else: - resource = data + resource = ensure_resource(data) column_hints: TTableSchemaColumns = {} diff --git a/dlt/destinations/impl/synapse/__init__.py b/dlt/destinations/impl/synapse/__init__.py index 639d8a598f..53dbabc090 100644 --- a/dlt/destinations/impl/synapse/__init__.py +++ b/dlt/destinations/impl/synapse/__init__.py @@ -3,6 +3,8 @@ from dlt.common.arithmetics import 
DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.wei import EVM_DECIMAL_PRECISION +from dlt.destinations.impl.synapse.synapse_adapter import synapse_adapter + def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py index 34b227a2ac..cc0e40114b 100644 --- a/dlt/destinations/impl/synapse/configuration.py +++ b/dlt/destinations/impl/synapse/configuration.py @@ -2,7 +2,7 @@ from dlt.common import logger from dlt.common.configuration import configspec -from dlt.common.schema.typing import TTableIndexType, TSchemaTables +from dlt.common.schema.typing import TSchemaTables from dlt.common.schema.utils import get_write_disposition from dlt.destinations.impl.mssql.configuration import ( @@ -11,6 +11,8 @@ ) from dlt.destinations.impl.mssql.configuration import MsSqlCredentials +from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType + @configspec class SynapseCredentials(MsSqlCredentials): diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index 3d951f3d4a..0ac58001ca 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -1,13 +1,13 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext -from dlt.common.schema.typing import TTableIndexType -from dlt.destinations.impl.synapse import capabilities +from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.configuration import ( SynapseCredentials, SynapseClientConfiguration, ) +from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType if t.TYPE_CHECKING: from dlt.destinations.impl.synapse.synapse import SynapseClient diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index c29c0df3f5..d34fef1ab4 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -12,8 +12,8 @@ ) from dlt.common.schema import TTableSchema, TColumnSchema, Schema, TColumnHint -from dlt.common.schema.utils import table_schema_has_type -from dlt.common.schema.typing import TTableSchemaColumns, TTableIndexType +from dlt.common.schema.utils import table_schema_has_type, get_inherited_table_hint +from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.configuration.specs import AzureCredentialsWithoutDefaults @@ -34,6 +34,10 @@ from dlt.destinations.impl.synapse import capabilities from dlt.destinations.impl.synapse.sql_client import SynapseSqlClient from dlt.destinations.impl.synapse.configuration import SynapseClientConfiguration +from dlt.destinations.impl.synapse.synapse_adapter import ( + TABLE_INDEX_TYPE_HINT, + TTableIndexType, +) HINT_TO_SYNAPSE_ATTR: Dict[TColumnHint, str] = { @@ -68,7 +72,7 @@ def _get_table_update_sql( if table is None: table_index_type = self.config.default_table_index_type else: - table_index_type = table.get("table_index_type") + table_index_type = cast(TTableIndexType, table.get(TABLE_INDEX_TYPE_HINT)) if table_index_type == "clustered_columnstore_index": new_columns = self._get_columstore_valid_columns(new_columns) @@ -128,9 +132,16 @@ def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema # configuration. Why? "For small lookup tables, less than 60 million rows, # consider using HEAP or clustered index for faster query performance." 
# https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables - table["table_index_type"] = "heap" - if table["table_index_type"] is None: - table["table_index_type"] = self.config.default_table_index_type + table[TABLE_INDEX_TYPE_HINT] = "heap" # type: ignore[typeddict-unknown-key] + elif table_name in self.schema.data_table_names(): + if TABLE_INDEX_TYPE_HINT not in table: + # If present in parent table, fetch hint from there. + table[TABLE_INDEX_TYPE_HINT] = get_inherited_table_hint( # type: ignore[typeddict-unknown-key] + self.schema.tables, table_name, TABLE_INDEX_TYPE_HINT, allow_none=True + ) + if table[TABLE_INDEX_TYPE_HINT] is None: # type: ignore[typeddict-item] + # Hint still not defined, fall back to default. + table[TABLE_INDEX_TYPE_HINT] = self.config.default_table_index_type # type: ignore[typeddict-unknown-key] return table def get_storage_table_index_type(self, table_name: str) -> TTableIndexType: diff --git a/dlt/destinations/impl/synapse/synapse_adapter.py b/dlt/destinations/impl/synapse/synapse_adapter.py new file mode 100644 index 0000000000..f135dd967a --- /dev/null +++ b/dlt/destinations/impl/synapse/synapse_adapter.py @@ -0,0 +1,50 @@ +from typing import Any, Literal, Set, get_args, Final + +from dlt.extract import DltResource, resource as make_resource +from dlt.extract.typing import TTableHintTemplate +from dlt.extract.hints import TResourceHints +from dlt.destinations.utils import ensure_resource + +TTableIndexType = Literal["heap", "clustered_columnstore_index"] +""" +Table [index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) used when creating the Synapse table. +This regards indexes specified at the table level, not the column level. +""" +TABLE_INDEX_TYPES: Set[TTableIndexType] = set(get_args(TTableIndexType)) + +TABLE_INDEX_TYPE_HINT: Literal["x-table-index-type"] = "x-table-index-type" + + +def synapse_adapter(data: Any, table_index_type: TTableIndexType = None) -> DltResource: + """Prepares data for the Synapse destination by specifying which table index + type should be used. + + Args: + data (Any): The data to be transformed. It can be raw data or an instance + of DltResource. If raw data, the function wraps it into a DltResource + object. + table_index_type (TTableIndexType, optional): The table index type used when creating + the Synapse table. + + Returns: + DltResource: A resource with applied Synapse-specific hints. + + Raises: + ValueError: If input for `table_index_type` is invalid. + + Examples: + >>> data = [{"name": "Anush", "description": "Integrations Hacker"}] + >>> synapse_adapter(data, table_index_type="clustered_columnstore_index") + [DltResource with hints applied] + """ + resource = ensure_resource(data) + + if table_index_type is not None: + if table_index_type not in TABLE_INDEX_TYPES: + allowed_types = ", ".join(TABLE_INDEX_TYPES) + raise ValueError( + f"Table index type {table_index_type} is invalid. Allowed table index" + f" types are: {allowed_types}." 
+ ) + resource._hints[TABLE_INDEX_TYPE_HINT] = table_index_type # type: ignore[typeddict-unknown-key] + return resource diff --git a/dlt/destinations/impl/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py index 2d5161d9e9..a290ac65b4 100644 --- a/dlt/destinations/impl/weaviate/weaviate_adapter.py +++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py @@ -2,6 +2,7 @@ from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns from dlt.extract import DltResource, resource as make_resource +from dlt.destinations.utils import ensure_resource TTokenizationTMethod = Literal["word", "lowercase", "whitespace", "field"] TOKENIZATION_METHODS: Set[TTokenizationTMethod] = set(get_args(TTokenizationTMethod)) @@ -53,15 +54,7 @@ def weaviate_adapter( >>> weaviate_adapter(data, vectorize="description", tokenization={"description": "word"}) [DltResource with hints applied] """ - # wrap `data` in a resource if not an instance already - resource: DltResource - if not isinstance(data, DltResource): - resource_name: str = None - if not hasattr(data, "__name__"): - resource_name = "content" - resource = make_resource(data, name=resource_name) - else: - resource = data + resource = ensure_resource(data) column_hints: TTableSchemaColumns = {} if vectorize: diff --git a/dlt/destinations/utils.py b/dlt/destinations/utils.py new file mode 100644 index 0000000000..d4b945a840 --- /dev/null +++ b/dlt/destinations/utils.py @@ -0,0 +1,16 @@ +from typing import Any + +from dlt.extract import DltResource, resource as make_resource + + +def ensure_resource(data: Any) -> DltResource: + """Wraps `data` in a DltResource if it's not a DltResource already.""" + resource: DltResource + if not isinstance(data, DltResource): + resource_name: str = None + if not hasattr(data, "__name__"): + resource_name = "content" + resource = make_resource(data, name=resource_name) + else: + resource = data + return resource diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 573d3d3ad0..cf7426e683 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -36,7 +36,6 @@ TAnySchemaColumns, TSchemaContract, TTableFormat, - TTableIndexType, ) from dlt.extract.utils import ( ensure_table_schema_columns_hint, @@ -257,7 +256,6 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, - table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, ) -> DltResource: ... @@ -275,7 +273,6 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, - table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, ) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: ... 
@@ -293,7 +290,6 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, - table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: Literal[True] = True, @@ -312,7 +308,6 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, - table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, ) -> DltResource: ... @@ -329,7 +324,6 @@ def resource( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, - table_index_type: TTableHintTemplate[TTableIndexType] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, standalone: bool = False, @@ -409,7 +403,6 @@ def make_resource( merge_key=merge_key, schema_contract=schema_contract, table_format=table_format, - table_index_type=table_index_type, ) return DltResource.from_data( _data, diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 36354eb0da..437dbbc6bd 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -12,7 +12,6 @@ TWriteDisposition, TAnySchemaColumns, TTableFormat, - TTableIndexType, TSchemaContract, ) from dlt.common.typing import TDataItem @@ -275,7 +274,6 @@ def new_table_template( merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, - table_index_type: TTableHintTemplate[TTableIndexType] = None, ) -> TResourceHints: validator, schema_contract = create_item_validator(columns, schema_contract) clean_columns = columns @@ -291,7 +289,6 @@ def new_table_template( columns=clean_columns, # type: ignore schema_contract=schema_contract, # type: ignore table_format=table_format, # type: ignore - table_index_type=table_index_type, # type: ignore ) if not table_name: new_template.pop("name") diff --git a/tests/load/pipeline/test_table_indexing.py b/tests/load/synapse/test_table_indexing.py similarity index 81% rename from tests/load/pipeline/test_table_indexing.py rename to tests/load/synapse/test_table_indexing.py index 5f62cddfee..097bde09f9 100644 --- a/tests/load/pipeline/test_table_indexing.py +++ b/tests/load/synapse/test_table_indexing.py @@ -5,16 +5,13 @@ import dlt from dlt.common.schema import TColumnSchema -from dlt.common.schema.typing import TTableIndexType, TSchemaTables -from dlt.common.schema.utils import get_table_index_type from dlt.destinations.sql_client import SqlClientBase +from dlt.destinations.impl.synapse import synapse_adapter +from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType + from tests.load.utils import TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES -from tests.load.pipeline.utils import ( - destinations_configs, - DestinationTestConfiguration, -) TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID = [ @@ -27,16 +24,10 @@ ] -@pytest.mark.parametrize( - "destination_config", - destinations_configs(default_sql_configs=True, subset=["synapse"]), - ids=lambda x: x.name, -) @pytest.mark.parametrize( "table_index_type,column_schema", TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID ) def test_default_table_index_type_configuration( - destination_config: 
DestinationTestConfiguration, table_index_type: TTableIndexType, column_schema: Union[List[TColumnSchema], None], ) -> None: @@ -51,10 +42,13 @@ def test_default_table_index_type_configuration( def items_without_table_index_type_specified() -> Iterator[Any]: yield TABLE_ROW_ALL_DATA_TYPES - pipeline = destination_config.setup_pipeline( - f"test_default_table_index_type_{table_index_type}", + pipeline = dlt.pipeline( + pipeline_name=f"test_default_table_index_type_{table_index_type}", + destination="synapse", + dataset_name=f"test_default_table_index_type_{table_index_type}", full_refresh=True, ) + job_client = pipeline.destination_client() # Assert configuration value gets properly propagated to job client configuration. assert job_client.config.default_table_index_type == table_index_type # type: ignore[attr-defined] @@ -80,13 +74,14 @@ def items_without_table_index_type_specified() -> Iterator[Any]: @dlt.resource( name="items_with_table_index_type_specified", write_disposition="append", - table_index_type="clustered_columnstore_index", columns=column_schema, ) def items_with_table_index_type_specified() -> Iterator[Any]: yield TABLE_ROW_ALL_DATA_TYPES - pipeline.run(items_with_table_index_type_specified) + pipeline.run( + synapse_adapter(items_with_table_index_type_specified, "clustered_columnstore_index") + ) applied_table_index_type = job_client.get_storage_table_index_type( # type: ignore[attr-defined] "items_with_table_index_type_specified" ) @@ -95,35 +90,34 @@ def items_with_table_index_type_specified() -> Iterator[Any]: assert applied_table_index_type == "clustered_columnstore_index" -@pytest.mark.parametrize( - "destination_config", - destinations_configs(default_sql_configs=True, subset=["synapse"]), - ids=lambda x: x.name, -) @pytest.mark.parametrize( "table_index_type,column_schema", TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID ) def test_resource_table_index_type_configuration( - destination_config: DestinationTestConfiguration, table_index_type: TTableIndexType, column_schema: Union[List[TColumnSchema], None], ) -> None: @dlt.resource( name="items_with_table_index_type_specified", write_disposition="append", - table_index_type=table_index_type, columns=column_schema, ) def items_with_table_index_type_specified() -> Iterator[Any]: yield TABLE_ROW_ALL_DATA_TYPES - pipeline = destination_config.setup_pipeline( - f"test_table_index_type_{table_index_type}", + pipeline = dlt.pipeline( + pipeline_name=f"test_table_index_type_{table_index_type}", + destination="synapse", + dataset_name=f"test_table_index_type_{table_index_type}", full_refresh=True, ) + # An invalid value for `table_index_type` should raise a ValueError. + with pytest.raises(ValueError): + pipeline.run(synapse_adapter(items_with_table_index_type_specified, "foo")) # type: ignore[arg-type] + # Run the pipeline and create the tables. - pipeline.run(items_with_table_index_type_specified) + pipeline.run(synapse_adapter(items_with_table_index_type_specified, table_index_type)) # For all tables, assert the applied index type equals the expected index type. # Child tables, if any, inherit the index type of their parent. 
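For reference, a minimal sketch of how the `synapse_adapter` exercised by the tests above is meant to be used from user code; the pipeline and dataset names are illustrative assumptions:

```python
import dlt
from dlt.destinations.impl.synapse import synapse_adapter

@dlt.resource(name="items", write_disposition="append")
def items():
    yield [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]

pipeline = dlt.pipeline(
    pipeline_name="synapse_index_demo",  # illustrative name
    destination="synapse",
    dataset_name="synapse_index_demo",
    full_refresh=True,
)

# the adapter attaches the "x-table-index-type" hint to the resource;
# the Synapse job client reads it when generating CREATE TABLE statements
pipeline.run(synapse_adapter(items, "clustered_columnstore_index"))
```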
From b4cdd36e41af7e13849e255133a2654dde79ac7e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Fri, 26 Jan 2024 22:06:11 +0100 Subject: [PATCH 09/23] moved test function into tests folder and renamed test file --- dlt/destinations/impl/synapse/synapse.py | 18 -------------- ...xing.py => test_synapse_table_indexing.py} | 10 ++++---- tests/load/synapse/utils.py | 24 +++++++++++++++++++ 3 files changed, 30 insertions(+), 22 deletions(-) rename tests/load/synapse/{test_table_indexing.py => test_synapse_table_indexing.py} (91%) create mode 100644 tests/load/synapse/utils.py diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index d34fef1ab4..eb6eae3f20 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -144,24 +144,6 @@ def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema table[TABLE_INDEX_TYPE_HINT] = self.config.default_table_index_type # type: ignore[typeddict-unknown-key] return table - def get_storage_table_index_type(self, table_name: str) -> TTableIndexType: - """Returns table index type of table in storage destination.""" - with self.sql_client as sql_client: - schema_name = sql_client.fully_qualified_dataset_name(escape=False) - sql = dedent(f""" - SELECT - CASE i.type_desc - WHEN 'HEAP' THEN 'heap' - WHEN 'CLUSTERED COLUMNSTORE' THEN 'clustered_columnstore_index' - END AS table_index_type - FROM sys.indexes i - INNER JOIN sys.tables t ON t.object_id = i.object_id - INNER JOIN sys.schemas s ON s.schema_id = t.schema_id - WHERE s.name = '{schema_name}' AND t.name = '{table_name}' - """) - table_index_type = sql_client.execute_sql(sql)[0][0] - return cast(TTableIndexType, table_index_type) - def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: job = super().start_file_load(table, file_path, load_id) if not job: diff --git a/tests/load/synapse/test_table_indexing.py b/tests/load/synapse/test_synapse_table_indexing.py similarity index 91% rename from tests/load/synapse/test_table_indexing.py rename to tests/load/synapse/test_synapse_table_indexing.py index 097bde09f9..af4786af9f 100644 --- a/tests/load/synapse/test_table_indexing.py +++ b/tests/load/synapse/test_synapse_table_indexing.py @@ -12,6 +12,7 @@ from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType from tests.load.utils import TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES +from tests.load.synapse.utils import get_storage_table_index_type TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID = [ @@ -60,7 +61,7 @@ def items_without_table_index_type_specified() -> Iterator[Any]: # Child tables, if any, inherit the index type of their parent. tables = pipeline.default_schema.tables for table_name in tables: - applied_table_index_type = job_client.get_storage_table_index_type(table_name) # type: ignore[attr-defined] + applied_table_index_type = get_storage_table_index_type(job_client.sql_client, table_name) # type: ignore[attr-defined] if table_name in pipeline.default_schema.data_table_names(): # For data tables, the applied table index type should be the default value. 
assert applied_table_index_type == job_client.config.default_table_index_type # type: ignore[attr-defined] @@ -82,8 +83,9 @@ def items_with_table_index_type_specified() -> Iterator[Any]: pipeline.run( synapse_adapter(items_with_table_index_type_specified, "clustered_columnstore_index") ) - applied_table_index_type = job_client.get_storage_table_index_type( # type: ignore[attr-defined] - "items_with_table_index_type_specified" + applied_table_index_type = get_storage_table_index_type( + job_client.sql_client, # type: ignore[attr-defined] + "items_with_table_index_type_specified", ) # While the default is "heap", the applied index type should be "clustered_columnstore_index" # because it was provided as argument to the resource. @@ -124,7 +126,7 @@ def items_with_table_index_type_specified() -> Iterator[Any]: job_client = pipeline.destination_client() tables = pipeline.default_schema.tables for table_name in tables: - applied_table_index_type = job_client.get_storage_table_index_type(table_name) # type: ignore[attr-defined] + applied_table_index_type = get_storage_table_index_type(job_client.sql_client, table_name) # type: ignore[attr-defined] if table_name in pipeline.default_schema.data_table_names(): # For data tables, the applied table index type should be the type # configured in the resource. diff --git a/tests/load/synapse/utils.py b/tests/load/synapse/utils.py new file mode 100644 index 0000000000..cd53716878 --- /dev/null +++ b/tests/load/synapse/utils.py @@ -0,0 +1,24 @@ +from typing import cast +from textwrap import dedent + +from dlt.destinations.impl.synapse.sql_client import SynapseSqlClient +from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType + + +def get_storage_table_index_type(sql_client: SynapseSqlClient, table_name: str) -> TTableIndexType: + """Returns table index type of table in storage destination.""" + with sql_client: + schema_name = sql_client.fully_qualified_dataset_name(escape=False) + sql = dedent(f""" + SELECT + CASE i.type_desc + WHEN 'HEAP' THEN 'heap' + WHEN 'CLUSTERED COLUMNSTORE' THEN 'clustered_columnstore_index' + END AS table_index_type + FROM sys.indexes i + INNER JOIN sys.tables t ON t.object_id = i.object_id + INNER JOIN sys.schemas s ON s.schema_id = t.schema_id + WHERE s.name = '{schema_name}' AND t.name = '{table_name}' + """) + table_index_type = sql_client.execute_sql(sql)[0][0] + return cast(TTableIndexType, table_index_type) From 97f66e28681e53fbfa8fac060f4f85d5cf05b82d Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 27 Jan 2024 10:25:19 +0100 Subject: [PATCH 10/23] ensure test data gets removed --- tests/load/synapse/test_synapse_table_indexing.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/load/synapse/test_synapse_table_indexing.py b/tests/load/synapse/test_synapse_table_indexing.py index af4786af9f..e87b83fa3f 100644 --- a/tests/load/synapse/test_synapse_table_indexing.py +++ b/tests/load/synapse/test_synapse_table_indexing.py @@ -12,6 +12,9 @@ from dlt.destinations.impl.synapse.synapse_adapter import TTableIndexType from tests.load.utils import TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES +from tests.load.pipeline.utils import ( + drop_pipeline, +) # this import ensures all test data gets removed from tests.load.synapse.utils import get_storage_table_index_type From 90685e7105c3b3f7c2a5981359fd6453cdafc721 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sat, 27 Jan 2024 11:30:00 +0100 Subject: [PATCH 11/23] add pyarrow to synapse dependencies for parquet loading --- poetry.lock | 4 ++-- 
pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index 400bcb61e2..6b5625e10a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -8466,10 +8466,10 @@ qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["botocore", "s3fs"] snowflake = ["snowflake-connector-python"] -synapse = ["adlfs", "pyodbc"] +synapse = ["adlfs", "pyarrow", "pyodbc"] weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "75a5f533e9456898ad0157b699d76d9c5a1abf8f4cd04ed7be2235ae3198e16c" +content-hash = "61fa24ff52200b5bf97906a376826f00350abc8f6810fb2fcea73abaf245437f" diff --git a/pyproject.toml b/pyproject.toml index f6ae77b593..fab301ad02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -96,7 +96,7 @@ cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] weaviate = ["weaviate-client"] mssql = ["pyodbc"] -synapse = ["pyodbc", "adlfs"] +synapse = ["pyodbc", "adlfs", "pyarrow"] qdrant = ["qdrant-client"] [tool.poetry.scripts] From 494e45b7b15ca041bdce15e66fcd52df59096b4d Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 28 Jan 2024 00:35:10 +0100 Subject: [PATCH 12/23] added user docs for synapse destination --- dlt/destinations/impl/synapse/README.md | 58 ----- .../docs/dlt-ecosystem/destinations/mssql.md | 14 +- .../dlt-ecosystem/destinations/synapse.md | 208 ++++++++++++++++++ 3 files changed, 214 insertions(+), 66 deletions(-) delete mode 100644 dlt/destinations/impl/synapse/README.md create mode 100644 docs/website/docs/dlt-ecosystem/destinations/synapse.md diff --git a/dlt/destinations/impl/synapse/README.md b/dlt/destinations/impl/synapse/README.md deleted file mode 100644 index b133faf67a..0000000000 --- a/dlt/destinations/impl/synapse/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# Set up loader user -Execute the following SQL statements to set up the [loader](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) user: -```sql --- on master database - -CREATE LOGIN loader WITH PASSWORD = 'YOUR_LOADER_PASSWORD_HERE'; -``` - -```sql --- on minipool database - -CREATE USER loader FOR LOGIN loader; - --- DDL permissions -GRANT CREATE TABLE ON DATABASE :: minipool TO loader; -GRANT CREATE VIEW ON DATABASE :: minipool TO loader; - --- DML permissions -GRANT SELECT ON DATABASE :: minipool TO loader; -GRANT INSERT ON DATABASE :: minipool TO loader; -GRANT ADMINISTER DATABASE BULK OPERATIONS TO loader; -``` - -```sql --- https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-workload-isolation - -CREATE WORKLOAD GROUP DataLoads -WITH ( - MIN_PERCENTAGE_RESOURCE = 0 - ,CAP_PERCENTAGE_RESOURCE = 50 - ,REQUEST_MIN_RESOURCE_GRANT_PERCENT = 25 -); - -CREATE WORKLOAD CLASSIFIER [wgcELTLogin] -WITH ( - WORKLOAD_GROUP = 'DataLoads' - ,MEMBERNAME = 'loader' -); -``` - -# config.toml -```toml -[destination.synapse.credentials] -database = "minipool" -username = "loader" -host = "dlt-synapse-ci.sql.azuresynapse.net" -port = 1433 -driver = "ODBC Driver 18 for SQL Server" - -[destination.synapse] -create_indexes = false -``` - -# secrets.toml -```toml -[destination.synapse.credentials] -password = "YOUR_LOADER_PASSWORD_HERE" -``` \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index d64cf9b400..e98f8bf256 100644 --- 
a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -16,16 +16,14 @@ pip install dlt[mssql] ### Prerequisites -Microsoft ODBC driver for SQL Server must be installed to use this destination. -This can't be included with `dlt`s python dependencies so you must installed it separately on your system. +_Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. +This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). -See instructions here to [install Microsoft ODBC Driver 18 for SQL Server on Windows, Mac and Linux](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16) +Supported driver versions: +* `ODBC Driver 18 for SQL Server` +* `ODBC Driver 17 for SQL Server` -Following ODBC drivers are supported: -* ODBC Driver 18 for SQL Server -* ODBC Driver 17 for SQL Server - -[You can configure driver name explicitly](#additional-destination-options) as well. +You can [configure driver name](#additional-destination-options) explicitly as well. ### Create a pipeline diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md new file mode 100644 index 0000000000..4d66714ce3 --- /dev/null +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -0,0 +1,208 @@ +--- +title: Azure Synapse +description: Azure Synapse `dlt` destination +keywords: [synapse, destination, data warehouse] +--- + +# Synapse + +## Install dlt with Synapse +**To install the DLT library with Synapse dependencies:** +``` +pip install dlt[synapse] +``` + +## Setup guide + +### Prerequisites + +* **Microsoft ODBC Driver for SQL Server** + + _Microsoft ODBC Driver for SQL Server_ must be installed to use this destination. + This can't be included with `dlt`'s python dependencies, so you must install it separately on your system. You can find the official installation instructions [here](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16). + + Supported driver versions: + * `ODBC Driver 18 for SQL Server` + + > 💡 Older driver versions don't properly work, because they don't support the `LongAsMax` keyword that got [introduced](https://learn.microsoft.com/en-us/sql/connect/odbc/windows/features-of-the-microsoft-odbc-driver-for-sql-server-on-windows?view=sql-server-ver15#microsoft-odbc-driver-180-for-sql-server-on-windows) in `ODBC Driver 18 for SQL Server`. Synapse does not support the legacy ["long data types"](https://learn.microsoft.com/en-us/sql/t-sql/data-types/ntext-text-and-image-transact-sql), and requires "max data types" instead. `dlt` uses the `LongAsMax` keyword to automatically do the conversion. +* **Azure Synapse Workspace and dedicated SQL pool** + + You need an Azure Synapse workspace with a dedicated SQL pool to load data into. If you don't have one yet, you can use this [quickstart](https://learn.microsoft.com/en-us/azure/synapse-analytics/quickstart-create-sql-pool-studio). + +### Steps + +**1. Initialize a project with a pipeline that loads to Synapse by running** +``` +dlt init chess synapse +``` + +**2. 
Install the necessary dependencies for Synapse by running** +``` +pip install -r requirements.txt +``` +This will install `dlt` with the **synapse** extra that contains all dependencies required for the Synapse destination. + +**3. Create a loader user** + +Execute the following SQL statements to set up the [loader](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) user. Change the password and replace `yourpool` with the name of your dedicated SQL pool: +```sql +-- on master database, using a SQL admin account + +CREATE LOGIN loader WITH PASSWORD = 'your_loader_password'; +``` + +```sql +-- on yourpool database + +CREATE USER loader FOR LOGIN loader; + +-- DDL permissions +GRANT CREATE SCHEMA ON DATABASE :: yourpool TO loader; +GRANT CREATE TABLE ON DATABASE :: yourpool TO loader; +GRANT CREATE VIEW ON DATABASE :: yourpool TO loader; + +-- DML permissions +GRANT ADMINISTER DATABASE BULK OPERATIONS TO loader; -- only required when loading from staging Storage Account +``` + +Optionally, you can create a `WORKLOAD GROUP` and add the `loader` user as a member to manage [workload isolation](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-workload-isolation). See the [instructions](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql/data-loading-best-practices#create-a-loading-user) on setting up a loader user for an example of how to do this. + +**3. Enter your credentials into `.dlt/secrets.toml`.** + +Example, replace with your database connection info: +```toml +[destination.synapse.credentials] +database = "yourpool" +username = "loader" +password = "your_loader_password" +host = "your_synapse_workspace_name.sql.azuresynapse.net" +``` + +Equivalently, you can also pass a connection string as follows: + +```toml +# keep it at the top of your toml file! before any section starts +destination.synapse.credentials = "synapse://loader:your_loader_password@your_synapse_workspace_name.azuresynapse.net/yourpool" +``` + +To pass credentials directly you can use the `credentials` argument of `dlt.destinations.synapse(...)`: +```python +pipeline = dlt.pipeline( + pipeline_name='chess', + destination=dlt.destinations.synapse( + credentials='synapse://loader:your_loader_password@your_synapse_workspace_name.azuresynapse.net/yourpool' + ), + dataset_name='chess_data' +) +``` + +## Write disposition +All write dispositions are supported + +If you set the [`replace` strategy](../../general-usage/full-loading.md) to `staging-optimized`, the destination tables will be dropped and replaced by the staging tables with an `ALTER SCHEMA ... TRANSFER` command. Please note that this operation is **not** atomic—it involves multiple DDL commands and Synapse does not support DDL transactions. + +## Data loading +Data is loaded via `INSERT` statements by default. + +> 💡 Multi-row `INSERT INTO ... VALUES` statements are **not** possible in Synapse, because it doesn't support the [Table Value Constructor](https://learn.microsoft.com/en-us/sql/t-sql/queries/table-value-constructor-transact-sql). `dlt` uses `INSERT INTO ... SELECT ... UNION` statements as described [here](https://stackoverflow.com/a/73579830) to work around this limitation. 
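As a rough illustration of that workaround (table name and values are made up; the real statements are produced by `dlt`'s insert-values writer and may differ in detail), the load file is rendered as a chain of single-row `SELECT`s instead of a multi-row `VALUES` clause:

```python
# hypothetical rows to be inserted into a table "player"
rows = [(1, "anna"), (2, "bob")]

# join single-row SELECTs with UNION ALL instead of using VALUES (...), (...)
selects = "\nUNION ALL\n".join(f"SELECT {rid}, '{name}'" for rid, name in rows)
statement = f"INSERT INTO player(id, name)\n{selects};"
print(statement)
# INSERT INTO player(id, name)
# SELECT 1, 'anna'
# UNION ALL
# SELECT 2, 'bob';
```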
+ +## Supported file formats +* [insert-values](../file-formats/insert-format.md) is used by default +* [parquet](../file-formats/parquet.md) is used when [staging](#staging-support) is enabled + +## Data type limitations +* **Synapse cannot load `TIME` columns from `parquet` files**. `dlt` will fail such jobs permanently. Use the `insert_values` file format instead, or convert `datetime.time` objects to `str` or `datetime.datetime`, to load `TIME` columns. +* **Synapse does not have a complex/JSON/struct data type**. The `dlt` `complex` data type is mapped to the `nvarchar` type in Synapse. + +## Table index type +The [table index type](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index) of the created tables can be configured at the resource level with the `synapse_adapter`: + +```python +info = pipeline.run( + synapse_adapter( + data=your_resource, + table_index_type="clustered_columnstore_index", + ) +) +``` + +Possible values: +* `heap`: create [HEAP](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables) tables that do not have an index **(default)** +* `clustered_columnstore_index`: create [CLUSTERED COLUMNSTORE INDEX](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#clustered-columnstore-indexes) tables + + +> ❗ Important: +>* **Set `default_table_index_type` to `"clustered_columnstore_index"` if you want to change the default** (see [additional destination options](#additional-destination-options)). +>* **CLUSTERED COLUMNSTORE INDEX tables do not support the `varchar(max)`, `nvarchar(max)`, and `varbinary(max)` data types.** If you don't specify the `precision` for columns that map to any of these types, `dlt` will use the maximum lengths `varchar(4000)`, `nvarchar(4000)`, and `varbinary(8000)`. +>* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice, because it supports all data types and doesn't require conversions. +>* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense + for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables). +>* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance." +>* Child tables, if any, inherit the table index type of their parent table. + +## Supported column hints + +Synapse supports the following [column hints](https://dlthub.com/docs/general-usage/schema#tables-and-columns): + +* `primary_key` - creates a `PRIMARY KEY NONCLUSTERED NOT ENFORCED` constraint on the column +* `unique` - creates a `UNIQUE NOT ENFORCED` constraint on the column + +> ❗ These hints are **disabled by default**.
This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to inaccurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options). + +## Load concurrency issue +`dlt` uses threading to enable concurrent processing and [parallel loading](../../reference/performance.md#load). Concurrency does not work properly in all cases when using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), because Synapse suspends the CTAS queries that `dlt` uses behind the scenes and gets stuck. To prevent this from happening, `dlt` automatically sets the number of load workers to 1 to disable concurrency when replacing data using the `staging-optimized` strategy. Set `auto_disable_concurrency = "false"` if you don't want this to happen (see [additional destination options](#additional-destination-options)) + +## Staging support +Synapse supports Azure Blob Storage (both standard and [ADLS Gen2](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)) as a file staging destination. `dlt` first uploads Parquet files to the blob container, and then instructs Synapse to read the Parquet file and load its data into a Synapse table using the [COPY INTO](https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql) statement. + +Please refer to the [Azure Blob Storage filesystem documentation](./filesystem.md#azure-blob-storage) to learn how to configure credentials for the staging destination. By default, `dlt` will use these credentials for both the write into the blob container, and the read from it to load into Synapse. Managed Identity authentication can be enabled through the `staging_use_msi` option (see [additional destination options](#additional-destination-options)). + +To run Synapse with staging on Azure Blob Storage: + +```python +# Create a dlt pipeline that will load +# chess player data to the synapse destination +# via staging on Azure Blob Storage +pipeline = dlt.pipeline( + pipeline_name='chess_pipeline', + destination='synapse', + staging='filesystem', # add this to activate the staging location + dataset_name='player_data' +) +``` + +## Additional destination options +The following settings can optionally be configured: +```toml +[destination.synapse] +default_table_index_type = "heap" +create_indexes = "false" +auto_disable_concurrency = "true" +staging_use_msi = "false" + +[destination.synapse.credentials] +port = "1433" +connect_timeout = 15 +``` + +`port` and `connect_timeout` can also be included in the connection string: + +```toml +# keep it at the top of your toml file! before any section starts +destination.synapse.credentials = "synapse://loader:your_loader_password@your_synapse_workspace_name.azuresynapse.net:1433/yourpool?connect_timeout=15" +``` + +Descriptions: +- `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource. +- `create_indexes` determines if `primary_key` and `unique` [column hints](#supported-column-hints) are applied. +- `auto_disable_concurrency` determines if concurrency is automatically disabled in cases where it might cause issues.
+- `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-privileged role) assigned on the blob container if you set this option to `"true"`. +- `port` used for the ODBC connection. +- `connect_timeout` sets the timeout for the `pyodbc` connection attempt, in seconds. + +### dbt support +Integration with [dbt](../transformations/dbt/dbt.md) is currently not supported. + +### Syncing of `dlt` state +This destination fully supports [dlt state sync](../../general-usage/state#syncing-state-with-destination). + From e8c6b1dcf08cfe03c469526c5f98c8d1159ad539 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Sun, 28 Jan 2024 13:22:44 +0100 Subject: [PATCH 13/23] refactor dbt test skipping to prevent unnecessary venv creation --- .github/workflows/test_destination_mssql.yml | 4 ++-- tests/load/pipeline/test_dbt_helper.py | 20 +++++++++----------- tests/load/utils.py | 11 ++++++++--- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index b8ea1db2d4..d1da25c067 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -71,11 +71,11 @@ jobs: run: pwd && echo "$DLT_SECRETS_TOML" > tests/.dlt/secrets.toml - run: | - poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os != 'Windows' name: Run tests Linux/MAC - run: | - poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py + poetry run pytest tests/load if: runner.os == 'Windows' name: Run tests Windows shell: cmd diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index e919409311..91318d0f34 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -28,7 +28,9 @@ def dbt_venv() -> Iterator[Venv]: @pytest.mark.parametrize( - "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name + "destination_config", + destinations_configs(default_sql_configs=True, supports_dbt=True), + ids=lambda x: x.name, ) def test_run_jaffle_package( destination_config: DestinationTestConfiguration, dbt_venv: Venv ) -> None: if destination_config.destination == "athena": pytest.skip( "dbt-athena requires database to be created and we don't do it in case of Jaffle" ) - if not destination_config.supports_dbt: - pytest.skip("dbt is not supported for this destination configuration") pipeline = destination_config.setup_pipeline("jaffle_jaffle", full_refresh=True) # get runner, pass the env from fixture dbt = dlt.dbt.package(pipeline, "https://github.com/dbt-labs/jaffle_shop.git", venv=dbt_venv) @@ -65,14 +65,13 @@ def test_run_jaffle_package( @pytest.mark.parametrize( - "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name + "destination_config", + destinations_configs(default_sql_configs=True, supports_dbt=True), + ids=lambda x: x.name, ) def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: from docs.examples.chess.chess import chess - if not destination_config.supports_dbt: - pytest.skip("dbt is not supported for this destination configuration") - # provide
chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" @@ -117,16 +116,15 @@ def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_ven @pytest.mark.parametrize( - "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name + "destination_config", + destinations_configs(default_sql_configs=True, supports_dbt=True), + ids=lambda x: x.name, ) def test_run_chess_dbt_to_other_dataset( destination_config: DestinationTestConfiguration, dbt_venv: Venv ) -> None: from docs.examples.chess.chess import chess - if not destination_config.supports_dbt: - pytest.skip("dbt is not supported for this destination configuration") - # provide chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" diff --git a/tests/load/utils.py b/tests/load/utils.py index 207e32209f..5fb706985d 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -152,6 +152,7 @@ def destinations_configs( subset: Sequence[str] = (), exclude: Sequence[str] = (), file_format: Optional[TLoaderFileFormat] = None, + supports_dbt: Optional[bool] = None, ) -> List[DestinationTestConfiguration]: # sanity check for item in subset: @@ -165,7 +166,7 @@ def destinations_configs( destination_configs += [ DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS - if destination not in ("athena", "synapse") + if destination not in ("athena", "mssql", "synapse") ] destination_configs += [ DestinationTestConfiguration(destination="duckdb", file_format="parquet") @@ -192,9 +193,9 @@ def destinations_configs( extra_info="iceberg", ) ] - # dbt for Synapse has some complications and I couldn't get it to pass all tests. destination_configs += [ - DestinationTestConfiguration(destination="synapse", supports_dbt=False) + DestinationTestConfiguration(destination="mssql", supports_dbt=False), + DestinationTestConfiguration(destination="synapse", supports_dbt=False), ] if default_vector_configs: @@ -347,6 +348,10 @@ def destinations_configs( destination_configs = [ conf for conf in destination_configs if conf.file_format == file_format ] + if supports_dbt is not None: + destination_configs = [ + conf for conf in destination_configs if conf.supports_dbt == supports_dbt + ] # filter out excluded configs destination_configs = [ From e1e9bb38c48df79b8467ef68ae9f93a781c301b1 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 29 Jan 2024 00:27:38 +0100 Subject: [PATCH 14/23] replace CTAS with CREATE TABLE to eliminate concurrency issues --- .../impl/synapse/configuration.py | 49 ------------------- dlt/destinations/impl/synapse/factory.py | 5 +- dlt/destinations/impl/synapse/synapse.py | 20 ++++---- dlt/pipeline/pipeline.py | 4 -- .../load/pipeline/test_replace_disposition.py | 4 -- 5 files changed, 12 insertions(+), 70 deletions(-) diff --git a/dlt/destinations/impl/synapse/configuration.py b/dlt/destinations/impl/synapse/configuration.py index cc0e40114b..bb1ba632dc 100644 --- a/dlt/destinations/impl/synapse/configuration.py +++ b/dlt/destinations/impl/synapse/configuration.py @@ -53,60 +53,11 @@ class SynapseClientConfiguration(MsSqlClientConfiguration): create_indexes: bool = False """Whether `primary_key` and `unique` column hints are applied.""" - # Concurrency is disabled by overriding the configured number of workers to 1 at runtime. 
- auto_disable_concurrency: bool = True - """Whether concurrency is automatically disabled in cases where it might cause issues.""" - staging_use_msi: bool = False """Whether the managed identity of the Synapse workspace is used to authorize access to the staging Storage Account.""" __config_gen_annotations__: ClassVar[List[str]] = [ "default_table_index_type", "create_indexes", - "auto_disable_concurrency", "staging_use_msi", ] - - def get_load_workers(self, tables: TSchemaTables, workers: int) -> int: - """Returns the adjusted number of load workers to prevent concurrency issues.""" - - write_dispositions = [get_write_disposition(tables, table_name) for table_name in tables] - n_replace_dispositions = len([d for d in write_dispositions if d == "replace"]) - if ( - n_replace_dispositions > 1 - and self.replace_strategy == "staging-optimized" - and workers > 1 - ): - warning_msg_shared = ( - 'Data is being loaded into Synapse with write disposition "replace"' - ' and replace strategy "staging-optimized", while the number of' - f" load workers ({workers}) > 1. This configuration is problematic" - " in some cases, because Synapse does not always handle concurrency well" - " with the CTAS queries that are used behind the scenes to implement" - ' the "staging-optimized" strategy.' - ) - if self.auto_disable_concurrency: - logger.warning( - warning_msg_shared - + " The number of load workers will be automatically adjusted" - " and set to 1 to eliminate concurrency and prevent potential" - " issues. If you don't want this to happen, set the" - " DESTINATION__SYNAPSE__AUTO_DISABLE_CONCURRENCY environment" - ' variable to "false", or add the following to your config TOML:' - "\n\n[destination.synapse]\nauto_disable_concurrency = false\n" - ) - workers = 1 # adjust workers - else: - logger.warning( - warning_msg_shared - + " If you experience your pipeline gets stuck and doesn't finish," - " try reducing the number of load workers by exporting the LOAD__WORKERS" - " environment variable or by setting it in your config TOML:" - "\n\n[load]\nworkers = 1 # a value of 1 disables all concurrency," - " but perhaps a higher value also works\n\n" - "Alternatively, you can set the DESTINATION__SYNAPSE__AUTO_DISABLE_CONCURRENCY" - ' environment variable to "true", or add the following to your config TOML' - " to automatically disable concurrency where needed:" - "\n\n[destination.synapse]\nauto_disable_concurrency = true\n" - ) - return workers diff --git a/dlt/destinations/impl/synapse/factory.py b/dlt/destinations/impl/synapse/factory.py index 0ac58001ca..b7eddd6ef7 100644 --- a/dlt/destinations/impl/synapse/factory.py +++ b/dlt/destinations/impl/synapse/factory.py @@ -30,7 +30,6 @@ def __init__( credentials: t.Union[SynapseCredentials, t.Dict[str, t.Any], str] = None, default_table_index_type: t.Optional[TTableIndexType] = "heap", create_indexes: bool = False, - auto_disable_concurrency: bool = True, staging_use_msi: bool = False, destination_name: t.Optional[str] = None, environment: t.Optional[str] = None, @@ -45,15 +44,13 @@ def __init__( a connection string in the format `synapse://user:password@host:port/database` default_table_index_type: Maps directly to the default_table_index_type attribute of the SynapseClientConfiguration object. create_indexes: Maps directly to the create_indexes attribute of the SynapseClientConfiguration object. - auto_disable_concurrency: Maps directly to the auto_disable_concurrency attribute of the SynapseClientConfiguration object. 
- auto_disable_concurrency: Maps directly to the staging_use_msi attribute of the SynapseClientConfiguration object. + staging_use_msi: Maps directly to the staging_use_msi attribute of the SynapseClientConfiguration object. **kwargs: Additional arguments passed to the destination config """ super().__init__( credentials=credentials, default_table_index_type=default_table_index_type, create_indexes=create_indexes, - auto_disable_concurrency=auto_disable_concurrency, staging_use_msi=staging_use_msi, destination_name=destination_name, environment=environment, diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index eb6eae3f20..268ffad933 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -4,6 +4,8 @@ from textwrap import dedent from urllib.parse import urlparse, urlunparse +from dlt import current + from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( SupportsStagingDestination, @@ -181,15 +183,15 @@ def generate_sql( f" {staging_table_name};" ) # recreate staging table - # In some cases, when multiple instances of this CTAS query are - # executed concurrently, Synapse suspends the queries and hangs. - # This can be prevented by setting the env var LOAD__WORKERS = "1". - sql.append( - f"CREATE TABLE {staging_table_name}" - " WITH ( DISTRIBUTION = ROUND_ROBIN, HEAP )" # distribution must be explicitly specified with CTAS - f" AS SELECT * FROM {table_name}" - " WHERE 1 = 0;" # no data, table structure only - ) + job_client = current.pipeline().destination_client() # type: ignore[operator] + with job_client.with_staging_dataset(): + # get table columns from schema + columns = [c for c in job_client.schema.get_table_columns(table["name"]).values()] + # generate CREATE TABLE statement + create_table_stmt = job_client._get_table_update_sql( + table["name"], columns, generate_alter=False + ) + sql.extend(create_table_stmt) return sql diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 3a0a8f3931..73c8f076d1 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -483,10 +483,6 @@ def load( # make sure that destination is set and client is importable and can be instantiated client, staging_client = self._get_destination_clients(self.default_schema) - # for synapse we might need to adjust the number of load workers - if self.destination.destination_name == "synapse": - workers = client.config.get_load_workers(self.default_schema.tables, workers) # type: ignore[attr-defined] - # create default loader config and the loader load_config = LoaderConfiguration( workers=workers, diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index 65d3646f2d..c6db91efff 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -268,10 +268,6 @@ def test_replace_table_clearing( "test_replace_table_clearing", dataset_name="test_replace_table_clearing", full_refresh=True ) - if destination_config.destination == "synapse" and replace_strategy == "staging-optimized": - # this case requires load concurrency to be disabled (else the test gets stuck) - assert pipeline.destination_client().config.auto_disable_concurrency is True # type: ignore[attr-defined] - @dlt.resource(name="main_resource", write_disposition="replace", primary_key="id") def items_with_subitems(): data = { From 99a0718c74dadf51e4ff95db6d02b83cb5d64797 
Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 29 Jan 2024 00:30:30 +0100 Subject: [PATCH 15/23] change test config type to reduce unnecessary tests --- tests/load/utils.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/load/utils.py b/tests/load/utils.py index 5fb706985d..ea4e2916cc 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -264,14 +264,6 @@ def destinations_configs( bucket_url=AZ_BUCKET, extra_info="az-authorization", ), - DestinationTestConfiguration( - destination="synapse", - staging="filesystem", - file_format="parquet", - bucket_url=AZ_BUCKET, - staging_use_msi=True, - extra_info="az-managed-identity", - ), ] if all_staging_configs: @@ -304,6 +296,14 @@ def destinations_configs( bucket_url=GCS_BUCKET, extra_info="gcs-authorization", ), + DestinationTestConfiguration( + destination="synapse", + staging="filesystem", + file_format="parquet", + bucket_url=AZ_BUCKET, + staging_use_msi=True, + extra_info="az-managed-identity", + ), ] # add local filesystem destinations if requested From 6d14d576a1c3c56e2d72646678cf2655eb929f07 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 29 Jan 2024 00:48:34 +0100 Subject: [PATCH 16/23] remove trailing whitespace --- tests/load/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/load/utils.py b/tests/load/utils.py index ea4e2916cc..805925ec6a 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -303,7 +303,7 @@ def destinations_configs( bucket_url=AZ_BUCKET, staging_use_msi=True, extra_info="az-managed-identity", - ), + ), ] # add local filesystem destinations if requested From b87dd1b744bc4c9fe3f2b6ac1cbea08c58296eb5 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 29 Jan 2024 16:08:22 +0100 Subject: [PATCH 17/23] refine staging table indexing --- dlt/destinations/impl/synapse/synapse.py | 32 +++++++++++++++---- .../dlt-ecosystem/destinations/synapse.md | 3 +- .../synapse/test_synapse_table_indexing.py | 11 ++++++- 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/dlt/destinations/impl/synapse/synapse.py b/dlt/destinations/impl/synapse/synapse.py index 268ffad933..33e6194602 100644 --- a/dlt/destinations/impl/synapse/synapse.py +++ b/dlt/destinations/impl/synapse/synapse.py @@ -70,12 +70,22 @@ def __init__(self, schema: Schema, config: SynapseClientConfiguration) -> None: def _get_table_update_sql( self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool ) -> List[str]: - table = self.get_load_table(table_name) + table = self.get_load_table(table_name, staging=self.in_staging_mode) if table is None: table_index_type = self.config.default_table_index_type else: table_index_type = cast(TTableIndexType, table.get(TABLE_INDEX_TYPE_HINT)) - if table_index_type == "clustered_columnstore_index": + if self.in_staging_mode: + final_table = self.get_load_table(table_name, staging=False) + final_table_index_type = cast( + TTableIndexType, final_table.get(TABLE_INDEX_TYPE_HINT) + ) + else: + final_table_index_type = table_index_type + if final_table_index_type == "clustered_columnstore_index": + # Even if the staging table has index type "heap", we still adjust + # the column data types to prevent errors when writing into the + # final table that has index type "clustered_columnstore_index". 
new_columns = self._get_columstore_valid_columns(new_columns) _sql_result = SqlJobClientBase._get_table_update_sql( @@ -129,12 +139,20 @@ def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema table = super().get_load_table(table_name, staging) if table is None: return None - if table_name in self.schema.dlt_table_names(): - # dlt tables should always be heap tables, regardless of the user - # configuration. Why? "For small lookup tables, less than 60 million rows, - # consider using HEAP or clustered index for faster query performance." - # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables + if staging and self.config.replace_strategy == "insert-from-staging": + # Staging tables should always be heap tables, because "when you are + # temporarily landing data in dedicated SQL pool, you may find that + # using a heap table makes the overall process faster." + # "staging-optimized" is not included, because in that strategy the + # staging table becomes the final table, so we should already create + # it with the desired index type. + table[TABLE_INDEX_TYPE_HINT] = "heap" # type: ignore[typeddict-unknown-key] + elif table_name in self.schema.dlt_table_names(): + # dlt tables should always be heap tables, because "for small lookup + # tables, less than 60 million rows, consider using HEAP or clustered + # index for faster query performance." table[TABLE_INDEX_TYPE_HINT] = "heap" # type: ignore[typeddict-unknown-key] + # https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables elif table_name in self.schema.data_table_names(): if TABLE_INDEX_TYPE_HINT not in table: # If present in parent table, fetch hint from there. diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index 4d66714ce3..dcfd92b9fb 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -135,8 +135,9 @@ Possible values: >* **Set `default_table_index_type` to `"clustered_columnstore_index"` if you want to change the default** (see [additional destination options](#additional-destination-options)). >* **CLUSTERED COLUMNSTORE INDEX tables do not support the `varchar(max)`, `nvarchar(max)`, and `varbinary(max)` data types.** If you don't specify the `precision` for columns that map to any of these types, `dlt` will use the maximum lengths `varchar(4000)`, `nvarchar(4000)`, and `varbinary(8000)`. >* **While Synapse creates CLUSTERED COLUMNSTORE INDEXES by default, `dlt` creates HEAP tables by default.** HEAP is a more robust choice, because it supports all data types and doesn't require conversions. ->* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense +>* **When using the `insert-from-staging` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are always created as HEAP tables**—any configuration of the table index types is ignored. The HEAP strategy makes sense for staging tables for reasons explained [here](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-tables-index#heap-tables). 
+>* **When using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), the staging tables are already created with the configured table index type**, because the staging table becomes the final table. >* **`dlt` system tables are always created as HEAP tables, regardless of any configuration.** This is in line with Microsoft's recommendation that "for small lookup tables, less than 60 million rows, consider using HEAP or clustered index for faster query performance." >* Child tables, if any, inherent the table index type of their parent table. diff --git a/tests/load/synapse/test_synapse_table_indexing.py b/tests/load/synapse/test_synapse_table_indexing.py index e87b83fa3f..df90933de4 100644 --- a/tests/load/synapse/test_synapse_table_indexing.py +++ b/tests/load/synapse/test_synapse_table_indexing.py @@ -98,13 +98,22 @@ def items_with_table_index_type_specified() -> Iterator[Any]: @pytest.mark.parametrize( "table_index_type,column_schema", TABLE_INDEX_TYPE_COLUMN_SCHEMA_PARAM_GRID ) +@pytest.mark.parametrize( + # Also test staging replace strategies, to make sure the final table index + # type is not affected by staging table index type adjustments. + "replace_strategy", + ["insert-from-staging", "staging-optimized"], +) def test_resource_table_index_type_configuration( table_index_type: TTableIndexType, column_schema: Union[List[TColumnSchema], None], + replace_strategy: str, ) -> None: + os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy + @dlt.resource( name="items_with_table_index_type_specified", - write_disposition="append", + write_disposition="replace", columns=column_schema, ) def items_with_table_index_type_specified() -> Iterator[Any]: From 1c817bddb475385205b3f332364199c76292c2c8 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Tue, 30 Jan 2024 15:09:35 +0100 Subject: [PATCH 18/23] use generic statement to prevent repeating info --- docs/website/docs/general-usage/full-loading.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/website/docs/general-usage/full-loading.md b/docs/website/docs/general-usage/full-loading.md index 92fdf064fd..4651d156f0 100644 --- a/docs/website/docs/general-usage/full-loading.md +++ b/docs/website/docs/general-usage/full-loading.md @@ -67,6 +67,4 @@ opportunities, you should use this strategy. The `staging-optimized` strategy be recreated with a [clone command](https://docs.snowflake.com/en/sql-reference/sql/create-clone) from the staging tables. This is a low cost and fast way to create a second independent table from the data of another. Learn more about [table cloning on snowflake](https://docs.snowflake.com/en/user-guide/object-clone). -For all other destinations the `staging-optimized` will fall back to the behavior of the `insert-from-staging` strategy. - - +For all other [destinations](../dlt-ecosystem/destinations/index.md), please look at their respective documentation pages to see if and how the `staging-optimized` strategy is implemented. If it is not implemented, `dlt` will fall back to the `insert-from-staging` strategy. 
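To tie the replace strategy and the indexing options together, a small sketch of how they could be set from Python before running a pipeline, mirroring the environment-variable pattern used in the test above (the concrete values and names are illustrative):

```python
import os
import dlt

# how write_disposition="replace" is executed (see the full loading docs)
os.environ["DESTINATION__REPLACE_STRATEGY"] = "staging-optimized"
# default index type for tables that carry no explicit "x-table-index-type" hint
os.environ["DESTINATION__SYNAPSE__DEFAULT_TABLE_INDEX_TYPE"] = "clustered_columnstore_index"

pipeline = dlt.pipeline(
    pipeline_name="replace_strategy_demo",  # illustrative name
    destination="synapse",
    dataset_name="replace_strategy_demo",
)
```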
From 2dd979eb8d6a79334c5e624c4d68a73e86dd5d5d Mon Sep 17 00:00:00 2001
From: Jorrit Sandbrink
Date: Thu, 1 Feb 2024 20:13:09 +0100
Subject: [PATCH 19/23] remove outdated documentation

---
 docs/website/docs/dlt-ecosystem/destinations/synapse.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md
index dcfd92b9fb..8c1a7b29bc 100644
--- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md
+++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md
@@ -150,9 +150,6 @@ Synapse supports the following [column hints](https://dlthub.com/docs/general-us

 > ❗ These hints are **disabled by default**. This is because the `PRIMARY KEY` and `UNIQUE` [constraints](https://learn.microsoft.com/en-us/azure/synapse-analytics/sql-data-warehouse/sql-data-warehouse-table-constraints) are tricky in Synapse: they are **not enforced** and can lead to inaccurate results if the user does not ensure all column values are unique. For the column hints to take effect, the `create_indexes` configuration needs to be set to `True`, see [additional destination options](#additional-destination-options).

-## Load concurrency issue
-`dlt` uses threading to enable concurrent processing and [parallel loading](../../reference/performance.md#load). Concurrency does not work properly in all cases when using the `staging-optimized` [`replace` strategy](../../general-usage/full-loading.md), because Synapse suspends the CTAS queries that `dlt` uses behind the scenes and gets stuck. To prevent this from happening, `dlt` automatically sets the number of load workers to 1 to disable concurrency when replacing data using the `staging-optimized` strategy. Set `auto_disable_concurrency = "false"` if you don't want this to happen (see [additional destination options](#additional-destination-options))
-
 ## Staging support
 Synapse supports Azure Blob Storage (both standard and [ADLS Gen2](https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)) as a file staging destination. `dlt` first uploads Parquet files to the blob container, and then instructs Synapse to read the Parquet file and load its data into a Synapse table using the [COPY INTO](https://learn.microsoft.com/en-us/sql/t-sql/statements/copy-into-transact-sql) statement.

@@ -178,7 +175,6 @@ The following settings can optionally be configured:
 [destination.synapse]
 default_table_index_type = "heap"
 create_indexes = "false"
-auto_disable_concurrency = "true"
 staging_use_msi = "false"

 [destination.synapse.credentials]
@@ -196,7 +192,6 @@ destination.synapse.credentials = "synapse://loader:your_loader_password@your_sy
 Descriptions:
 - `default_table_index_type` sets the [table index type](#table-index-type) that is used if no table index type is specified on the resource.
 - `create_indexes` determines if `primary_key` and `unique` [column hints](#supported-column-hints) are applied.
-- `auto_disable_concurrency` determines if concurrency is automatically disabled in cases where it might cause issues.
 - `staging_use_msi` determines if the Managed Identity of the Synapse workspace is used to authorize access to the [staging](#staging-support) Storage Account. Ensure the Managed Identity has the [Storage Blob Data Reader](https://learn.microsoft.com/en-us/azure/role-based-access-control/built-in-roles#storage-blob-data-reader) role (or a higher-privileged role) assigned on the blob container if you set this option to `"true"`.
- `port` used for the ODBC connection. - `connect_timeout` sets the timeout for the `pyodbc` connection attempt, in seconds. From da5cdac7f8cce1c59a36322c2a3bdc8735591f6e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 1 Feb 2024 20:23:14 +0100 Subject: [PATCH 20/23] add synapse destination to sidebar --- docs/website/sidebars.js | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 2c9b55e6da..f92f43564a 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -87,6 +87,7 @@ const sidebars = { 'dlt-ecosystem/destinations/bigquery', 'dlt-ecosystem/destinations/duckdb', 'dlt-ecosystem/destinations/mssql', + 'dlt-ecosystem/destinations/synapse', 'dlt-ecosystem/destinations/filesystem', 'dlt-ecosystem/destinations/postgres', 'dlt-ecosystem/destinations/redshift', From d7d9e35cf49691b2e877d9e4b905b9db3e77de67 Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Thu, 1 Feb 2024 23:17:53 +0100 Subject: [PATCH 21/23] add support for additional table hints --- dlt/destinations/impl/synapse/synapse_adapter.py | 6 ++++-- dlt/extract/hints.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/dlt/destinations/impl/synapse/synapse_adapter.py b/dlt/destinations/impl/synapse/synapse_adapter.py index f135dd967a..24932736f9 100644 --- a/dlt/destinations/impl/synapse/synapse_adapter.py +++ b/dlt/destinations/impl/synapse/synapse_adapter.py @@ -1,4 +1,4 @@ -from typing import Any, Literal, Set, get_args, Final +from typing import Any, Literal, Set, get_args, Final, Dict from dlt.extract import DltResource, resource as make_resource from dlt.extract.typing import TTableHintTemplate @@ -39,6 +39,7 @@ def synapse_adapter(data: Any, table_index_type: TTableIndexType = None) -> DltR """ resource = ensure_resource(data) + additional_table_hints: Dict[str, TTableHintTemplate[Any]] = {} if table_index_type is not None: if table_index_type not in TABLE_INDEX_TYPES: allowed_types = ", ".join(TABLE_INDEX_TYPES) @@ -46,5 +47,6 @@ def synapse_adapter(data: Any, table_index_type: TTableIndexType = None) -> DltR f"Table index type {table_index_type} is invalid. Allowed table index" f" types are: {allowed_types}." ) - resource._hints[TABLE_INDEX_TYPE_HINT] = table_index_type # type: ignore[typeddict-unknown-key] + additional_table_hints[TABLE_INDEX_TYPE_HINT] = table_index_type + resource.apply_hints(additional_table_hints=additional_table_hints) return resource diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 437dbbc6bd..e483f035fc 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -1,5 +1,5 @@ from copy import copy, deepcopy -from typing import List, TypedDict, cast, Any +from typing import List, TypedDict, cast, Any, Optional, Dict from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table from dlt.common.schema.typing import ( @@ -125,6 +125,7 @@ def apply_hints( merge_key: TTableHintTemplate[TColumnNames] = None, incremental: Incremental[Any] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, + additional_table_hints: Optional[Dict[str, TTableHintTemplate[Any]]] = None, ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. 
@@ -208,6 +209,14 @@ def apply_hints( t["incremental"] = None else: t["incremental"] = incremental + if additional_table_hints is not None: + # loop through provided hints and add, overwrite, or remove them + for k, v in additional_table_hints.items(): + if v: + t[k] = v # type: ignore[literal-required] + else: + t.pop(k, None) # type: ignore[misc] + self.set_hints(t) def set_hints(self, hints_template: TResourceHints) -> None: From bab216d8e09bc2012cceb419ed48ccfe9ac0d0ef Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 5 Feb 2024 14:45:39 +0100 Subject: [PATCH 22/23] correct content-hash after merge conflict resolution --- poetry.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index 5ea4d19f2b..915152e0c2 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "about-time" @@ -8749,4 +8749,4 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "61fa24ff52200b5bf97906a376826f00350abc8f6810fb2fcea73abaf245437f" +content-hash = "7b829a75b59316147385e16456395bebf2155e68cdeac3f9fa70523c3c33924a" From c3efe33d8c71469de77c54e6b4ec44758185da2e Mon Sep 17 00:00:00 2001 From: Jorrit Sandbrink Date: Mon, 5 Feb 2024 14:47:14 +0100 Subject: [PATCH 23/23] only remove hint if it is None, not if it is empty --- dlt/extract/hints.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index e483f035fc..c1a39041d8 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -212,7 +212,7 @@ def apply_hints( if additional_table_hints is not None: # loop through provided hints and add, overwrite, or remove them for k, v in additional_table_hints.items(): - if v: + if v is not None: t[k] = v # type: ignore[literal-required] else: t.pop(k, None) # type: ignore[misc]
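The effect of the fix above (`if v is not None` instead of `if v`) is easiest to see with a small sketch of `apply_hints` using the new `additional_table_hints` argument. The hint key below is purely illustrative; destination adapters such as `synapse_adapter` use the same mechanism with their own reserved keys.

```python
from typing import Any, Dict, Iterator

import dlt


@dlt.resource(name="docs")  # illustrative resource
def docs() -> Iterator[Dict[str, Any]]:
    yield {"id": 1}


r = docs()

# Add or overwrite a custom hint on the resource's table template.
r.apply_hints(additional_table_hints={"x-example-hint": "some-value"})

# After the fix, falsy-but-not-None values such as False or "" are kept ...
r.apply_hints(additional_table_hints={"x-example-hint": False})

# ... whereas None removes the hint from the table template again.
r.apply_hints(additional_table_hints={"x-example-hint": None})
```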
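Tying the adapter from patch 21 back to the table index types documented earlier, a usage sketch follows. Again, this is an illustration under assumptions rather than part of the patches: the resource and dataset names are invented, and the import path simply mirrors the module added in this series (a released version may also re-export the adapter elsewhere).

```python
from typing import Any, Dict, Iterator

import dlt
from dlt.destinations.impl.synapse.synapse_adapter import synapse_adapter


@dlt.resource(name="events")  # illustrative resource
def events() -> Iterator[Dict[str, Any]]:
    yield {"id": 1, "payload": "a"}


# synapse_adapter() validates the value against TABLE_INDEX_TYPES and stores it
# via resource.apply_hints(additional_table_hints={TABLE_INDEX_TYPE_HINT: ...}).
resource = synapse_adapter(events(), table_index_type="clustered_columnstore_index")

pipeline = dlt.pipeline(destination="synapse", dataset_name="events_dataset")  # invented dataset name
pipeline.run(resource)
```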