diff --git a/README.md b/README.md index 5cb681c570..09bfab0b12 100644 --- a/README.md +++ b/README.md @@ -113,4 +113,4 @@ The dlt project is quickly growing, and we're excited to have you join our commu ## License -DLT is released under the [Apache 2.0 License](LICENSE.txt). +`dlt` is released under the [Apache 2.0 License](LICENSE.txt). diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index 2332c0286c..af4f2f66e9 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -5,7 +5,7 @@ import click from dlt.version import __version__ -from dlt.common import json +from dlt.common.json import json from dlt.common.schema import Schema from dlt.common.typing import DictStrAny from dlt.common.runners import Venv diff --git a/dlt/cli/config_toml_writer.py b/dlt/cli/config_toml_writer.py index 8cf831d725..7ff7f735eb 100644 --- a/dlt/cli/config_toml_writer.py +++ b/dlt/cli/config_toml_writer.py @@ -4,7 +4,7 @@ from tomlkit.container import Container as TOMLContainer from collections.abc import Sequence as C_Sequence -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.common.configuration.specs import ( BaseConfiguration, is_base_configuration_inner_hint, diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index 0eb73ad7a8..d66d884ff2 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -3,7 +3,7 @@ import dlt from dlt.cli.exceptions import CliCommandException -from dlt.common import json +from dlt.common.json import json from dlt.common.pipeline import resource_state, get_dlt_pipelines_dir, TSourceState from dlt.common.destination.reference import TDestinationReferenceArg from dlt.common.runners import Venv diff --git a/dlt/cli/utils.py b/dlt/cli/utils.py index 5ea4471d7e..8699116628 100644 --- a/dlt/cli/utils.py +++ b/dlt/cli/utils.py @@ -1,11 +1,8 @@ import ast import os -import tempfile from typing import Callable -from dlt.common import git from dlt.common.reflection.utils import set_ast_parents -from dlt.common.storages import FileStorage from dlt.common.typing import TFun from dlt.common.configuration import resolve_configuration from dlt.common.configuration.specs import RunConfiguration diff --git a/dlt/common/configuration/providers/google_secrets.py b/dlt/common/configuration/providers/google_secrets.py index 98cbbc4553..43a284c67c 100644 --- a/dlt/common/configuration/providers/google_secrets.py +++ b/dlt/common/configuration/providers/google_secrets.py @@ -1,9 +1,8 @@ import base64 import string import re -from typing import Tuple -from dlt.common import json +from dlt.common.json import json from dlt.common.configuration.specs import GcpServiceAccountCredentials from dlt.common.exceptions import MissingDependencyException from .toml import VaultTomlProvider diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py index 7c856e8c27..10e0b470de 100644 --- a/dlt/common/configuration/providers/toml.py +++ b/dlt/common/configuration/providers/toml.py @@ -6,7 +6,7 @@ from tomlkit.container import Container as TOMLContainer from typing import Any, Dict, Optional, Tuple, Type, Union -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.common.configuration.paths import get_dlt_settings_dir, get_dlt_data_dir from dlt.common.configuration.utils import auto_cast from dlt.common.configuration.specs import known_sections diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 
f7cac78dca..52d33ec0d3 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -1,7 +1,6 @@ from typing import Optional, Dict, Any -from dlt.common import pendulum -from dlt.common.exceptions import MissingDependencyException +from dlt.common.pendulum import pendulum from dlt.common.typing import TSecretStrValue from dlt.common.configuration.specs import ( CredentialsConfiguration, diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index 4d81a493a3..9927b81ebf 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -2,7 +2,8 @@ import sys from typing import Any, ClassVar, Final, List, Tuple, Union, Dict -from dlt.common import json, pendulum +from dlt.common.json import json +from dlt.common.pendulum import pendulum from dlt.common.configuration.specs.api_credentials import OAuth2Credentials from dlt.common.configuration.specs.exceptions import ( InvalidGoogleNativeCredentialsType, diff --git a/dlt/common/configuration/utils.py b/dlt/common/configuration/utils.py index 5a7330447b..51e6b5615a 100644 --- a/dlt/common/configuration/utils.py +++ b/dlt/common/configuration/utils.py @@ -5,7 +5,7 @@ from typing import Any, Dict, Mapping, NamedTuple, Optional, Tuple, Type, Sequence from collections.abc import Mapping as C_Mapping -from dlt.common import json +from dlt.common.json import json from dlt.common.typing import AnyType, TAny from dlt.common.data_types import coerce_value, py_type_to_sc_type from dlt.common.configuration.providers import EnvironProvider @@ -122,8 +122,6 @@ def log_traces( default_value: Any, traces: Sequence[LookupTrace], ) -> None: - from dlt.common import logger - # if logger.is_logging() and logger.log_level() == "DEBUG" and config: # logger.debug(f"Field {key} with type {hint} in {type(config).__name__} {'NOT RESOLVED' if value is None else 'RESOLVED'}") # print(f"Field {key} with type {hint} in {type(config).__name__} {'NOT RESOLVED' if value is None else 'RESOLVED'}") diff --git a/dlt/common/data_types/type_helpers.py b/dlt/common/data_types/type_helpers.py index 61a0aa1dbf..d8ab9eb118 100644 --- a/dlt/common/data_types/type_helpers.py +++ b/dlt/common/data_types/type_helpers.py @@ -3,13 +3,13 @@ import dataclasses import datetime # noqa: I251 from collections.abc import Mapping as C_Mapping, Sequence as C_Sequence -from typing import Any, Type, Literal, Union, cast +from typing import Any, Type, Union from enum import Enum -from dlt.common import pendulum, json, Decimal, Wei from dlt.common.json import custom_pua_remove, json from dlt.common.json._simplejson import custom_encode as json_custom_encode -from dlt.common.arithmetics import InvalidOperation +from dlt.common.wei import Wei +from dlt.common.arithmetics import InvalidOperation, Decimal from dlt.common.data_types.typing import TDataType from dlt.common.time import ( ensure_pendulum_datetime, diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py index 3200350f0b..e812afdaf1 100644 --- a/dlt/common/data_writers/escape.py +++ b/dlt/common/data_writers/escape.py @@ -4,6 +4,8 @@ from datetime import date, datetime, time # noqa: I251 from dlt.common.json import json +from dlt.common.pendulum import pendulum +from dlt.common.time import reduce_pendulum_datetime_precision # use regex to escape characters in single pass SQL_ESCAPE_DICT = {"'": "''", "\\": "\\\\", "\n": "\\n", "\r": "\\r"} @@ 
-152,3 +154,17 @@ def escape_databricks_literal(v: Any) -> Any: return "NULL" return str(v) + + +def format_datetime_literal(v: pendulum.DateTime, precision: int = 6, no_tz: bool = False) -> str: + """Converts `v` to ISO string, optionally without timezone spec (in UTC) and with given `precision`""" + if no_tz: + v = v.in_timezone(tz="UTC").replace(tzinfo=None) + v = reduce_pendulum_datetime_precision(v, precision) + # yet another precision translation + timespec: str = "microseconds" + if precision < 6: + timespec = "milliseconds" + elif precision < 3: + timespec = "seconds" + return v.isoformat(sep=" ", timespec=timespec) diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index b952b39ed2..60457f103e 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -17,7 +17,7 @@ TypeVar, ) -from dlt.common import json +from dlt.common.json import json from dlt.common.configuration import configspec, known_sections, with_config from dlt.common.configuration.specs import BaseConfiguration from dlt.common.data_writers.exceptions import DataWriterNotFound, InvalidDataItem @@ -176,6 +176,9 @@ def writer_spec(cls) -> FileWriterSpec: class InsertValuesWriter(DataWriter): def __init__(self, f: IO[Any], caps: DestinationCapabilitiesContext = None) -> None: + assert ( + caps is not None + ), "InsertValuesWriter requires destination capabilities to be present" super().__init__(f, caps) self._chunks_written = 0 self._headers_lookup: Dict[str, int] = None @@ -272,7 +275,7 @@ def __init__( coerce_timestamps: Optional[Literal["s", "ms", "us", "ns"]] = None, allow_truncated_timestamps: bool = False, ) -> None: - super().__init__(f, caps) + super().__init__(f, caps or DestinationCapabilitiesContext.generic_capabilities("parquet")) from dlt.common.libs.pyarrow import pyarrow self.writer: Optional[pyarrow.parquet.ParquetWriter] = None @@ -287,7 +290,15 @@ def __init__( self.allow_truncated_timestamps = allow_truncated_timestamps def _create_writer(self, schema: "pa.Schema") -> "pa.parquet.ParquetWriter": - from dlt.common.libs.pyarrow import pyarrow + from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_timestamp + + # if timestamps are not explicitly coerced, use destination resolution + # TODO: introduce maximum timestamp resolution, using timestamp_precision too aggressive + # if not self.coerce_timestamps: + # self.coerce_timestamps = get_py_arrow_timestamp( + # self._caps.timestamp_precision, "UTC" + # ).unit + # self.allow_truncated_timestamps = True return pyarrow.parquet.ParquetWriter( self._f, @@ -331,7 +342,9 @@ def write_data(self, rows: Sequence[Any]) -> None: for key in self.complex_indices: for row in rows: if (value := row.get(key)) is not None: - row[key] = json.dumps(value) + # TODO: make this configurable + if value is not None and not isinstance(value, str): + row[key] = json.dumps(value) table = pyarrow.Table.from_pylist(rows, schema=self.schema) # Write diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index ddcc5d1146..9318dca535 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -26,6 +26,7 @@ from dlt.common import logger from dlt.common.schema import Schema, TTableSchema, TSchemaTables +from dlt.common.schema.typing import MERGE_STRATEGIES from dlt.common.schema.exceptions import SchemaException from dlt.common.schema.utils import ( get_write_disposition, @@ -344,6 +345,12 @@ def _verify_schema(self) -> None: table_name, 
self.capabilities.max_identifier_length, ) + if table.get("write_disposition") == "merge": + if "x-merge-strategy" in table and table["x-merge-strategy"] not in MERGE_STRATEGIES: # type: ignore[typeddict-item] + raise SchemaException( + f'"{table["x-merge-strategy"]}" is not a valid merge strategy. ' # type: ignore[typeddict-item] + f"""Allowed values: {', '.join(['"' + s + '"' for s in MERGE_STRATEGIES])}.""" + ) if has_column_with_prop(table, "hard_delete"): if len(get_columns_names_with_prop(table, "hard_delete")) > 1: raise SchemaException( diff --git a/dlt/common/json/__init__.py b/dlt/common/json/__init__.py index 371c74e54a..cf68e5d3d4 100644 --- a/dlt/common/json/__init__.py +++ b/dlt/common/json/__init__.py @@ -12,7 +12,7 @@ except ImportError: PydanticBaseModel = None # type: ignore[misc] -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.common.arithmetics import Decimal from dlt.common.wei import Wei from dlt.common.utils import map_nested_in_place @@ -99,19 +99,19 @@ def _datetime_decoder(obj: str) -> datetime: # Backwards compatibility for data encoded with previous dlt version # fromisoformat does not support Z suffix (until py3.11) obj = obj[:-1] + "+00:00" - return pendulum.DateTime.fromisoformat(obj) # type: ignore[attr-defined, no-any-return] + return pendulum.DateTime.fromisoformat(obj) # define decoder for each prefix DECODERS: List[Callable[[Any], Any]] = [ Decimal, _datetime_decoder, - pendulum.Date.fromisoformat, # type: ignore[attr-defined] + pendulum.Date.fromisoformat, UUID, HexBytes, base64.b64decode, Wei, - pendulum.Time.fromisoformat, # type: ignore[attr-defined] + pendulum.Time.fromisoformat, ] # how many decoders? PUA_CHARACTER_MAX = len(DECODERS) diff --git a/dlt/common/libs/numpy.py b/dlt/common/libs/numpy.py index ccf255c6a8..0f3d1dc612 100644 --- a/dlt/common/libs/numpy.py +++ b/dlt/common/libs/numpy.py @@ -3,4 +3,4 @@ try: import numpy except ModuleNotFoundError: - raise MissingDependencyException("DLT Numpy Helpers", ["numpy"]) + raise MissingDependencyException("dlt Numpy Helpers", ["numpy"]) diff --git a/dlt/common/libs/pandas.py b/dlt/common/libs/pandas.py index 7a94dcf6e2..022aa9b9cd 100644 --- a/dlt/common/libs/pandas.py +++ b/dlt/common/libs/pandas.py @@ -4,7 +4,7 @@ try: import pandas except ModuleNotFoundError: - raise MissingDependencyException("DLT Pandas Helpers", ["pandas"]) + raise MissingDependencyException("dlt Pandas Helpers", ["pandas"]) def pandas_to_arrow(df: pandas.DataFrame) -> Any: diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 3380157600..58ddf69cea 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -15,7 +15,7 @@ ) from dlt import version -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.common.exceptions import MissingDependencyException from dlt.common.schema.typing import DLT_NAME_PREFIX, TTableSchemaColumns @@ -119,7 +119,7 @@ def get_pyarrow_int(precision: Optional[int]) -> Any: return pyarrow.int64() -def _get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: +def get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: """Returns (data_type, precision, scale) tuple from pyarrow.DataType""" if pyarrow.types.is_string(dtype) or pyarrow.types.is_large_string(dtype): return dict(data_type="text") @@ -226,14 +226,14 @@ def should_normalize_arrow_schema( ) -> Tuple[bool, Mapping[str, str], Dict[str, str], TTableSchemaColumns]: rename_mapping = 
get_normalized_arrow_fields_mapping(schema, naming) rev_mapping = {v: k for k, v in rename_mapping.items()} - dlt_table_prefix = naming.normalize_table_identifier(DLT_NAME_PREFIX) + dlt_tables = list(map(naming.normalize_table_identifier, ("_dlt_id", "_dlt_load_id"))) # remove all columns that are dlt columns but are not present in arrow schema. we do not want to add such columns # that should happen in the normalizer columns = { name: column for name, column in columns.items() - if not name.startswith(dlt_table_prefix) or name in rev_mapping + if name not in dlt_tables or name in rev_mapping } # check if nothing to rename @@ -322,7 +322,7 @@ def py_arrow_to_table_schema_columns(schema: pyarrow.Schema) -> TTableSchemaColu result[field.name] = { "name": field.name, "nullable": field.nullable, - **_get_column_type_from_py_arrow(field.type), + **get_column_type_from_py_arrow(field.type), } return result diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index e33bf2ab35..da38ac60a7 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -1,13 +1,21 @@ +from functools import lru_cache from typing import Dict, List, Mapping, Optional, Sequence, Tuple, cast, TypedDict, Any -from dlt.common.data_types.typing import TDataType +from dlt.common.json import json from dlt.common.normalizers.exceptions import InvalidJsonNormalizer from dlt.common.normalizers.typing import TJSONNormalizer from dlt.common.normalizers.utils import generate_dlt_id, DLT_ID_LENGTH_BYTES from dlt.common.typing import DictStrAny, DictStrStr, TDataItem, StrAny from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnSchema, TColumnName, TSimpleRegex -from dlt.common.schema.utils import column_name_validator +from dlt.common.schema.typing import ( + TTableSchema, + TColumnSchema, + TColumnName, + TSimpleRegex, + DLT_NAME_PREFIX, +) +from dlt.common.schema.utils import column_name_validator, get_validity_column_names +from dlt.common.schema.exceptions import ColumnNameConflictException from dlt.common.utils import digest128, update_dict_nested from dlt.common.normalizers.json import ( TNormalizedRowIterator, @@ -127,6 +135,18 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) - norm_row_dicts(dict_row, _r_lvl) return cast(TDataItemRow, out_rec_row), out_rec_list + @staticmethod + def get_row_hash(row: Dict[str, Any]) -> str: + """Returns hash of row. + + Hash includes column names and values and is ordered by column name. + Excludes dlt system columns. + Can be used as deterministic row identifier. 
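The added `get_row_hash` gives `scd2` tables a deterministic `_dlt_id` derived from row content. A small standalone sketch of the same idea (not the normalizer method itself), using the helpers this module already imports:

from dlt.common.json import json
from dlt.common.normalizers.utils import DLT_ID_LENGTH_BYTES
from dlt.common.schema.typing import DLT_NAME_PREFIX
from dlt.common.utils import digest128

def row_content_hash(row: dict) -> str:
    # drop dlt system columns (e.g. _dlt_load_id) before hashing
    filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)}
    # sorted keys make the hash independent of key order
    return digest128(json.dumps(filtered, sort_keys=True), DLT_ID_LENGTH_BYTES)

# identical business data yields the same hash regardless of key order or system columns
assert row_content_hash({"id": 1, "name": "a", "_dlt_load_id": "x"}) == row_content_hash({"name": "a", "id": 1})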
+ """ + row_filtered = {k: v for k, v in row.items() if not k.startswith(DLT_NAME_PREFIX)} + row_str = json.dumps(row_filtered, sort_keys=True) + return digest128(row_str, DLT_ID_LENGTH_BYTES) + @staticmethod def _get_child_row_hash(parent_row_id: str, child_table: str, list_idx: int) -> str: # create deterministic unique id of the child row taking into account that all lists are ordered @@ -220,10 +240,14 @@ def _normalize_row( parent_row_id: Optional[str] = None, pos: Optional[int] = None, _r_lvl: int = 0, + row_hash: bool = False, ) -> TNormalizedRowIterator: schema = self.schema table = schema.naming.shorten_fragments(*parent_path, *ident_path) - + # compute row hash and set as row id + if row_hash: + row_id = self.get_row_hash(dict_row) # type: ignore[arg-type] + dict_row["_dlt_id"] = row_id # flatten current row and extract all lists to recur into flattened_row, lists = self._flatten(table, dict_row, _r_lvl) # always extend row @@ -296,10 +320,18 @@ def normalize_data_item( row = cast(TDataItemRowRoot, item) # identify load id if loaded data must be processed after loading incrementally row["_dlt_load_id"] = load_id + # determine if row hash should be used as dlt id + row_hash = False + if self._is_scd2_table(self.schema, table_name): + row_hash = self._dlt_id_is_row_hash(self.schema, table_name) + self._validate_validity_column_names( + self._get_validity_column_names(self.schema, table_name), item + ) yield from self._normalize_row( cast(TDataItemRowChild, row), {}, (self.schema.naming.normalize_table_identifier(table_name),), + row_hash=row_hash, ) @classmethod @@ -333,3 +365,33 @@ def _validate_normalizer_config(schema: Schema, config: RelationalNormalizerConf "./normalizers/json/config", validator_f=column_name_validator(schema.naming), ) + + @staticmethod + @lru_cache(maxsize=None) + def _is_scd2_table(schema: Schema, table_name: str) -> bool: + if table_name in schema.data_table_names(): + if schema.get_table(table_name).get("x-merge-strategy") == "scd2": + return True + return False + + @staticmethod + @lru_cache(maxsize=None) + def _get_validity_column_names(schema: Schema, table_name: str) -> List[Optional[str]]: + return get_validity_column_names(schema.get_table(table_name)) + + @staticmethod + @lru_cache(maxsize=None) + def _dlt_id_is_row_hash(schema: Schema, table_name: str) -> bool: + return schema.get_table(table_name)["columns"].get("_dlt_id", dict()).get("x-row-version", False) # type: ignore[return-value] + + @staticmethod + def _validate_validity_column_names( + validity_column_names: List[Optional[str]], item: TDataItem + ) -> None: + """Raises exception if configured validity column name appears in data item.""" + for validity_column_name in validity_column_names: + if validity_column_name in item.keys(): + raise ColumnNameConflictException( + "Found column in data item with same name as validity column" + f' "{validity_column_name}".' 
+ ) diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 7c117d4612..8baf872752 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -36,7 +36,12 @@ from dlt.common.destination.exceptions import DestinationHasFailedJobs from dlt.common.exceptions import PipelineStateNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract +from dlt.common.schema.typing import ( + TColumnNames, + TColumnSchema, + TWriteDispositionConfig, + TSchemaContract, +) from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.time import ensure_pendulum_datetime, precise_time @@ -521,7 +526,7 @@ def run( dataset_name: str = None, credentials: Any = None, table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, columns: Sequence[TColumnSchema] = None, primary_key: TColumnNames = None, schema: Schema = None, @@ -544,7 +549,7 @@ def __call__( dataset_name: str = None, credentials: Any = None, table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, columns: Sequence[TColumnSchema] = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, diff --git a/dlt/common/runners/pool_runner.py b/dlt/common/runners/pool_runner.py index 491c74cd18..c691347529 100644 --- a/dlt/common/runners/pool_runner.py +++ b/dlt/common/runners/pool_runner.py @@ -4,13 +4,14 @@ from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor, Future from typing_extensions import ParamSpec -from dlt.common import logger, sleep +from dlt.common import logger from dlt.common.configuration.container import Container from dlt.common.runtime import init from dlt.common.runners.runnable import Runnable, TExecutor from dlt.common.runners.configuration import PoolRunnerConfiguration from dlt.common.runners.typing import TRunMetrics from dlt.common.runtime import signals +from dlt.common.runtime.signals import sleep from dlt.common.exceptions import SignalReceivedException diff --git a/dlt/common/runtime/prometheus.py b/dlt/common/runtime/prometheus.py index 07c960efe7..9bc89211be 100644 --- a/dlt/common/runtime/prometheus.py +++ b/dlt/common/runtime/prometheus.py @@ -3,7 +3,6 @@ from prometheus_client.metrics import MetricWrapperBase from dlt.common.configuration.specs import RunConfiguration -from dlt.common import logger from dlt.common.runtime.exec_info import dlt_version_info from dlt.common.typing import DictStrAny, StrAny diff --git a/dlt/common/runtime/segment.py b/dlt/common/runtime/segment.py index 70b81fb4f4..ac64591072 100644 --- a/dlt/common/runtime/segment.py +++ b/dlt/common/runtime/segment.py @@ -6,13 +6,11 @@ import atexit import base64 import requests -from concurrent.futures import ThreadPoolExecutor from typing import Literal, Optional from dlt.common.configuration.paths import get_dlt_data_dir from dlt.common import logger from dlt.common.managed_thread_pool import ManagedThreadPool - from dlt.common.configuration.specs import RunConfiguration from dlt.common.runtime.exec_info import get_execution_context, TExecutionContext from dlt.common.typing import DictStrAny, StrAny diff --git a/dlt/common/runtime/slack.py b/dlt/common/runtime/slack.py index b1e090098d..75c01aac25 100644 --- a/dlt/common/runtime/slack.py +++ b/dlt/common/runtime/slack.py @@ -2,7 
+2,8 @@ def send_slack_message(incoming_hook: str, message: str, is_markdown: bool = True) -> None: - from dlt.common import json, logger + from dlt.common import logger + from dlt.common.json import json """Sends a `message` to Slack `incoming_hook`, by default formatted as markdown.""" r = requests.post( diff --git a/dlt/common/schema/detections.py b/dlt/common/schema/detections.py index 30b23706af..c9e0e05be9 100644 --- a/dlt/common/schema/detections.py +++ b/dlt/common/schema/detections.py @@ -3,7 +3,8 @@ from hexbytes import HexBytes -from dlt.common import pendulum, Wei +from dlt.common.pendulum import pendulum +from dlt.common.wei import Wei from dlt.common.data_types import TDataType from dlt.common.time import parse_iso_like_datetime diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 96341ab8b4..678f4de15e 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -152,3 +152,7 @@ class UnknownTableException(SchemaException): def __init__(self, table_name: str) -> None: self.table_name = table_name super().__init__(f"Trying to access unknown table {table_name}.") + + +class ColumnNameConflictException(SchemaException): + pass diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index c738f1753e..740e578ef2 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -1,6 +1,5 @@ from copy import copy, deepcopy from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast, Literal -from dlt.common import json from dlt.common.schema.migrations import migrate_schema from dlt.common.utils import extend_list_deduplicated diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index ec60e4c365..e1022cfa84 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -7,7 +7,6 @@ Optional, Sequence, Set, - Tuple, Type, TypedDict, NewType, @@ -34,6 +33,8 @@ LOADS_TABLE_NAME = "_dlt_loads" STATE_TABLE_NAME = "_dlt_pipeline_state" DLT_NAME_PREFIX = "_dlt" +DEFAULT_VALIDITY_COLUMN_NAMES = ["_dlt_valid_from", "_dlt_valid_to"] +"""Default values for validity column names used in `scd2` merge strategy.""" TColumnProp = Literal[ "name", @@ -64,7 +65,6 @@ "dedup_sort", ] """Known hints of a column used to declare hint regexes.""" -TWriteDisposition = Literal["skip", "append", "replace", "merge"] TTableFormat = Literal["iceberg", "parquet", "jsonl"] TTypeDetections = Literal[ "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double" @@ -86,7 +86,6 @@ "root_key", ] ) -WRITE_DISPOSITIONS: Set[TWriteDisposition] = set(get_args(TWriteDisposition)) class TColumnType(TypedDict, total=False): @@ -155,6 +154,27 @@ class NormalizerInfo(TypedDict, total=True): new_table: bool +TWriteDisposition = Literal["skip", "append", "replace", "merge"] +TLoaderMergeStrategy = Literal["delete-insert", "scd2"] + + +WRITE_DISPOSITIONS: Set[TWriteDisposition] = set(get_args(TWriteDisposition)) +MERGE_STRATEGIES: Set[TLoaderMergeStrategy] = set(get_args(TLoaderMergeStrategy)) + + +class TWriteDispositionDict(TypedDict): + disposition: TWriteDisposition + + +class TMergeDispositionDict(TWriteDispositionDict, total=False): + strategy: Optional[TLoaderMergeStrategy] + validity_column_names: Optional[List[str]] + row_version_column_name: Optional[str] + + +TWriteDispositionConfig = Union[TWriteDisposition, TWriteDispositionDict, TMergeDispositionDict] + + # TypedDict that defines properties of a table diff --git a/dlt/common/schema/utils.py 
b/dlt/common/schema/utils.py index 4c1071a8a9..8da9029124 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -5,7 +5,7 @@ from copy import deepcopy, copy from typing import Dict, List, Sequence, Tuple, Type, Any, cast, Iterable, Optional, Union -from dlt.common import json +from dlt.common.json import json from dlt.common.data_types import TDataType from dlt.common.exceptions import DictValidationException from dlt.common.normalizers.naming import NamingConvention @@ -34,6 +34,7 @@ TTypeDetectionFunc, TTypeDetections, TWriteDisposition, + TLoaderMergeStrategy, TSchemaContract, TSortOrder, ) @@ -47,6 +48,7 @@ RE_NON_ALPHANUMERIC_UNDERSCORE = re.compile(r"[^a-zA-Z\d_]") DEFAULT_WRITE_DISPOSITION: TWriteDisposition = "append" +DEFAULT_MERGE_STRATEGY: TLoaderMergeStrategy = "delete-insert" def is_valid_schema_name(name: str) -> bool: @@ -516,6 +518,13 @@ def get_dedup_sort_tuple( return (dedup_sort_col, dedup_sort_order) +def get_validity_column_names(table: TTableSchema) -> List[Optional[str]]: + return [ + get_first_column_name_with_prop(table, "x-valid-from"), + get_first_column_name_with_prop(table, "x-valid-to"), + ] + + def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTables: aggregated_update: TSchemaTables = {} for schema_update in schema_updates: diff --git a/dlt/common/storages/__init__.py b/dlt/common/storages/__init__.py index e5feeaba57..7bb3c0cf97 100644 --- a/dlt/common/storages/__init__.py +++ b/dlt/common/storages/__init__.py @@ -9,6 +9,7 @@ LoadPackageInfo, PackageStorage, TJobState, + create_load_id, ) from .data_item_storage import DataItemStorage from .load_storage import LoadStorage @@ -40,6 +41,7 @@ "LoadPackageInfo", "PackageStorage", "TJobState", + "create_load_id", "fsspec_from_config", "fsspec_filesystem", ] diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index b1cbc11bf9..3a2b483970 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -24,7 +24,7 @@ from fsspec.core import url_to_fs from dlt import version -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.common.configuration.specs import ( GcpCredentials, AwsCredentials, diff --git a/dlt/common/storages/fsspecs/google_drive.py b/dlt/common/storages/fsspecs/google_drive.py index 3bc4b1d7d7..1be862668c 100644 --- a/dlt/common/storages/fsspecs/google_drive.py +++ b/dlt/common/storages/fsspecs/google_drive.py @@ -1,7 +1,7 @@ import posixpath from typing import Any, Dict, List, Literal, Optional, Tuple -from dlt.common import json +from dlt.common.json import json from dlt.common.configuration.specs import GcpCredentials, GcpOAuthCredentials from dlt.common.exceptions import MissingDependencyException diff --git a/dlt/common/storages/load_package.py b/dlt/common/storages/load_package.py index 3ca5056d8e..1c76fd39cd 100644 --- a/dlt/common/storages/load_package.py +++ b/dlt/common/storages/load_package.py @@ -21,17 +21,16 @@ cast, Any, Tuple, - TYPE_CHECKING, TypedDict, ) +from typing_extensions import NotRequired -from dlt.common import pendulum, json - +from dlt.common.pendulum import pendulum +from dlt.common.json import json from dlt.common.configuration import configspec from dlt.common.configuration.specs import ContainerInjectableContext from dlt.common.configuration.exceptions import ContextDefaultCannotBeCreated from dlt.common.configuration.container import Container - from dlt.common.data_writers import DataWriter, new_file_id 
from dlt.common.destination import TLoaderFileFormat from dlt.common.exceptions import TerminalValueError @@ -46,16 +45,18 @@ bump_state_version_if_modified, TVersionedState, default_versioned_state, + json_decode_state, + json_encode_state, ) -from typing_extensions import NotRequired +from dlt.common.time import precise_time TJobFileFormat = Literal["sql", "reference", TLoaderFileFormat] """Loader file formats with internal job types""" class TLoadPackageState(TVersionedState, total=False): - created_at: str - """Timestamp when the loadpackage was created""" + created_at: DateTime + """Timestamp when the load package was created""" """A section of state that does not participate in change merging and version control""" destination_state: NotRequired[Dict[str, Any]] @@ -104,6 +105,16 @@ def default_load_package_state() -> TLoadPackageState: } +def create_load_id() -> str: + """Creates new package load id which is the current unix timestamp converted to string. + Load ids must have the following properties: + - They must maintain increase order over time for a particular dlt schema loaded to particular destination and dataset + `dlt` executes packages in order of load ids + `dlt` considers a state with the highest load id to be the most up to date when restoring state from destination + """ + return str(precise_time()) + + # folders to manage load jobs in a single load package TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) @@ -404,18 +415,23 @@ def complete_job(self, load_id: str, file_name: str) -> str: # Create and drop entities # - def create_package(self, load_id: str) -> None: + def create_package(self, load_id: str, initial_state: TLoadPackageState = None) -> None: self.storage.create_folder(load_id) # create processing directories self.storage.create_folder(os.path.join(load_id, PackageStorage.NEW_JOBS_FOLDER)) self.storage.create_folder(os.path.join(load_id, PackageStorage.COMPLETED_JOBS_FOLDER)) self.storage.create_folder(os.path.join(load_id, PackageStorage.FAILED_JOBS_FOLDER)) self.storage.create_folder(os.path.join(load_id, PackageStorage.STARTED_JOBS_FOLDER)) - # ensure created timestamp is set in state when load package is created - state = self.get_load_package_state(load_id) + # use initial state or create a new by loading non existing state + state = self.get_load_package_state(load_id) if initial_state is None else initial_state if not state.get("created_at"): - state["created_at"] = pendulum.now().to_iso8601_string() - self.save_load_package_state(load_id, state) + # try to parse load_id as unix timestamp + try: + created_at = float(load_id) + except Exception: + created_at = precise_time() + state["created_at"] = pendulum.from_timestamp(created_at) + self.save_load_package_state(load_id, state) def complete_loading_package(self, load_id: str, load_state: TLoadPackageStatus) -> str: """Completes loading the package by writing marker file with`package_state. 
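A minimal sketch of the load id scheme documented in `create_load_id` above: the id is the current unix timestamp rendered as a string, so ids increase over time and `create_package` can recover `created_at` by parsing the id back into a timestamp. The printed values are illustrative only.

from dlt.common.pendulum import pendulum
from dlt.common.time import precise_time

load_id = str(precise_time())  # e.g. "1712923456.789012"
# mirrors the fallback in create_package: parse the load id as a unix timestamp
created_at = pendulum.from_timestamp(float(load_id))
print(load_id, created_at.isoformat())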
Returns path to the completed package""" @@ -424,6 +440,7 @@ def complete_loading_package(self, load_id: str, load_state: TLoadPackageStatus) self.storage.save( os.path.join(load_path, PackageStorage.PACKAGE_COMPLETED_FILE_NAME), load_state ) + # TODO: also modify state return load_path def remove_completed_jobs(self, load_id: str) -> None: @@ -472,7 +489,7 @@ def get_load_package_state(self, load_id: str) -> TLoadPackageState: raise LoadPackageNotFound(load_id) try: state_dump = self.storage.load(self.get_load_package_state_path(load_id)) - state = json.loads(state_dump) + state = json_decode_state(state_dump) return migrate_load_package_state( state, state["_state_engine_version"], LOAD_PACKAGE_STATE_ENGINE_VERSION ) @@ -486,7 +503,7 @@ def save_load_package_state(self, load_id: str, state: TLoadPackageState) -> Non bump_loadpackage_state_version_if_modified(state) self.storage.save( self.get_load_package_state_path(load_id), - json.dumps(state), + json_encode_state(state), ) def get_load_package_state_path(self, load_id: str) -> str: diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index 8b5109d9e2..97e62201e5 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -2,7 +2,7 @@ from typing import Iterable, List, Optional, Sequence from dlt.common.data_writers.exceptions import DataWriterNotFound -from dlt.common import json +from dlt.common.json import json from dlt.common.configuration import known_sections from dlt.common.configuration.inject import with_config from dlt.common.destination import ALL_SUPPORTED_FILE_FORMATS, TLoaderFileFormat @@ -105,6 +105,14 @@ def create_item_storage( pass raise + def import_extracted_package( + self, load_id: str, extract_package_storage: PackageStorage + ) -> None: + # pass the original state + self.new_packages.create_package( + load_id, extract_package_storage.get_load_package_state(load_id) + ) + def list_new_jobs(self, load_id: str) -> Sequence[str]: """Lists all jobs in new jobs folder of normalized package storage and checks if file formats are supported""" new_jobs = self.normalized_packages.list_new_jobs(load_id) diff --git a/dlt/common/storages/schema_storage.py b/dlt/common/storages/schema_storage.py index 23b695b839..1afed18929 100644 --- a/dlt/common/storages/schema_storage.py +++ b/dlt/common/storages/schema_storage.py @@ -1,7 +1,8 @@ import yaml from typing import Iterator, List, Mapping, Tuple, cast -from dlt.common import json, logger +from dlt.common import logger +from dlt.common.json import json from dlt.common.configuration import with_config from dlt.common.configuration.accessors import config from dlt.common.schema.utils import to_pretty_json, to_pretty_yaml diff --git a/dlt/common/time.py b/dlt/common/time.py index d3c8f9746c..161205deb8 100644 --- a/dlt/common/time.py +++ b/dlt/common/time.py @@ -208,10 +208,12 @@ def to_seconds(td: Optional[TimedeltaSeconds]) -> Optional[float]: return td -T = TypeVar("T", bound=Union[pendulum.DateTime, pendulum.Time]) +TTimeWithPrecision = TypeVar("TTimeWithPrecision", bound=Union[pendulum.DateTime, pendulum.Time]) -def reduce_pendulum_datetime_precision(value: T, microsecond_precision: int) -> T: - if microsecond_precision >= 6: +def reduce_pendulum_datetime_precision( + value: TTimeWithPrecision, precision: int +) -> TTimeWithPrecision: + if precision >= 6: return value - return value.replace(microsecond=value.microsecond // 10 ** (6 - microsecond_precision) * 10 ** (6 - microsecond_precision)) # type: ignore + 
return value.replace(microsecond=value.microsecond // 10 ** (6 - precision) * 10 ** (6 - precision)) # type: ignore diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 4ddde87758..1d3020f4dd 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -296,7 +296,7 @@ def _is_recursive_merge(a: StrAny, b: StrAny) -> bool: if key in dst: if _is_recursive_merge(dst[key], src[key]): # If the key for both `dst` and `src` are both Mapping types (e.g. dict), then recurse. - update_dict_nested(dst[key], src[key]) + update_dict_nested(dst[key], src[key], keep_dst_values=keep_dst_values) elif dst[key] is src[key]: # If a key exists in both objects and the values are `same`, the value from the `dst` object will be used. pass diff --git a/dlt/common/versioned_state.py b/dlt/common/versioned_state.py index 6f45df83c4..52a26c6943 100644 --- a/dlt/common/versioned_state.py +++ b/dlt/common/versioned_state.py @@ -1,9 +1,12 @@ import base64 import hashlib +import binascii from copy import copy +from typing import TypedDict, List, Tuple, Mapping -from dlt.common import json -from typing import TypedDict, List, Tuple +from dlt.common.json import json +from dlt.common.typing import DictStrAny +from dlt.common.utils import compressed_b64decode, compressed_b64encode class TVersionedState(TypedDict, total=False): @@ -19,7 +22,7 @@ def generate_state_version_hash(state: TVersionedState, exclude_attrs: List[str] exclude_attrs.extend(["_state_version", "_state_engine_version", "_version_hash"]) for attr in exclude_attrs: state_copy.pop(attr, None) # type: ignore - content = json.typed_dumpb(state_copy, sort_keys=True) # type: ignore + content = json.typed_dumpb(state_copy, sort_keys=True) h = hashlib.sha3_256(content) return base64.b64encode(h.digest()).decode("ascii") @@ -42,3 +45,24 @@ def bump_state_version_if_modified( def default_versioned_state() -> TVersionedState: return {"_state_version": 0, "_state_engine_version": 1} + + +def json_encode_state(state: TVersionedState) -> str: + return json.typed_dumps(state) + + +def json_decode_state(state_str: str) -> DictStrAny: + return json.typed_loads(state_str) # type: ignore[no-any-return] + + +def compress_state(state: TVersionedState) -> str: + return compressed_b64encode(json.typed_dumpb(state)) + + +def decompress_state(state_str: str) -> DictStrAny: + try: + state_bytes = compressed_b64decode(state_str) + except binascii.Error: + return json.typed_loads(state_str) # type: ignore[no-any-return] + else: + return json.typed_loadb(state_bytes) # type: ignore[no-any-return] diff --git a/dlt/destinations/decorators.py b/dlt/destinations/decorators.py index a920d336a2..8e0b5d5ee8 100644 --- a/dlt/destinations/decorators.py +++ b/dlt/destinations/decorators.py @@ -7,14 +7,15 @@ from functools import wraps from dlt.common import logger +from dlt.common.destination import TLoaderFileFormat +from dlt.common.typing import TDataItems +from dlt.common.schema import TTableSchema + from dlt.destinations.impl.destination.factory import destination as _destination from dlt.destinations.impl.destination.configuration import ( TDestinationCallableParams, CustomDestinationClientConfiguration, ) -from dlt.common.destination import TLoaderFileFormat -from dlt.common.typing import TDataItems -from dlt.common.schema import TTableSchema def destination( diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 86448bd011..0ac042a056 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ 
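A short usage sketch of the state (de)serialization helpers added to `versioned_state` above; the `pipeline_name` key is hypothetical and only gives the state some content. `decompress_state` also accepts an uncompressed typed-JSON string and falls back to `json.typed_loads`.

from dlt.common.versioned_state import (
    compress_state,
    decompress_state,
    json_decode_state,
    json_encode_state,
)

state = {"_state_version": 1, "_state_engine_version": 1, "pipeline_name": "demo"}
# compressed base64 round trip
assert decompress_state(compress_state(state)) == state
# plain typed JSON round trip (used for load package state files)
assert json_decode_state(json_encode_state(state)) == state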
b/dlt/destinations/impl/bigquery/bigquery.py @@ -9,7 +9,8 @@ from google.api_core import retry from google.cloud.bigquery.retry import _RETRYABLE_REASONS -from dlt.common import json, logger +from dlt.common import logger +from dlt.common.json import json from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import ( FollowupJob, diff --git a/dlt/destinations/impl/databricks/sql_client.py b/dlt/destinations/impl/databricks/sql_client.py index 68ea863cc4..7e2487593d 100644 --- a/dlt/destinations/impl/databricks/sql_client.py +++ b/dlt/destinations/impl/databricks/sql_client.py @@ -8,8 +8,6 @@ ) from databricks.sql.exc import Error as DatabricksSqlError -from dlt.common import pendulum -from dlt.common import logger from dlt.common.destination import DestinationCapabilitiesContext from dlt.destinations.exceptions import ( DatabaseTerminalException, diff --git a/dlt/destinations/impl/destination/factory.py b/dlt/destinations/impl/destination/factory.py index 8395c66ac8..3ae6f2e876 100644 --- a/dlt/destinations/impl/destination/factory.py +++ b/dlt/destinations/impl/destination/factory.py @@ -1,14 +1,13 @@ import typing as t import inspect from importlib import import_module - from types import ModuleType -from dlt.common.typing import AnyFun +from dlt.common import logger +from dlt.common.typing import AnyFun from dlt.common.destination import Destination, DestinationCapabilitiesContext, TLoaderFileFormat from dlt.common.configuration import known_sections, with_config, get_fun_spec from dlt.common.configuration.exceptions import ConfigurationValueError -from dlt.common import logger from dlt.common.utils import get_callable_name, is_inner_callable from dlt.destinations.exceptions import DestinationTransientException diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 0d91220d88..bafac210cc 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -14,7 +14,7 @@ List, ) -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.storages import FileStorage from dlt.common.destination import DestinationCapabilitiesContext diff --git a/dlt/destinations/impl/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py index febfe38ec9..5a5e5f8cfd 100644 --- a/dlt/destinations/impl/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -1,7 +1,9 @@ from types import TracebackType from typing import ClassVar, Optional, Sequence, List, Dict, Type, Iterable, Any, IO -from dlt.common import json, pendulum, logger +from dlt.common import logger +from dlt.common.json import json +from dlt.common.pendulum import pendulum from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.utils import get_columns_names_with_prop from dlt.common.destination import DestinationCapabilitiesContext diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py index 6486a75e6e..ab2bea54ef 100644 --- a/dlt/destinations/impl/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -24,7 +24,9 @@ from weaviate.gql.get import GetBuilder from weaviate.util import generate_uuid5 -from dlt.common import json, pendulum, logger +from dlt.common import logger +from dlt.common.json import json +from dlt.common.pendulum import pendulum from dlt.common.typing 
import StrAny, TFun from dlt.common.time import ensure_pendulum_datetime from dlt.common.schema import Schema, TTableSchema, TSchemaTables, TTableSchemaColumns @@ -491,7 +493,8 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: while True: state_records = self.get_records( self.schema.state_table_name, - sort={"path": ["created_at"], "order": "desc"}, + # search by package load id which is guaranteed to increase over time + sort={"path": ["_dlt_load_id"], "order": "desc"}, where={ "path": ["pipeline_name"], "operator": "Equal", diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index ea0d10d11d..7f1403eb30 100644 --- a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -23,7 +23,9 @@ import zlib import re -from dlt.common import json, pendulum, logger +from dlt.common import logger +from dlt.common.json import json +from dlt.common.pendulum import pendulum from dlt.common.data_types import TDataType from dlt.common.schema.typing import ( COLUMN_HINTS, @@ -363,7 +365,7 @@ def get_stored_state(self, pipeline_name: str) -> StateInfo: query = ( f"SELECT {self.state_table_columns} FROM {state_table} AS s JOIN {loads_table} AS l ON" " l.load_id = s._dlt_load_id WHERE pipeline_name = %s AND l.status = 0 ORDER BY" - " created_at DESC" + " l.load_id DESC" ) with self.sql_client.execute_query(query, pipeline_name) as cur: row = cur.fetchone() diff --git a/dlt/destinations/job_impl.py b/dlt/destinations/job_impl.py index 8e017fc791..218f73cc59 100644 --- a/dlt/destinations/job_impl.py +++ b/dlt/destinations/job_impl.py @@ -3,7 +3,7 @@ import tempfile # noqa: 251 from typing import Dict, Iterable, List -from dlt.common import json +from dlt.common.json import json from dlt.common.destination.reference import NewLoadJob, FollowupJob, TLoadJobState, LoadJob from dlt.common.schema import Schema, TTableSchema from dlt.common.storages import FileStorage diff --git a/dlt/destinations/path_utils.py b/dlt/destinations/path_utils.py index 047cb274e0..5b2ba9d183 100644 --- a/dlt/destinations/path_utils.py +++ b/dlt/destinations/path_utils.py @@ -1,9 +1,10 @@ # this can probably go some other place, but it is shared by destinations, so for now it is here from typing import List, Sequence, Tuple -import pendulum import re +from dlt.common.pendulum import pendulum + from dlt.destinations.exceptions import InvalidFilesystemLayout, CantExtractTablePrefix # TODO: ensure layout only has supported placeholders diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 9c5a080278..eadedb742e 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -1,13 +1,20 @@ from typing import Any, Dict, List, Sequence, Tuple, cast, TypedDict, Optional import yaml +from dlt.common.data_writers.escape import format_datetime_literal from dlt.common.logger import pretty_format_exception -from dlt.common.schema.typing import TTableSchema, TSortOrder +from dlt.common.pendulum import pendulum +from dlt.common.schema.typing import ( + TTableSchema, + TSortOrder, +) from dlt.common.schema.utils import ( get_columns_names_with_prop, get_first_column_name_with_prop, get_dedup_sort_tuple, + get_validity_column_names, + DEFAULT_MERGE_STRATEGY, ) from dlt.common.storages.load_storage import ParsedLoadJobFileName from dlt.common.utils import uniq_id @@ -15,6 +22,11 @@ from dlt.destinations.exceptions import MergeDispositionException from dlt.destinations.job_impl import NewLoadJobImpl from dlt.destinations.sql_client 
import SqlClientBase +from dlt.pipeline.current import load_package as current_load_package + + +HIGH_TS = pendulum.datetime(9999, 12, 31) +"""High timestamp used to indicate active records in `scd2` merge strategy.""" class SqlJobParams(TypedDict, total=False): @@ -139,25 +151,17 @@ class SqlMergeJob(SqlBaseJob): failed_text: str = "Tried to generate a merge sql job for the following tables:" @classmethod - def generate_sql( + def generate_sql( # type: ignore[return] cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None, ) -> List[str]: - """Generates a list of sql statements that merge the data in staging dataset with the data in destination dataset. - - The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). - The root table is merged using primary_key and merge_key hints which can be compound and be both specified. In that case the OR clause is generated. - The child tables are merged based on propagated `root_key` which is a type of foreign key but always leading to a root table. - - First we store the root_keys of root table elements to be deleted in the temp table. Then we use the temp table to delete records from root and all child tables in the destination dataset. - At the end we copy the data from the staging dataset into destination dataset. - - If a hard_delete column is specified, records flagged as deleted will be excluded from the copy into the destination dataset. - If a dedup_sort column is specified in conjunction with a primary key, records will be sorted before deduplication, so the "latest" record remains. - """ - return cls.gen_merge_sql(table_chain, sql_client) + merge_strategy = table_chain[0].get("x-merge-strategy", DEFAULT_MERGE_STRATEGY) + if merge_strategy == "delete-insert": + return cls.gen_merge_sql(table_chain, sql_client) + elif merge_strategy == "scd2": + return cls.gen_scd2_sql(table_chain, sql_client) @classmethod def _gen_key_table_clauses( @@ -339,6 +343,18 @@ def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str: def gen_merge_sql( cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any] ) -> List[str]: + """Generates a list of sql statements that merge the data in staging dataset with the data in destination dataset. + + The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). + The root table is merged using primary_key and merge_key hints which can be compound and be both specified. In that case the OR clause is generated. + The child tables are merged based on propagated `root_key` which is a type of foreign key but always leading to a root table. + + First we store the root_keys of root table elements to be deleted in the temp table. Then we use the temp table to delete records from root and all child tables in the destination dataset. + At the end we copy the data from the staging dataset into destination dataset. + + If a hard_delete column is specified, records flagged as deleted will be excluded from the copy into the destination dataset. + If a dedup_sort column is specified in conjunction with a primary key, records will be sorted before deduplication, so the "latest" record remains. 
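A sketch of the validity-window values used by the `scd2` branch dispatched in `generate_sql` above: the load package creation time closes retired records, while `HIGH_TS` marks records that are still active. `pendulum.now()` stands in for the package `created_at` read from `current_load_package()` in the actual job.

from dlt.common.data_writers.escape import format_datetime_literal
from dlt.common.pendulum import pendulum

HIGH_TS = pendulum.datetime(9999, 12, 31)
boundary_ts = format_datetime_literal(pendulum.now(), precision=6)
active_record_ts = format_datetime_literal(HIGH_TS, precision=6)
# retired rows get:        valid_to = boundary_ts
# newly inserted rows get: valid_from = boundary_ts, valid_to = active_record_ts
print(boundary_ts, active_record_ts)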
+ """ sql: List[str] = [] root_table = table_chain[0] @@ -486,3 +502,87 @@ def gen_merge_sql( sql.append(f"INSERT INTO {table_name}({col_str}) {select_sql};") return sql + + @classmethod + def gen_scd2_sql( + cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any] + ) -> List[str]: + """Generates SQL statements for the `scd2` merge strategy. + + The root table can be inserted into and updated. + Updates only take place when a record retires (because there is a new version + or it is deleted) and only affect the "valid to" column. + Child tables are insert-only. + """ + sql: List[str] = [] + root_table = table_chain[0] + root_table_name = sql_client.make_qualified_table_name(root_table["name"]) + with sql_client.with_staging_dataset(staging=True): + staging_root_table_name = sql_client.make_qualified_table_name(root_table["name"]) + + # get column names + escape_id = sql_client.capabilities.escape_identifier + from_, to = list(map(escape_id, get_validity_column_names(root_table))) # validity columns + hash_ = escape_id( + get_first_column_name_with_prop(root_table, "x-row-version") + ) # row hash column + + # define values for validity columns + boundary_ts = format_datetime_literal( + current_load_package()["state"]["created_at"], + sql_client.capabilities.timestamp_precision, + ) + active_record_ts = format_datetime_literal( + HIGH_TS, sql_client.capabilities.timestamp_precision + ) + + # retire updated and deleted records + sql.append(f""" + UPDATE {root_table_name} SET {to} = '{boundary_ts}' + WHERE NOT EXISTS ( + SELECT s.{hash_} FROM {staging_root_table_name} AS s + WHERE {root_table_name}.{hash_} = s.{hash_} + ) AND {to} = '{active_record_ts}'; + """) + + # insert new active records in root table + columns = map(escape_id, list(root_table["columns"].keys())) + col_str = ", ".join([c for c in columns if c not in (from_, to)]) + sql.append(f""" + INSERT INTO {root_table_name} ({col_str}, {from_}, {to}) + SELECT {col_str}, '{boundary_ts}' AS {from_}, '{active_record_ts}' AS {to} + FROM {staging_root_table_name} AS s + WHERE NOT EXISTS (SELECT s.{hash_} FROM {root_table_name} AS f WHERE f.{hash_} = s.{hash_}); + """) + + # insert list elements for new active records in child tables + child_tables = table_chain[1:] + if child_tables: + unique_column: str = None + # use unique hint to create temp table with all identifiers to delete + unique_columns = get_columns_names_with_prop(root_table, "unique") + if not unique_columns: + raise MergeDispositionException( + sql_client.fully_qualified_dataset_name(), + staging_root_table_name, + [t["name"] for t in table_chain], + f"There is no unique column (ie _dlt_id) in top table {root_table['name']} so" + " it is not possible to link child tables to it.", + ) + # get first unique column + unique_column = escape_id(unique_columns[0]) + # TODO: - based on deterministic child hashes (OK) + # - if row hash changes all is right + # - if it does not we only capture new records, while we should replace existing with those in stage + # - this write disposition is way more similar to regular merge (how root tables are handled is different, other tables handled same) + for table in child_tables: + table_name = sql_client.make_qualified_table_name(table["name"]) + with sql_client.with_staging_dataset(staging=True): + staging_table_name = sql_client.make_qualified_table_name(table["name"]) + sql.append(f""" + INSERT INTO {table_name} + SELECT * + FROM {staging_table_name} AS s + WHERE NOT EXISTS (SELECT 1 FROM {table_name} AS f WHERE 
f.{unique_column} = s.{unique_column}); + """) + return sql diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 28a2aca633..bc85cb4a03 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -33,8 +33,8 @@ from dlt.common.schema.schema import Schema from dlt.common.schema.typing import ( TColumnNames, - TTableSchemaColumns, TWriteDisposition, + TWriteDispositionConfig, TAnySchemaColumns, TSchemaContract, TTableFormat, @@ -286,7 +286,7 @@ def resource( /, name: str = None, table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, + write_disposition: TTableHintTemplate[TWriteDispositionConfig] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, @@ -304,7 +304,7 @@ def resource( /, name: str = None, table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, + write_disposition: TTableHintTemplate[TWriteDispositionConfig] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, @@ -322,7 +322,7 @@ def resource( /, name: TTableHintTemplate[str] = None, table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, + write_disposition: TTableHintTemplate[TWriteDispositionConfig] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, @@ -341,7 +341,7 @@ def resource( /, name: str = None, table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, + write_disposition: TTableHintTemplate[TWriteDispositionConfig] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, @@ -358,7 +358,7 @@ def resource( /, name: TTableHintTemplate[str] = None, table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, + write_disposition: TTableHintTemplate[TWriteDispositionConfig] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, @@ -400,7 +400,9 @@ def resource( table_name (TTableHintTemplate[str], optional): An table name, if different from `name`. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. - write_disposition (Literal["skip", "append", "replace", "merge"], optional): Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". + write_disposition (TTableHintTemplate[TWriteDispositionConfig], optional): Controls how to write data to a table. Accepts a shorthand string literal or configuration dictionary. + Allowed shorthand string literals: `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. 
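An illustration of the write disposition configurations this docstring describes: plain string literals still work, and merge behaviour can be refined with a dictionary (the validity column names below are just examples; the defaults are `_dlt_valid_from`/`_dlt_valid_to`).

from dlt.common.schema.typing import TMergeDispositionDict, TWriteDispositionConfig

plain: TWriteDispositionConfig = "append"
scd2: TMergeDispositionDict = {
    "disposition": "merge",
    "strategy": "scd2",
    "validity_column_names": ["valid_from", "valid_to"],
}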
"merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". + Write behaviour can be further customized through a configuration dictionary. For example, to obtain an SCD2 table provide `write_disposition={"disposition": "merge", "strategy": "scd2"}`. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. columns (Sequence[TAnySchemaColumns], optional): A list, dict or pydantic model of column schemas. diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 02dd06eaf3..cc2b03c50b 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -24,7 +24,7 @@ TAnySchemaColumns, TColumnNames, TSchemaContract, - TWriteDisposition, + TWriteDispositionConfig, ) from dlt.common.storages import NormalizeStorageConfiguration, LoadPackageInfo, SchemaStorage from dlt.common.storages.load_package import ParsedLoadJobFileName @@ -47,7 +47,7 @@ def data_to_sources( schema: Schema = None, table_name: str = None, parent_table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, schema_contract: TSchemaContract = None, diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 421250951e..b4afc5b1f8 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -213,7 +213,14 @@ class ObjectExtractor(Extractor): class ArrowExtractor(Extractor): - """Extracts arrow data items into parquet""" + """Extracts arrow data items into parquet. Normalizes arrow items column names. + Compares the arrow schema to actual dlt table schema to reorder the columns and to + insert missing columns (without data). + + We do things that normalizer should do here so we do not need to load and save parquet + files again later. + + """ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: static_table_name = self._get_static_table_name(resource, meta) @@ -284,7 +291,7 @@ def _write_item( items: TDataItems, columns: TTableSchemaColumns = None, ) -> None: - columns = columns or self.schema.tables[table_name]["columns"] + columns = columns or self.schema.get_table_columns(table_name) # Note: `items` is always a list here due to the conversion in `write_table` items = [ pyarrow.normalize_py_arrow_item(item, columns, self.naming, self._caps) @@ -312,23 +319,28 @@ def _compute_table( # normalize arrow table before merging arrow_table = self.schema.normalize_table_identifiers(arrow_table) # issue warnings when overriding computed with arrow + override_warn: bool = False for col_name, column in arrow_table["columns"].items(): if src_column := computed_table["columns"].get(col_name): for hint_name, hint in column.items(): if (src_hint := src_column.get(hint_name)) is not None: if src_hint != hint: - logger.warning( + override_warn = True + logger.info( f"In resource: {resource.name}, when merging arrow schema on" f" column {col_name}. The hint {hint_name} value" - f" {src_hint} defined in resource is overwritten from arrow" + f" {src_hint} defined in resource will overwrite arrow hint" f" with value {hint}." ) + if override_warn: + logger.warning( + f"In resource: {resource.name}, when merging arrow schema with dlt schema," + " several column hints were different. dlt schema hints were kept and arrow" + " schema and data were unmodified. It is up to destination to coerce the" + " differences when loading. 
Change log level to INFO for more details." + ) - # we must override the columns to preserve the order in arrow table - arrow_table["columns"] = update_dict_nested( - arrow_table["columns"], computed_table["columns"], keep_dst_values=True - ) - + update_dict_nested(arrow_table["columns"], computed_table["columns"]) return arrow_table def _compute_and_update_table( diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 01a99a23fe..97da7dab9c 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -1,20 +1,28 @@ from copy import copy, deepcopy from typing import TypedDict, cast, Any, Optional, Dict +from dlt.common import logger from dlt.common.schema.typing import ( TColumnNames, TColumnProp, TPartialTableSchema, TTableSchema, TTableSchemaColumns, - TWriteDisposition, + TWriteDispositionConfig, + TMergeDispositionDict, TAnySchemaColumns, TTableFormat, TSchemaContract, + DEFAULT_VALIDITY_COLUMN_NAMES, ) -from dlt.common import logger -from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_column, new_column, new_table -from dlt.common.typing import TDataItem, DictStrAny, DictStrStr +from dlt.common.schema.utils import ( + DEFAULT_WRITE_DISPOSITION, + DEFAULT_MERGE_STRATEGY, + merge_column, + new_column, + new_table, +) +from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys from dlt.extract.exceptions import ( @@ -30,7 +38,7 @@ class TResourceHints(TypedDict, total=False): name: TTableHintTemplate[str] # description: TTableHintTemplate[str] - write_disposition: TTableHintTemplate[TWriteDisposition] + write_disposition: TTableHintTemplate[TWriteDispositionConfig] # table_sealed: Optional[bool] parent: TTableHintTemplate[str] columns: TTableHintTemplate[TTableSchemaColumns] @@ -57,7 +65,7 @@ def __init__(self, hints: TResourceHints, create_table_variant: bool) -> None: def make_hints( table_name: TTableHintTemplate[str] = None, parent_table_name: TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, + write_disposition: TTableHintTemplate[TWriteDispositionConfig] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, @@ -131,13 +139,13 @@ def table_name(self, value: TTableHintTemplate[str]) -> None: self.apply_hints(table_name=value) @property - def write_disposition(self) -> TTableHintTemplate[TWriteDisposition]: + def write_disposition(self) -> TTableHintTemplate[TWriteDispositionConfig]: if self._hints is None or self._hints.get("write_disposition") is None: return DEFAULT_WRITE_DISPOSITION return self._hints.get("write_disposition") @write_disposition.setter - def write_disposition(self, value: TTableHintTemplate[TWriteDisposition]) -> None: + def write_disposition(self, value: TTableHintTemplate[TWriteDispositionConfig]) -> None: self.apply_hints(write_disposition=value) @property @@ -176,8 +184,7 @@ def compute_table_schema(self, item: TDataItem = None, meta: Any = None) -> TTab for k, v in table_template.items() if k not in NATURAL_CALLABLES } # type: ignore - table_schema = self._merge_keys(resolved_template) - table_schema["resource"] = self.name + table_schema = self._create_table_schema(resolved_template, self.name) validate_dict_ignoring_xkeys( spec=TTableSchema, doc=table_schema, @@ -189,7 +196,7 @@ def apply_hints( self, table_name: TTableHintTemplate[str] = None, parent_table_name: 
TTableHintTemplate[str] = None, - write_disposition: TTableHintTemplate[TWriteDisposition] = None, + write_disposition: TTableHintTemplate[TWriteDispositionConfig] = None, columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, @@ -391,17 +398,76 @@ def _merge_key(hint: TColumnProp, keys: TColumnNames, partial: TPartialTableSche partial["columns"][key][hint] = True @staticmethod - def _merge_keys(t_: TResourceHints) -> TPartialTableSchema: - """Merges resolved keys into columns""" - partial = cast(TPartialTableSchema, t_) - # assert not callable(t_["merge_key"]) - # assert not callable(t_["primary_key"]) - if "primary_key" in t_: - DltResourceHints._merge_key("primary_key", t_.pop("primary_key"), partial) # type: ignore - if "merge_key" in t_: - DltResourceHints._merge_key("merge_key", t_.pop("merge_key"), partial) # type: ignore - - return partial + def _merge_keys(dict_: Dict[str, Any]) -> None: + """Merges primary and merge keys into columns in place.""" + + if "primary_key" in dict_: + DltResourceHints._merge_key("primary_key", dict_.pop("primary_key"), dict_) # type: ignore + if "merge_key" in dict_: + DltResourceHints._merge_key("merge_key", dict_.pop("merge_key"), dict_) # type: ignore + + @staticmethod + def _merge_write_disposition_dict(dict_: Dict[str, Any]) -> None: + """Merges write disposition dictionary into write disposition shorthand and x-hints in place.""" + + if dict_["write_disposition"]["disposition"] == "merge": + DltResourceHints._merge_merge_disposition_dict(dict_) + # reduce merge disposition from dict to shorthand + dict_["write_disposition"] = dict_["write_disposition"]["disposition"] + + @staticmethod + def _merge_merge_disposition_dict(dict_: Dict[str, Any]) -> None: + """Merges merge disposition dict into x-hints on in place.""" + + mddict: TMergeDispositionDict = deepcopy(dict_["write_disposition"]) + if mddict is not None: + dict_["x-merge-strategy"] = ( + mddict["strategy"] if "strategy" in mddict else DEFAULT_MERGE_STRATEGY + ) + # add columns for `scd2` merge strategy + if dict_.get("x-merge-strategy") == "scd2": + if mddict.get("validity_column_names") is None: + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + else: + from_, to = mddict["validity_column_names"] + dict_["columns"][from_] = { + "name": from_, + "data_type": "timestamp", + "nullable": ( + True + ), # validity columns are empty when first loaded into staging table + "x-valid-from": True, + } + dict_["columns"][to] = { + "name": to, + "data_type": "timestamp", + "nullable": True, + "x-valid-to": True, + } + if mddict.get("row_version_column_name") is None: + hash_ = "_dlt_id" + else: + hash_ = mddict["row_version_column_name"] + dict_["columns"][hash_] = { + "name": hash_, + "nullable": False, + "x-row-version": True, + } + + @staticmethod + def _create_table_schema(resource_hints: TResourceHints, resource_name: str) -> TTableSchema: + """Creates table schema from resource hints and resource name.""" + + dict_ = cast(Dict[str, Any], resource_hints) + DltResourceHints._merge_keys(dict_) + dict_["resource"] = resource_name + if "write_disposition" in dict_: + if isinstance(dict_["write_disposition"], str): + dict_["write_disposition"] = { + "disposition": dict_["write_disposition"] + } # wrap in dict + DltResourceHints._merge_write_disposition_dict(dict_) + return cast(TTableSchema, dict_) @staticmethod def validate_dynamic_hints(template: TResourceHints) -> None: diff --git 
a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index e74e87d094..ef7523b207 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -8,8 +8,9 @@ import dlt +from dlt.common import logger from dlt.common.exceptions import MissingDependencyException -from dlt.common import pendulum, logger +from dlt.common.pendulum import pendulum from dlt.common.jsonpath import compile_path from dlt.common.typing import ( TDataItem, diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 29b20de7b8..d117b4f1d8 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -4,7 +4,7 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.utils import digest128 from dlt.common.json import json -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.common.typing import TDataItem from dlt.common.jsonpath import find_values, JSONPathFields, compile_path from dlt.extract.incremental.exceptions import ( diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py index 3e01a020ba..de777ad60e 100644 --- a/dlt/extract/storage.py +++ b/dlt/extract/storage.py @@ -3,7 +3,6 @@ from dlt.common.data_writers import TDataItemFormat, DataWriterMetrics, DataWriter, FileWriterSpec from dlt.common.schema import Schema -from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.storages import ( NormalizeStorageConfiguration, NormalizeStorage, @@ -11,10 +10,9 @@ FileStorage, PackageStorage, LoadPackageInfo, + create_load_id, ) from dlt.common.storages.exceptions import LoadPackageNotFound -from dlt.common.typing import TDataItems -from dlt.common.time import precise_time from dlt.common.utils import uniq_id @@ -68,7 +66,7 @@ def create_load_package(self, schema: Schema, reuse_exiting_package: bool = True break load_id = None if not load_id: - load_id = str(precise_time()) + load_id = create_load_id() self.new_packages.create_package(load_id) # always save schema self.new_packages.save_schema(load_id, schema) diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 6677475499..89fe06349b 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -24,11 +24,13 @@ import dlt -from dlt.common import pendulum from dlt.common import logger +from dlt.common.pendulum import pendulum from dlt.common.runtime.telemetry import with_telemetry + from dlt.common.destination import TLoaderFileFormat -from dlt.common.schema.typing import TWriteDisposition, TSchemaContract +from dlt.common.schema.typing import TWriteDispositionConfig, TSchemaContract + from dlt.common.utils import uniq_id from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention from dlt.common.configuration.container import Container @@ -165,7 +167,7 @@ def run( pipeline: Pipeline, data: Any, table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, loader_file_format: TLoaderFileFormat = None, schema_contract: TSchemaContract = None, pipeline_name: str = None, @@ -180,7 +182,7 @@ def run( data (Any): The data to run the pipeline with table_name (str, optional): The name of the table to which the data should be loaded within the `dataset`. - write_disposition (TWriteDisposition, optional): Same as + write_disposition (TWriteDispositionConfig, optional): Same as in `run` command. 
loader_file_format (TLoaderFileFormat, optional): The file format the loader will use to create the @@ -210,7 +212,7 @@ def _run( pipeline: Pipeline, data: Any, table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, loader_file_format: TLoaderFileFormat = None, schema_contract: TSchemaContract = None, pipeline_name: str = None, @@ -223,7 +225,7 @@ def _run( table_name (str, optional): The name of the table to which the data should be loaded within the `dataset`. - write_disposition (TWriteDisposition, optional): + write_disposition (TWriteDispositionConfig, optional): Same as in `run` command. loader_file_format (TLoaderFileFormat, optional): The file format the loader will use to create @@ -320,7 +322,7 @@ def add_run( *, decompose: Literal["none", "serialize", "parallel", "parallel-isolated"] = "none", table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, loader_file_format: TLoaderFileFormat = None, schema_contract: TSchemaContract = None, **kwargs: Any, @@ -358,7 +360,7 @@ def add_run( Parallel tasks are executed in different pipelines, all derived from the original one, but with the state isolated from each other. table_name: (str): The name of the table to which the data should be loaded within the `dataset` - write_disposition (TWriteDisposition, optional): Same as in `run` command. Defaults to None. + write_disposition (TWriteDispositionConfig, optional): Same as in `run` command. Defaults to None. loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional): The file format the loader will use to create the load package. Not all file_formats are compatible with all destinations. Defaults to the preferred file format of the selected destination. 
schema_contract (TSchemaContract, optional): On override for the schema contract settings, diff --git a/dlt/helpers/dbt/dbt_utils.py b/dlt/helpers/dbt/dbt_utils.py index b4097e4434..bf14504eaa 100644 --- a/dlt/helpers/dbt/dbt_utils.py +++ b/dlt/helpers/dbt/dbt_utils.py @@ -3,7 +3,8 @@ from typing import Any, Sequence, Optional, Union import warnings -from dlt.common import json, logger +from dlt.common import logger +from dlt.common.json import json from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import StrAny diff --git a/dlt/helpers/streamlit_app/__init__.py b/dlt/helpers/streamlit_app/__init__.py index b304195a5a..bfeb099ef2 100644 --- a/dlt/helpers/streamlit_app/__init__.py +++ b/dlt/helpers/streamlit_app/__init__.py @@ -5,7 +5,7 @@ import streamlit except ModuleNotFoundError: raise MissingDependencyException( - "DLT Streamlit Helpers", + "dlt Streamlit Helpers", ["streamlit"], - "DLT Helpers for Streamlit should be run within a streamlit app.", + "dlt Helpers for Streamlit should be run within a streamlit app.", ) diff --git a/dlt/helpers/streamlit_app/blocks/load_info.py b/dlt/helpers/streamlit_app/blocks/load_info.py index 134b5ad5a4..9482cb5afa 100644 --- a/dlt/helpers/streamlit_app/blocks/load_info.py +++ b/dlt/helpers/streamlit_app/blocks/load_info.py @@ -2,7 +2,7 @@ import humanize import streamlit as st -from dlt.common import pendulum +from dlt.common.pendulum import pendulum from dlt.helpers.streamlit_app.utils import query_data_live from dlt.helpers.streamlit_app.widgets import stat diff --git a/dlt/helpers/streamlit_app/blocks/query.py b/dlt/helpers/streamlit_app/blocks/query.py index a03e9a0cd9..e0cb0100a4 100644 --- a/dlt/helpers/streamlit_app/blocks/query.py +++ b/dlt/helpers/streamlit_app/blocks/query.py @@ -35,9 +35,9 @@ def maybe_run_query( import altair as alt except ModuleNotFoundError: raise MissingDependencyException( - "DLT Streamlit Helpers", + "dlt Streamlit Helpers", ["altair"], - "DLT Helpers for Streamlit should be run within a streamlit" + "dlt Helpers for Streamlit should be run within a streamlit" " app.", ) diff --git a/dlt/helpers/streamlit_app/blocks/resource_state.py b/dlt/helpers/streamlit_app/blocks/resource_state.py index 86b8effc98..dabbea4d46 100644 --- a/dlt/helpers/streamlit_app/blocks/resource_state.py +++ b/dlt/helpers/streamlit_app/blocks/resource_state.py @@ -1,10 +1,10 @@ from typing import Union - -import dlt -import pendulum import streamlit as st import yaml +import dlt +from dlt.common.pendulum import pendulum + def date_to_iso( dumper: yaml.SafeDumper, data: Union[pendulum.Date, pendulum.DateTime] diff --git a/dlt/load/load.py b/dlt/load/load.py index b1f786274e..c5790d467b 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -5,7 +5,8 @@ from concurrent.futures import Executor import os -from dlt.common import sleep, logger +from dlt.common import logger +from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.accessors import config diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 1e4e55effd..742125850d 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -1,7 +1,8 @@ from typing import List, Dict, Set, Any from abc import abstractmethod -from dlt.common import json, logger +from dlt.common import logger +from dlt.common.json import json from dlt.common.data_writers import 
DataWriterMetrics from dlt.common.data_writers.writers import ArrowToObjectAdapter from dlt.common.json import custom_pua_decode, may_have_pua @@ -326,8 +327,12 @@ def _write_with_dlt_columns( return [schema_update] - def _fix_schema_precisions(self, root_table_name: str) -> List[TSchemaUpdate]: - """Reduce precision of timestamp columns if needed, according to destination caps""" + def _fix_schema_precisions( + self, root_table_name: str, arrow_schema: Any + ) -> List[TSchemaUpdate]: + """Update precision of timestamp columns to the precision of parquet being normalized. + Reduce the precision if it is out of range of destination timestamp precision. + """ schema = self.schema table = schema.tables[root_table_name] max_precision = self.config.destination_capabilities.timestamp_precision @@ -335,9 +340,15 @@ def _fix_schema_precisions(self, root_table_name: str) -> List[TSchemaUpdate]: new_cols: TTableSchemaColumns = {} for key, column in table["columns"].items(): if column.get("data_type") in ("timestamp", "time"): - if (prec := column.get("precision")) and prec > max_precision: - new_cols[key] = dict(column, precision=max_precision) # type: ignore[assignment] - + if prec := column.get("precision"): + # apply the arrow schema precision to dlt column schema + data_type = pyarrow.get_column_type_from_py_arrow(arrow_schema.field(key).type) + if data_type["data_type"] in ("timestamp", "time"): + prec = data_type["precision"] + # limit with destination precision + if prec > max_precision: + prec = max_precision + new_cols[key] = dict(column, precision=prec) # type: ignore[assignment] if not new_cols: return [] return [ @@ -345,8 +356,6 @@ def _fix_schema_precisions(self, root_table_name: str) -> List[TSchemaUpdate]: ] def __call__(self, extracted_items_file: str, root_table_name: str) -> List[TSchemaUpdate]: - base_schema_update = self._fix_schema_precisions(root_table_name) - # read schema and counts from file metadata from dlt.common.libs.pyarrow import get_parquet_metadata @@ -355,6 +364,9 @@ def __call__(self, extracted_items_file: str, root_table_name: str) -> List[TSch ) as f: num_rows, arrow_schema = get_parquet_metadata(f) file_metrics = DataWriterMetrics(extracted_items_file, num_rows, f.tell(), 0, 0) + # when parquet files is saved, timestamps will be truncated and coerced. 
take the updated values + # and apply them to dlt schema + base_schema_update = self._fix_schema_precisions(root_table_name, arrow_schema) add_dlt_id = self.config.parquet_normalizer.add_dlt_id add_dlt_load_id = self.config.parquet_normalizer.add_dlt_load_id diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index 0125d5a525..5e3315d10f 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -3,7 +3,8 @@ from typing import Callable, List, Dict, NamedTuple, Sequence, Tuple, Set, Optional from concurrent.futures import Future, Executor -from dlt.common import logger, sleep +from dlt.common import logger +from dlt.common.runtime.signals import sleep from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.configuration.container import Container @@ -377,7 +378,9 @@ def spool_schema_files(self, load_id: str, schema: Schema, files: Sequence[str]) # delete existing folder for the case that this is a retry self.load_storage.new_packages.delete_package(load_id, not_exists_ok=True) # normalized files will go here before being atomically renamed - self.load_storage.new_packages.create_package(load_id) + self.load_storage.import_extracted_package( + load_id, self.normalize_storage.extracted_packages + ) logger.info(f"Created new load package {load_id} on loading volume") try: # process parallel @@ -391,7 +394,9 @@ def spool_schema_files(self, load_id: str, schema: Schema, files: Sequence[str]) ) # start from scratch self.load_storage.new_packages.delete_package(load_id) - self.load_storage.new_packages.create_package(load_id) + self.load_storage.import_extracted_package( + load_id, self.normalize_storage.extracted_packages + ) self.spool_files(load_id, schema.clone(update_normalizers=True), self.map_single, files) return load_id diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 6b14eaf777..c9e7b5097c 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -2,7 +2,7 @@ from typing_extensions import TypeVar from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaContract +from dlt.common.schema.typing import TColumnSchema, TWriteDispositionConfig, TSchemaContract from dlt.common.typing import TSecretValue, Any from dlt.common.configuration import with_config @@ -201,7 +201,7 @@ def run( dataset_name: str = None, credentials: Any = None, table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, columns: Sequence[TColumnSchema] = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, @@ -243,7 +243,9 @@ def run( * `@dlt.resource`: resource contains the full table schema and that includes the table name. `table_name` will override this property. Use with care! * `@dlt.source`: source contains several resources each with a table schema. `table_name` will override all table names within the source and load the data into single table. - write_disposition (Literal["skip", "append", "replace", "merge"], optional): Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". + write_disposition (TWriteDispositionConfig, optional): Controls how to write data to a table. 
Accepts a shorthand string literal or configuration dictionary. + Allowed shorthand string literals: `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". + Write behaviour can be further customized through a configuration dictionary. For example, to obtain an SCD2 table provide `write_disposition={"disposition": "merge", "strategy": "scd2"}`. Please note that in case of `dlt.resource` the table schema value will be overwritten and in case of `dlt.source`, the values in all resources will be overwritten. columns (Sequence[TColumnSchema], optional): A list of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema. diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 00e45b96e7..bdade1308f 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -19,7 +19,9 @@ ) from dlt import version -from dlt.common import json, logger, pendulum +from dlt.common import logger +from dlt.common.json import json +from dlt.common.pendulum import pendulum from dlt.common.configuration import inject_section, known_sections from dlt.common.configuration.specs import RunConfiguration, CredentialsConfiguration from dlt.common.configuration.container import Container @@ -42,7 +44,7 @@ from dlt.common.schema.typing import ( TColumnNames, TSchemaTables, - TWriteDisposition, + TWriteDispositionConfig, TAnySchemaColumns, TSchemaContract, ) @@ -97,6 +99,7 @@ from dlt.common.schema import Schema from dlt.common.utils import is_interactive from dlt.common.warnings import deprecated, Dlt04DeprecationWarning +from dlt.common.versioned_state import json_encode_state, json_decode_state from dlt.extract import DltSource from dlt.extract.exceptions import SourceExhausted @@ -136,8 +139,6 @@ mark_state_extracted, migrate_pipeline_state, state_resource, - json_encode_state, - json_decode_state, default_pipeline_state, ) from dlt.pipeline.warnings import credentials_argument_deprecated @@ -393,7 +394,7 @@ def extract( *, table_name: str = None, parent_table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, schema: Schema = None, @@ -561,7 +562,7 @@ def run( dataset_name: str = None, credentials: Any = None, table_name: str = None, - write_disposition: TWriteDisposition = None, + write_disposition: TWriteDispositionConfig = None, columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, schema: Schema = None, @@ -605,7 +606,9 @@ def run( * `@dlt.resource`: resource contains the full table schema and that includes the table name. `table_name` will override this property. Use with care! * `@dlt.source`: source contains several resources each with a table schema. `table_name` will override all table names within the source and load the data into single table. - write_disposition (Literal["skip", "append", "replace", "merge"], optional): Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". 
+ write_disposition (TWriteDispositionConfig, optional): Controls how to write data to a table. Accepts a shorthand string literal or configuration dictionary. + Allowed shorthand string literals: `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. "merge" will deduplicate and merge data based on "primary_key" and "merge_key" hints. Defaults to "append". + Write behaviour can be further customized through a configuration dictionary. For example, to obtain an SCD2 table provide `write_disposition={"disposition": "merge", "strategy": "scd2"}`. Please note that in case of `dlt.resource` the table schema value will be overwritten and in case of `dlt.source`, the values in all resources will be overwritten. columns (Sequence[TColumnSchema], optional): A list of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema. diff --git a/dlt/pipeline/platform.py b/dlt/pipeline/platform.py index 0955e91b51..fe419d5146 100644 --- a/dlt/pipeline/platform.py +++ b/dlt/pipeline/platform.py @@ -1,15 +1,16 @@ """Implements SupportsTracking""" from typing import Any, cast, TypedDict, List import requests -from dlt.common.managed_thread_pool import ManagedThreadPool from urllib.parse import urljoin -from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, TPipelineStep, SupportsPipeline -from dlt.common import json from dlt.common import logger +from dlt.common.json import json from dlt.common.pipeline import LoadInfo +from dlt.common.managed_thread_pool import ManagedThreadPool from dlt.common.schema.typing import TStoredSchema +from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, TPipelineStep, SupportsPipeline + _THREAD_POOL: ManagedThreadPool = ManagedThreadPool(1) TRACE_URL_SUFFIX = "/trace" STATE_URL_SUFFIX = "/state" diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index 5366b9c46d..d38010f842 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -1,18 +1,17 @@ -import binascii from copy import copy -from typing import Tuple, cast, List -import pendulum +from typing import Tuple, cast import dlt -from dlt.common import json +from dlt.common.pendulum import pendulum from dlt.common.typing import DictStrAny from dlt.common.schema.typing import STATE_TABLE_NAME, TTableSchemaColumns from dlt.common.destination.reference import WithStateSync, Destination -from dlt.common.utils import compressed_b64decode, compressed_b64encode from dlt.common.versioned_state import ( generate_state_version_hash, bump_state_version_if_modified, default_versioned_state, + compress_state, + decompress_state, ) from dlt.common.pipeline import TPipelineState @@ -39,27 +38,6 @@ } -def json_encode_state(state: TPipelineState) -> str: - return json.typed_dumps(state) - - -def json_decode_state(state_str: str) -> DictStrAny: - return json.typed_loads(state_str) # type: ignore[no-any-return] - - -def compress_state(state: TPipelineState) -> str: - return compressed_b64encode(json.typed_dumpb(state)) - - -def decompress_state(state_str: str) -> DictStrAny: - try: - state_bytes = compressed_b64decode(state_str) - except binascii.Error: - return json.typed_loads(state_str) # type: ignore[no-any-return] - else: - return json.typed_loadb(state_bytes) # type: ignore[no-any-return] - - def generate_pipeline_state_version_hash(state: TPipelineState) -> str: return 
generate_state_version_hash(state, exclude_attrs=["_local"]) diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index b610d1751f..fc15654949 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -7,7 +7,7 @@ from typing import Any, List, NamedTuple, Optional, Protocol, Sequence import humanize -from dlt.common import pendulum, json +from dlt.common.pendulum import pendulum from dlt.common.configuration import is_secret_hint from dlt.common.configuration.exceptions import ContextDefaultCannotBeCreated from dlt.common.configuration.specs.config_section_context import ConfigSectionContext diff --git a/dlt/pipeline/track.py b/dlt/pipeline/track.py index 990c59050e..e6f8db36d6 100644 --- a/dlt/pipeline/track.py +++ b/dlt/pipeline/track.py @@ -3,7 +3,8 @@ from typing import Any import humanize -from dlt.common import pendulum, logger +from dlt.common import logger +from dlt.common.pendulum import pendulum from dlt.common.utils import digest128 from dlt.common.runtime.exec_info import github_info from dlt.common.runtime.segment import track as dlthub_telemetry_track diff --git a/dlt/sources/helpers/rest_client/auth.py b/dlt/sources/helpers/rest_client/auth.py index 99421e2c60..620135d410 100644 --- a/dlt/sources/helpers/rest_client/auth.py +++ b/dlt/sources/helpers/rest_client/auth.py @@ -12,19 +12,19 @@ Iterable, TYPE_CHECKING, ) -from dlt.sources.helpers import requests from requests.auth import AuthBase from requests import PreparedRequest # noqa: I251 -import pendulum - -from dlt.common.exceptions import MissingDependencyException from dlt.common import logger +from dlt.common.exceptions import MissingDependencyException from dlt.common.configuration.specs.base_configuration import configspec from dlt.common.configuration.specs import CredentialsConfiguration from dlt.common.configuration.specs.exceptions import NativeValueError +from dlt.common.pendulum import pendulum from dlt.common.typing import TSecretStrValue +from dlt.sources.helpers import requests + if TYPE_CHECKING: from cryptography.hazmat.primitives.asymmetric.types import PrivateKeyTypes else: diff --git a/dlt/sources/helpers/rest_client/client.py b/dlt/sources/helpers/rest_client/client.py index 1c3e4d97b6..03b785c9f0 100644 --- a/dlt/sources/helpers/rest_client/client.py +++ b/dlt/sources/helpers/rest_client/client.py @@ -10,12 +10,12 @@ ) import copy from urllib.parse import urlparse - from requests import Session as BaseSession # noqa: I251 from requests import Response, Request -from dlt.common import logger from dlt.common import jsonpath +from dlt.common import logger + from dlt.sources.helpers.requests.retry import Client from .typing import HTTPMethodBasic, HTTPMethod, Hooks diff --git a/dlt/sources/helpers/transform.py b/dlt/sources/helpers/transform.py index d038ec58ee..32843e2aa2 100644 --- a/dlt/sources/helpers/transform.py +++ b/dlt/sources/helpers/transform.py @@ -110,3 +110,37 @@ def _transformer(item: TDataItem) -> TDataItem: return item return _transformer + + +def add_row_hash_to_table(row_hash_column_name: str) -> TDataItem: + """Computes content hash for each row of panda frame, arrow table or batch and adds it as `row_hash_column_name` column. + + Internally arrow tables and batches are converted to pandas DataFrame and then `hash_pandas_object` is used to + generate a series with row hashes. Hashes are converted to signed int64 and added to original table. Data may be modified. 
+ For SCD2 use with a resource configuration that assigns custom row version column to `row_hash_column_name` + """ + from dlt.common.libs import pyarrow + from dlt.common.libs.pyarrow import pyarrow as pa + from dlt.common.libs.pandas import pandas as pd + + def _unwrap(table: TDataItem) -> TDataItem: + if is_arrow := pyarrow.is_arrow_item(table): + df = table.to_pandas(deduplicate_objects=False) + else: + df = table + + hash_ = pd.util.hash_pandas_object(df) + + if is_arrow: + table = pyarrow.append_column( + table, + row_hash_column_name, + pa.Array.from_pandas(hash_, type=pa.int64(), safe=False), + ) + else: + hash_np = hash_.values.astype("int64", copy=False, casting="unsafe") + table[row_hash_column_name] = hash_np + + return table + + return _unwrap diff --git a/dlt/version.py b/dlt/version.py index f8ca3cb873..aa87021bf7 100644 --- a/dlt/version.py +++ b/dlt/version.py @@ -14,7 +14,7 @@ def get_installed_requirement_string(package: str = DLT_PKG_NAME) -> str: # PEP 610 https://packaging.python.org/en/latest/specifications/direct-url/#specification direct_url = dist.read_text("direct_url.json") if direct_url is not None: - from dlt.common import json + from dlt.common.json import json # `url` contain the location of the distribution url = urlparse(json.loads(direct_url)["url"]) diff --git a/docs/examples/nested_data/nested_data.py b/docs/examples/nested_data/nested_data.py index afda16a51a..046e566efd 100644 --- a/docs/examples/nested_data/nested_data.py +++ b/docs/examples/nested_data/nested_data.py @@ -24,7 +24,7 @@ from bson.decimal128 import Decimal128 from bson.objectid import ObjectId -from pendulum import _datetime +from pendulum import _datetime # noqa: I251 from pymongo import MongoClient import dlt diff --git a/docs/tools/lint_setup/template.py b/docs/tools/lint_setup/template.py index c72c4dba62..bebc0e9ab0 100644 --- a/docs/tools/lint_setup/template.py +++ b/docs/tools/lint_setup/template.py @@ -7,12 +7,11 @@ import os -import pendulum from datetime import datetime # noqa: I251 -from pendulum import DateTime +from pendulum import DateTime # noqa: I251 import dlt -from dlt.common import json +from dlt.common import json, pendulum from dlt.common.typing import TimedeltaSeconds, TAnyDateTime, TDataItem, TDataItems from dlt.common.schema.typing import TTableSchema, TTableSchemaColumns diff --git a/docs/website/blog/2024-03-11-moving-away-from-segment.md b/docs/website/blog/2024-03-11-moving-away-from-segment.md index 4f4b7d0a80..e3e44ce027 100644 --- a/docs/website/blog/2024-03-11-moving-away-from-segment.md +++ b/docs/website/blog/2024-03-11-moving-away-from-segment.md @@ -10,7 +10,7 @@ authors: tags: [Pub/Sub, dlt, Segment, Streaming] --- :::info -TL;DR: This blog post introduces a cost-effective solution for event streaming that results in up to 18x savings. The solution leverages Cloud Pub/Sub and DLT to build an efficient event streaming pipeline. +TL;DR: This blog post introduces a cost-effective solution for event streaming that results in up to 18x savings. The solution leverages Cloud Pub/Sub and dlt to build an efficient event streaming pipeline. ::: ## The Segment Problem @@ -18,19 +18,19 @@ Event tracking is a complicated problem for which there exist many solutions. On :::note -💡 With Segment, you pay 1-1.2 cents for every tracked users. +💡 With Segment, you pay 1-1.2 cents for every tracked users. 
Let’s take a back-of-napkin example: for 100.000 users, ingesting their events data would cost **$1000.** **The bill:** -* **Minimum 10,000 monthly tracked users (0-10K)** + $120. +* **Minimum 10,000 monthly tracked users (0-10K)** + $120. * **Additional 1,000 monthly tracked users (10K - 25K)** + $12 / 1000 user. * **Additional 1,000 monthly tracked users (25k - 100K)** + $11 / 1000 user. * **Additional 1,000 monthly tracked users (100k +)** + $10 / 1000 user. ::: -The price of **$1000/month** for 100k tracked users doesn’t seem excessive, given the complexity of the task at hand. +The price of **$1000/month** for 100k tracked users doesn’t seem excessive, given the complexity of the task at hand. However, similar results can be achieved on GCP by combining different services. If those 100k users produce 1-2m events, **those costs would stay in the $10-60 range.** @@ -45,18 +45,18 @@ Our proposed solution to replace Segment involves using dlt with Cloud Pub/Sub t In this architecture, a publisher initiates the process by pushing events to a Pub/Sub topic. Specifically, in the context of dlt, the library acts as the publisher, directing user telemetry data to a designated topic within Pub/Sub. -A subscriber is attached to the topic. Pub/Sub offers a push-based [subscriber](https://cloud.google.com/pubsub/docs/subscription-overview) that proactively receives messages from the topic and writes them to Cloud Storage. The subscriber is configured to aggregate all messages received within a 10-minute window and then forward them to a designated storage bucket. +A subscriber is attached to the topic. Pub/Sub offers a push-based [subscriber](https://cloud.google.com/pubsub/docs/subscription-overview) that proactively receives messages from the topic and writes them to Cloud Storage. The subscriber is configured to aggregate all messages received within a 10-minute window and then forward them to a designated storage bucket. Once the data is written to the Cloud Storage this triggers a Cloud Function. The Cloud Function reads the data from the storage bucket and uses dlt to ingest the data into BigQuery. ## Code Walkthrough -This section dives into a comprehensive code walkthrough that illustrates the step-by-step process of implementing our proposed event streaming pipeline. +This section dives into a comprehensive code walkthrough that illustrates the step-by-step process of implementing our proposed event streaming pipeline. Implementing the pipeline requires the setup of various resources, including storage buckets and serverless functions. To streamline the procurement of these resources, we'll leverage Terraform—an Infrastructure as Code (IaC) tool. ### Prerequisites -Before we embark on setting up the pipeline, there are essential tools that need to be installed to ensure a smooth implementation process. +Before we embark on setting up the pipeline, there are essential tools that need to be installed to ensure a smooth implementation process. - **Firstly**, follow the official guide to install [Terraform](https://developer.hashicorp.com/terraform/tutorials/aws-get-started/install-cli), a tool for automating the deployment of cloud infrastructure. - **Secondly**, install the [Google Cloud Pub/Sub API client library](https://cloud.google.com/sdk/docs/install) which is required for publishing events to Cloud Pub/Sub. @@ -93,7 +93,7 @@ To set up our pipeline, start by cloning the [GitHub Repository](https://github. 
│ └── variables.tf ``` -Within this structure, the **Terraform** directory houses all the Terraform code required to set up the necessary resources on Google Cloud. +Within this structure, the **Terraform** directory houses all the Terraform code required to set up the necessary resources on Google Cloud. Meanwhile, the **cloud_functions** folder includes the code for the Cloud Function that will be deployed. This function will read the data from storage and use dlt to ingest data into BigQuery. The code for the function can be found in `cloud_functions/main.py` file. @@ -133,7 +133,7 @@ variable "service_account_email" { ### Step 3: Procure Cloud Resources -We are now ready to set up some cloud resources. To get started, navigate into the **terraform** directory and `terraform init`. The command initializes the working directory containing Terraform configuration files. +We are now ready to set up some cloud resources. To get started, navigate into the **terraform** directory and `terraform init`. The command initializes the working directory containing Terraform configuration files. With the initialization complete, you're ready to proceed with the creation of your cloud resources. To do this, run the following Terraform commands in sequence. These commands instruct Terraform to plan and apply the configurations defined in your `.tf` files, setting up the infrastructure on Google Cloud as specified. @@ -174,7 +174,7 @@ python publisher.py ### Step 5: Results -Once the publisher sends events to the Pub/Sub Topic, the pipeline is activated. These are asynchronous calls, so there's a delay between message publication and their appearance in BigQuery. +Once the publisher sends events to the Pub/Sub Topic, the pipeline is activated. These are asynchronous calls, so there's a delay between message publication and their appearance in BigQuery. The average completion time of the pipeline is approximately 12 minutes, accounting for the 10-minute time interval after which the subscriber pushes data to storage plus the Cloud Function execution time. The push interval of the subscriber can be adjusted by changing the **max_duration** in `pubsub.tf` @@ -197,7 +197,7 @@ On average the cost for our proposed pipeline are as follows: - Our web tracking user:event ratio is 1:15, so the Segment cost equivalent would be **$55**. - Our telemetry device:event ratio is 1:60, so the Segment cost equivalent would be **$220**. -So with our setup, as long as we keep events-to-user ratio **under 270**, we will have cost savings over Segment. In reality, it gets even better because GCP offers a very generous free tier that resets every month, where Segment costs more at low volumes. +So with our setup, as long as we keep events-to-user ratio **under 270**, we will have cost savings over Segment. In reality, it gets even better because GCP offers a very generous free tier that resets every month, where Segment costs more at low volumes. **GCP Cost Calculation:** Currently, our telemetry tracks 50,000 anonymized devices each month on a 1:60 device-to-event ratio. Based on these data volumes we can estimate the cost of our proposed pipeline. 
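Note on the Cloud Function described in the post above: the text only states that it reads the batched file from the storage bucket and ingests it into BigQuery with dlt, and the actual implementation lives in `cloud_functions/main.py` in the linked repository. The sketch below is a minimal illustration of that step, not the repository code; it assumes a 1st-gen storage-triggered function, newline-delimited JSON batches, and illustrative pipeline, dataset and table names.

```python
# Minimal sketch of a GCS-triggered Cloud Function that loads a freshly
# written batch file into BigQuery with dlt (illustrative, not the repo code).
import json

import dlt
from google.cloud import storage  # google-cloud-storage client


def load_to_bigquery(event, context):  # 1st-gen background function signature
    client = storage.Client()
    blob = client.bucket(event["bucket"]).blob(event["name"])
    # assumes the push subscriber wrote the aggregated messages as newline-delimited JSON
    rows = [json.loads(line) for line in blob.download_as_text().splitlines() if line]

    pipeline = dlt.pipeline(
        pipeline_name="pubsub_events",
        destination="bigquery",
        dataset_name="telemetry",
    )
    pipeline.run(rows, table_name="events", write_disposition="append")
```

Because the subscriber batches messages for up to 10 minutes, each invocation typically processes a single aggregated file, which keeps the number of BigQuery load jobs (and costs) low.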
diff --git a/docs/website/blog/2024-03-25-reverse_etl_dlt.md b/docs/website/blog/2024-03-25-reverse_etl_dlt.md index 19785c6e66..2dc32cc914 100644 --- a/docs/website/blog/2024-03-25-reverse_etl_dlt.md +++ b/docs/website/blog/2024-03-25-reverse_etl_dlt.md @@ -62,7 +62,7 @@ Building a destination: [docs](https://dlthub.com/devel/dlt-ecosystem/destinatio SQL source: [docs](https://dlthub.com/devel/dlt-ecosystem/verified-sources/sql_database) In this example, you will see why it’s faster to build a custom destination than set up a separate tool. -DLT allows you to define custom destination functions. You'll write a function that extracts the relevant data from your dataframe and formats it for the Notion API. +dlt allows you to define custom destination functions. You'll write a function that extracts the relevant data from your dataframe and formats it for the Notion API. This example assumes you have set up Google Sheets API access and obtained the necessary credentials to authenticate. diff --git a/docs/website/blog/2024-03-26-second-data-setup b/docs/website/blog/2024-03-26-second-data-setup new file mode 100644 index 0000000000..12b032eef2 --- /dev/null +++ b/docs/website/blog/2024-03-26-second-data-setup @@ -0,0 +1,123 @@ +--- +slug: second-data-setup +title: The Second Data Warehouse, aka the "disaster recovery" project +image: https://storage.googleapis.com/dlt-blog-images/second_house.png +authors: + name: Adrian Brudaru + title: Open source Data Engineer + url: https://github.com/adrianbr + image_url: https://avatars.githubusercontent.com/u/5762770?v=4 +tags: [data setup, disaster recovery] +--- + +# The things i've seen + +The last 5 years before working on dlt, I spent as a data engineering freelancer. +Before freelancing, I was working for "sexy but poor" startups where building fast and cheap was a religion. + +In this time, I had the pleasure of doing many first time setups, and a few "rebuilds" or "second time setups". + +In fact, my first freelancing project was a "disaster recovery" one. + +A "second time build" or "disaster recovery project" refers to the process of re-designing, re-building, or significantly +overhauling a data warehouse or data infrastructure after the initial setup has failed to meet the organization's needs. + +![dipping your toes in disaster](https://storage.googleapis.com/dlt-blog-images/disaster-2.png) + +## The first time builds gone wrong + +There's usually no need for a second time build, if the first time build works. Rather, a migration might cut it. +A second time build usually happens only if +- the first time build does not work, either now or for the next requirements. +- the first time build cannot be "migrated" or "fixed" due to fundamental flaws. + +Let's take some examples from my experiences. +Example 1: A serial talker takes a lead role at a large, growing startup. They speak like management, so management trusts. A few years later + - half the pipelines are running on Pentaho + windows, the other are python 2, 3 and written by agencies. + - The data engineering team quit. They had enough. + - The remaining data engineers do what they want - a custom framework - or they threaten to quit, taking the only knowledge of the pipelines with them. + - Solution: Re-write all pipelines in python3, replace custom framework with airflow, add tests, github, and other best pratices. + +Example 2: A large international manufacturing company needed a data warehouse. + - Microsoft sold them their tech+ consultants. 
+ - 2 years later, it's done but doesn't work (query time impossible) + - Solution: Teach the home DE team to use redshift and migrate. + +Example 3: A non technical professional takes a lead data role and uses a tool to do everything. + - same as above but the person also hired a team of juniors + - since there was no sudden ragequit, the situation persisted for a few years + - after they left, the remaining team removed the tool and re-built. + +Example 4: A first time data hire introduces a platform-like tool that's sql centric and has no versioning, api, or programmatic control. + - after writing 30k+ lines of wet sql, scheduling and making them dependent on each other in this UI tool (without lineage), the person can no longer maintain the reports + - Quits after arguing with management. + - Solution: Reverse engineer existing reports, account for bugs and unfulfilled requirements, build them from scratch, occasionally searching the mass of sql. Outcome was under 2k lines. + +Example 5: A VC company wants to make a tool that reads metrics from business apps like google ads, Stripe. + - They end up at the largest local agency, who recommends them a single - tenant SaaS MDS for 90k to set up and a pathway from there + - They agreed and then asked me to review. The agency person was aggressive and queried my knowledge on unrelated things, in an attempt to dismiss my assessment. + - Turns out the agency was selling "installing 5tran and cleaning the data" for 5k+ per source, and some implementation partners time. + - I think the VC later hired a non technical freelancer to do the work. + +# Who can build a first time setup that scales into the future? + +The non-negotiable skills needed are +- Programming. You can use ETL tools for ingestion, but they rarely solve the problem fully (under 20% in my respondent network - these are generally <30 people companies) +- Modelling. Architecture first, sql second, tools third. +- Requirement collection. You should consult your stakeholders on the data available to represent their process, and reach a good result. Usually the stakeholders are not experts and will not be able to give good requirements. + +## Who's to blame and what can we do about it? + +I believe the blame is quite shared. The common denominators seem to be +- A lack of technical knowledge, +- tools to fill the gap. +- and a warped or dishonest self representation (by vendor or professional) + +As for what to do about it: +If you were a hiring manager, ensure that your first data hire has all the skills at their disposal, and make sure they don't just talk the talk but walk the walk. Ask for references or test them. + +But you aren't a hiring manager - those folks don't read data blogs. + +So here's what you can do +- Ensure all 3 skills are available - they do not need to all be in one person. You could hire a freelance DE to build first, and a technical analyst to fulfil requests and extend the stack. +- Let vendors write about first data hire, and "follow the money" - Check if the advice aligns with their financial incentive. If it does, get a second opinion. +- Choose tooling that scales across different stages of a data stack lifecycle, so the problem doesn't occur. +- Use vendor agnostic components where possible (for example, dlt + sqlmesh + sql glot can create a db-agnostic stack that enables you to switch between dbs) +- Behave better - the temptation to oversell yourself is there, but you could check yourself and look for a position where you can learn. 
Your professional network could be your biggest help in your career, don't screw them over. +- Use independent freelancers for consulting. They live off reputation, so look for the recommended ones. + +## How to do a disaster recovery? + +The problem usually originates from the lack of a skill, which downstreams into implementations that don't scale. +However, the solution is often not as simple as adding the skill, because various workarounds were created to bridge that gap, and those workarounds have people working on them. + +Simply adding that missing skill to the team to build the missing piece would create a redundancy, which in its resolution would kick out the existing workarounds. +But workarounds are maintained by roles, so the original implementer will usually feel their position threatened; +This can easily escalate to a people conflict which often leads with the workaround maker quitting (or getting fired). + +How to manage the emotions? +- Be considerate of people's feelings - you are brought in to replace their work, so make it a cooperative experience where they can be the hero. +- Ask for help when you are not sure about who has the decision over an area. + +How to manage the technical side? +- Ensure you have all the skills needed to deliver a data stack on the team. +- If the existing solution produces correct results, use it as requirements for the next - for example, you could write tests that check that business rules are correctly implemented. +- Clarify with stakeholders how much the old solution should be maintained - it will likely free up people to work on the new one. +- Identify team skills that can help towards the new solution and consider them when choosing the technology stack. + + +## What I wish I knew + +Each "disaster recovery" project was more than just a technical reboot; it was a testament to the team's adaptability, +the foresight in planning for scalability, and, importantly, the humility to recognize and rectify mistakes. +"What I Wish I Knew Then" is about the understanding that building a data infrastructure is as much about +building a culture of continuous learning and improvement as it is about the code and systems themselves. + + +### Want to discuss? + +Agencies and freelancers are often the heavy-lifters that are brought in to do such setups. +Is this something you are currently doing? +Tell us about your challenges so we may better support you. + +[Join our slack community](https://dlthub.com/community) to take part in the conversation. \ No newline at end of file diff --git a/docs/website/docs/dlt-ecosystem/destinations/athena.md b/docs/website/docs/dlt-ecosystem/destinations/athena.md index 7f10519c20..96edfb3d70 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/athena.md +++ b/docs/website/docs/dlt-ecosystem/destinations/athena.md @@ -9,7 +9,7 @@ keywords: [aws, athena, glue catalog] The Athena destination stores data as Parquet files in S3 buckets and creates [external tables in AWS Athena](https://docs.aws.amazon.com/athena/latest/ug/creating-tables.html). You can then query those tables with Athena SQL commands, which will scan the entire folder of Parquet files and return the results. This destination works very similarly to other SQL-based destinations, with the exception that the merge write disposition is not supported at this time. The `dlt` metadata will be stored in the same bucket as the Parquet files, but as iceberg tables. 
Athena also supports writing individual data tables as Iceberg tables, so they may be manipulated later. A common use case would be to strip GDPR data from them. ## Install dlt with Athena -**To install the DLT library with Athena dependencies:** +**To install the dlt library with Athena dependencies:** ```sh pip install dlt[athena] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 1e80146a7a..54d5abae6d 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -8,7 +8,7 @@ keywords: [bigquery, destination, data warehouse] ## Install dlt with BigQuery -**To install the DLT library with BigQuery dependencies:** +**To install the dlt library with BigQuery dependencies:** ```sh pip install dlt[bigquery] diff --git a/docs/website/docs/dlt-ecosystem/destinations/databricks.md b/docs/website/docs/dlt-ecosystem/destinations/databricks.md index c852cbcc7c..b601809935 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/databricks.md +++ b/docs/website/docs/dlt-ecosystem/destinations/databricks.md @@ -10,7 +10,7 @@ keywords: [Databricks, destination, data warehouse] *Big thanks to Evan Phillips and [swishbi.com](https://swishbi.com/) for contributing code, time, and a test environment.* ## Install dlt with Databricks -**To install the DLT library with Databricks dependencies:** +**To install the dlt library with Databricks dependencies:** ```sh pip install dlt[databricks] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/dremio.md b/docs/website/docs/dlt-ecosystem/destinations/dremio.md index deb5947a06..0be01e8e32 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/dremio.md +++ b/docs/website/docs/dlt-ecosystem/destinations/dremio.md @@ -7,7 +7,7 @@ keywords: [dremio, iceberg, aws, glue catalog] # Dremio ## Install dlt with Dremio -**To install the DLT library with Dremio and s3 dependencies:** +**To install the dlt library with Dremio and s3 dependencies:** ```sh pip install dlt[dremio,s3] ``` @@ -86,7 +86,7 @@ Data loading happens by copying a staged parquet files from an object storage bu Dremio does not support `CREATE SCHEMA` DDL statements. -Therefore, "Metastore" data sources, such as Hive or Glue, require that the dataset schema exists prior to running the DLT pipeline. `full_refresh=True` is unsupported for these data sources. +Therefore, "Metastore" data sources, such as Hive or Glue, require that the dataset schema exists prior to running the dlt pipeline. `full_refresh=True` is unsupported for these data sources. "Object Storage" data sources do not have this limitation. 
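Both the Athena and Dremio destinations documented above load data from parquet files staged on object storage rather than inserting rows directly. A minimal sketch of such a pipeline follows, assuming the filesystem staging `bucket_url` and the destination credentials are already configured in `.dlt/secrets.toml`; the resource, dataset and table names are illustrative.

```python
# Minimal sketch: stage parquet files on the configured bucket, then load
# them into Athena (a Dremio setup looks analogous). Assumes credentials and
# bucket_url are already set in .dlt/secrets.toml / config.toml.
import dlt


@dlt.resource(table_name="events", write_disposition="append")
def events():
    yield [{"id": 1, "value": "a"}, {"id": 2, "value": "b"}]


pipeline = dlt.pipeline(
    pipeline_name="athena_demo",
    destination="athena",
    staging="filesystem",  # parquet files land on the staging bucket first
    dataset_name="demo",
)
print(pipeline.run(events()))
```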
diff --git a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md index 79e26554f6..e4f8732507 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/duckdb.md +++ b/docs/website/docs/dlt-ecosystem/destinations/duckdb.md @@ -7,7 +7,7 @@ keywords: [duckdb, destination, data warehouse] # DuckDB ## Install dlt with DuckDB -**To install the DLT library with DuckDB dependencies, run:** +**To install the dlt library with DuckDB dependencies, run:** ```sh pip install dlt[duckdb] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md index 4f7a924be1..a8b2b084b9 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/filesystem.md +++ b/docs/website/docs/dlt-ecosystem/destinations/filesystem.md @@ -6,7 +6,7 @@ Its primary role is to be used as a staging for other destinations, but you can > 💡 Please read the notes on the layout of the data files. Currently, we are getting feedback on it. Please join our Slack (icon at the top of the page) and help us find the optimal layout. ## Install dlt with filesystem -**To install the DLT library with filesystem dependencies:** +**To install the dlt library with filesystem dependencies:** ```sh pip install dlt[filesystem] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md index 1c80f7be9b..f6fcdfbc0c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/motherduck.md +++ b/docs/website/docs/dlt-ecosystem/destinations/motherduck.md @@ -8,7 +8,7 @@ keywords: [MotherDuck, duckdb, destination, data warehouse] > 🧪 MotherDuck is still invitation-only and is being intensively tested. Please see the limitations/problems at the end. 
## Install dlt with MotherDuck -**To install the DLT library with MotherDuck dependencies:** +**To install the dlt library with MotherDuck dependencies:** ```sh pip install dlt[motherduck] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/mssql.md b/docs/website/docs/dlt-ecosystem/destinations/mssql.md index c0bf2bcebf..a63044bd73 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/mssql.md +++ b/docs/website/docs/dlt-ecosystem/destinations/mssql.md @@ -7,7 +7,7 @@ keywords: [mssql, sqlserver, destination, data warehouse] # Microsoft SQL Server ## Install dlt with MS SQL -**To install the DLT library with MS SQL dependencies, use:** +**To install the dlt library with MS SQL dependencies, use:** ```sh pip install dlt[mssql] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/postgres.md b/docs/website/docs/dlt-ecosystem/destinations/postgres.md index b806ba78fe..95f45b6a1c 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/postgres.md +++ b/docs/website/docs/dlt-ecosystem/destinations/postgres.md @@ -7,7 +7,7 @@ keywords: [postgres, destination, data warehouse] # Postgres ## Install dlt with PostgreSQL -**To install the DLT library with PostgreSQL dependencies, run:** +**To install the dlt library with PostgreSQL dependencies, run:** ```sh pip install dlt[postgres] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md index 7711e7d877..1b560ad6fe 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/qdrant.md +++ b/docs/website/docs/dlt-ecosystem/destinations/qdrant.md @@ -95,7 +95,7 @@ It accepts the following arguments: - `data`: a dlt resource object or a Python data structure (e.g., a list of dictionaries). - `embed`: a name of the field or a list of names to generate embeddings for. -Returns: [DLT resource](../../general-usage/resource.md) object that you can pass to the `pipeline.run()`. +Returns: [dlt resource](../../general-usage/resource.md) object that you can pass to the `pipeline.run()`. 
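To make the adapter's contract concrete, here is a hedged usage sketch. It assumes the function described above is `qdrant_adapter` and that it can be imported from `dlt.destinations.adapters` (the import path may differ between `dlt` versions); the data and field names are made up for illustration:

```py
import dlt
from dlt.destinations.adapters import qdrant_adapter  # assumed import path

movies = [
    {"title": "Blade Runner", "year": 1982},
    {"title": "Ghost in the Shell", "year": 1995},
]

pipeline = dlt.pipeline(
    pipeline_name="movies_pipeline",  # hypothetical name
    destination="qdrant",
    dataset_name="movies_dataset",
)

# wrap the data so embeddings are generated for the "title" field,
# then pass the returned dlt resource to pipeline.run()
info = pipeline.run(qdrant_adapter(movies, embed="title"))
print(info)
```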
Example: diff --git a/docs/website/docs/dlt-ecosystem/destinations/redshift.md b/docs/website/docs/dlt-ecosystem/destinations/redshift.md index a6445b6a5c..349698d201 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/redshift.md +++ b/docs/website/docs/dlt-ecosystem/destinations/redshift.md @@ -7,7 +7,7 @@ keywords: [redshift, destination, data warehouse] # Amazon Redshift ## Install dlt with Redshift -**To install the DLT library with Redshift dependencies:** +**To install the dlt library with Redshift dependencies:** ```sh pip install dlt[redshift] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md index e3a78422c6..8ba6934313 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/snowflake.md +++ b/docs/website/docs/dlt-ecosystem/destinations/snowflake.md @@ -7,7 +7,7 @@ keywords: [Snowflake, destination, data warehouse] # Snowflake ## Install dlt with Snowflake -**To install the DLT library with Snowflake dependencies, run:** +**To install the dlt library with Snowflake dependencies, run:** ```sh pip install dlt[snowflake] ``` diff --git a/docs/website/docs/dlt-ecosystem/destinations/synapse.md b/docs/website/docs/dlt-ecosystem/destinations/synapse.md index ff46efb272..d1c7d36aa2 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/synapse.md +++ b/docs/website/docs/dlt-ecosystem/destinations/synapse.md @@ -7,7 +7,7 @@ keywords: [synapse, destination, data warehouse] # Synapse ## Install dlt with Synapse -**To install the DLT library with Synapse dependencies:** +**To install the dlt library with Synapse dependencies:** ```sh pip install dlt[synapse] ``` diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md index 3be72adfa0..7b957e98ea 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/google_sheets.md @@ -604,6 +604,4 @@ def get_named_ranges(): tasks.add_run(pipeline, google_spreadsheet("1HhWHjqouQnnCIZAFa2rL6vT91YRN8aIhts22SUUR580"), decompose="none", trigger_rule="all_done", retries=0, provide_context=True) ``` -Enjoy the DLT Google Sheets pipeline experience! - diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md index caf5ae2359..a9d70c338c 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/strapi.md @@ -36,7 +36,7 @@ Sources and resources that can be loaded using this verified source are: 1. Fill in Name, Description, and Duration. 1. Choose a token type: Read Only, Full Access, or custom (with find and findOne selected). 1. Save to view your API token. -1. Copy it for DLT secrets setup. +1. Copy it for dlt secrets setup. > Note: The Strapi UI, which is described here, might change. > The full guide is available at [this link.](https://docs.strapi.io/user-docs/settings/API-tokens) diff --git a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md index 3163062ced..8808d1f1a5 100644 --- a/docs/website/docs/general-usage/customising-pipelines/removing_columns.md +++ b/docs/website/docs/general-usage/customising-pipelines/removing_columns.md @@ -78,7 +78,7 @@ Let's create a sample pipeline demonstrating the process of removing a column. 1. 
At last, create a pipeline: ```py - # Integrating with a DLT pipeline + # Integrating with a dlt pipeline pipeline = dlt.pipeline( pipeline_name='example', destination='bigquery', diff --git a/docs/website/docs/general-usage/incremental-loading.md b/docs/website/docs/general-usage/incremental-loading.md index 23b2218b46..28d2f862b2 100644 --- a/docs/website/docs/general-usage/incremental-loading.md +++ b/docs/website/docs/general-usage/incremental-loading.md @@ -48,7 +48,13 @@ dataset with the merge write disposition. ## Merge incremental loading -The `merge` write disposition is used in two scenarios: +The `merge` write disposition can be used with two different strategies: +1) `delete-insert` (default strategy) +2) `scd2` + +### `delete-insert` strategy + +The default `delete-insert` strategy is used in two scenarios: 1. You want to keep only one instance of certain record i.e. you receive updates of the `user` state from an API and want to keep just one record per `user_id`. @@ -56,7 +62,7 @@ The `merge` write disposition is used in two scenarios: instance of a record for each batch even in case you load an old batch or load the current batch several times a day (i.e. to receive "live" updates). -The `merge` write disposition loads data to a `staging` dataset, deduplicates the staging data if a +The `delete-insert` strategy loads data to a `staging` dataset, deduplicates the staging data if a `primary_key` is provided, deletes the data from the destination using `merge_key` and `primary_key`, and then inserts the new records. All of this happens in a single atomic transaction for a parent and all child tables. @@ -126,7 +132,7 @@ def github_repo_events(last_created_at = dlt.sources.incremental("created_at", " yield from _get_rest_pages("events") ``` -### Delete records +#### Delete records The `hard_delete` column hint can be used to delete records from the destination dataset. The behavior of the delete mechanism depends on the data type of the column marked with the hint: 1) `bool` type: only `True` leads to a delete—`None` and `False` values are disregarded 2) other types: each `not None` value leads to a delete @@ -135,7 +141,7 @@ Each record in the destination table with the same `primary_key` or `merge_key` Deletes are propagated to any child table that might exist. For each record that gets deleted in the root table, all corresponding records in the child table(s) will also be deleted. Records in parent and child tables are linked through the `root key` that is explained in the next section. -#### Example: with primary key and boolean delete column +##### Example: with primary key and boolean delete column ```py @dlt.resource( primary_key="id", @@ -158,7 +164,7 @@ def resource(): ... ``` -#### Example: with merge key and non-boolean delete column +##### Example: with merge key and non-boolean delete column ```py @dlt.resource( merge_key="id", @@ -176,7 +182,7 @@ def resource(): ... ``` -#### Example: with primary key and "dedup_sort" hint +##### Example: with primary key and "dedup_sort" hint ```py @dlt.resource( primary_key="id", @@ -198,7 +204,7 @@ def resource(): ... ``` -### Forcing root key propagation +#### Forcing root key propagation Merge write disposition requires that the `_dlt_id` of top level table is propagated to child tables. 
This concept is similar to foreign key which references a parent table, and we call it a @@ -230,6 +236,136 @@ In example above we enforce the root key propagation with `fb_ads.root_key = Tru that correct data is propagated on initial `replace` load so the future `merge` load can be executed. You can achieve the same in the decorator `@dlt.source(root_key=True)`. +### `scd2` strategy +`dlt` can create [Slowly Changing Dimension Type 2](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row) (SCD2) destination tables for dimension tables that change in the source. The resource is expected to provide a full extract of the source table each run. A row hash is stored in `_dlt_id` and used as surrogate key to identify source records that have been inserted, updated, or deleted. A high timestamp (9999-12-31 00:00:00.000000) is used to indicate an active record. + +#### Example: `scd2` merge strategy +```py +@dlt.resource( + write_disposition={"disposition": "merge", "strategy": "scd2"} +) +def dim_customer(): + # initial load + yield [ + {"customer_key": 1, "c1": "foo", "c2": 1}, + {"customer_key": 2, "c1": "bar", "c2": 2} + ] + +pipeline.run(dim_customer()) # first run — 2024-04-09 18:27:53.734235 +... +``` + +*`dim_customer` destination table after first run—inserted two records present in initial load and added validity columns:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `customer_key` | `c1` | `c2` | +| -- | -- | -- | -- | -- | +| 2024-04-09 18:27:53.734235 | 9999-12-31 00:00:00.000000 | 1 | foo | 1 | +| 2024-04-09 18:27:53.734235 | 9999-12-31 00:00:00.000000 | 2 | bar | 2 | + +```py +... +def dim_customer(): + # second load — record for customer_key 1 got updated + yield [ + {"customer_key": 1, "c1": "foo_updated", "c2": 1}, + {"customer_key": 2, "c1": "bar", "c2": 2} +] + +pipeline.run(dim_customer()) # second run — 2024-04-09 22:13:07.943703 +``` + +*`dim_customer` destination table after second run—inserted new record for `customer_key` 1 and retired old record by updating `_dlt_valid_to`:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `customer_key` | `c1` | `c2` | +| -- | -- | -- | -- | -- | +| 2024-04-09 18:27:53.734235 | **2024-04-09 22:13:07.943703** | 1 | foo | 1 | +| 2024-04-09 18:27:53.734235 | 9999-12-31 00:00:00.000000 | 2 | bar | 2 | +| **2024-04-09 22:13:07.943703** | **9999-12-31 00:00:00.000000** | **1** | **foo_updated** | **1** | + +```py +... +def dim_customer(): + # third load — record for customer_key 2 got deleted + yield [ + {"customer_key": 1, "c1": "foo_updated", "c2": 1}, + ] + +pipeline.run(dim_customer()) # third run — 2024-04-10 06:45:22.847403 +``` + +*`dim_customer` destination table after third run—retired deleted record by updating `_dlt_valid_to`:* + +| `_dlt_valid_from` | `_dlt_valid_to` | `customer_key` | `c1` | `c2` | +| -- | -- | -- | -- | -- | +| 2024-04-09 18:27:53.734235 | 2024-04-09 22:13:07.943703 | 1 | foo | 1 | +| 2024-04-09 18:27:53.734235 | **2024-04-10 06:45:22.847403** | 2 | bar | 2 | +| 2024-04-09 22:13:07.943703 | 9999-12-31 00:00:00.000000 | 1 | foo_updated | 1 | + +#### Example: customize validity column names +`_dlt_valid_from` and `_dlt_valid_to` are used by default as validity column names. Other names can be configured as follows: +```py +@dlt.resource( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "validity_column_names": ["from", "to"], # will use "from" and "to" instead of default values + } +) +def dim_customer(): + ... +... 
+``` + +#### Example: use your own row hash +By default, `dlt` generates a row hash based on all columns provided by the resource and stores it in `_dlt_id`. You can use your own hash instead by specifying `row_version_column_name` in the `write_disposition` dictionary. You might already have a column present in your resource that can naturally serve as a row hash, in which case it's more efficient to use those pre-existing hash values than to generate new artificial ones. This option also allows you to use hashes based on a subset of columns, in case you want to ignore changes in some of the columns. When using your own hash, values for `_dlt_id` are randomly generated. +```py +@dlt.resource( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "row_version_column_name": "row_hash", # the column "row_hash" should be provided by the resource + } +) +def dim_customer(): + ... +... +``` + +#### 🧪 Use scd2 with Arrow Tables and Pandas frames +`dlt` will not add a **row hash** column to the tabular data automatically (we are working on it). +You need to do that yourself by adding a transform function to the `scd2` resource that computes row hashes (using `pandas.util`, which should be fairly fast). +```py +import dlt +from dlt.sources.helpers.transform import add_row_hash_to_table + +scd2_r = dlt.resource( + arrow_table, + name="tabular", + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "row_version_column_name": "row_hash", + }, + ).add_map(add_row_hash_to_table("row_hash")) +``` +`add_row_hash_to_table` is the transform function that computes and creates the `row_hash` column, which is declared as holding the hash via `row_version_column_name`. + +:::tip +You can modify existing resources that yield data in tabular form by calling `apply_hints`, passing the `scd2` config in `write_disposition`, and then +adding the transform with `add_map`. +::: + +#### Child tables +Child tables, if any, do not contain validity columns. Validity columns are only added to the root table. Validity column values for records in child tables can be obtained by joining the root table using `_dlt_root_id`. + +#### Limitations + +* You cannot use columns like `updated_at` or an integer `version` of a record that are unique only within a `primary_key` (even if it is defined). The hash column +must be unique for the root table. We are working to allow `updated_at`-style tracking. +* We do not detect changes in child tables (except new records) if the row hash of the corresponding parent row does not change. Use an `updated_at` or similar +column in the root table to stamp changes in nested data. +* `merge_key(s)` are (for now) ignored. + ## Incremental loading with a cursor field In most of the REST APIs (and other data sources i.e. database tables) you can request new or updated diff --git a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md index 29a0ae86f8..cae8a7414d 100644 --- a/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md +++ b/docs/website/docs/walkthroughs/deploy-a-pipeline/deploy-gcp-cloud-function-as-webhook.md @@ -1,20 +1,20 @@ # Deploy GCP Cloud Function as a Webhook -A webhook is a way for one application to send automated messages or data to another application in real time.
Unlike traditional APIs, which require constant polling for updates, webhooks allow applications to push information instantly as soon as an event occurs. This event-driven architecture enables faster and more responsive interactions between systems, saving valuable resources and improving overall system performance. +A webhook is a way for one application to send automated messages or data to another application in real time. Unlike traditional APIs, which require constant polling for updates, webhooks allow applications to push information instantly as soon as an event occurs. This event-driven architecture enables faster and more responsive interactions between systems, saving valuable resources and improving overall system performance. With this `dlt` google cloud event ingestion webhook, you can ingest the data and load it to the destination in real time as soon as a post request is triggered by the webhook. You can use this cloud function as an event ingestion webhook on various platforms such as Slack, Discord, Stripe, PayPal and any other as per your requirement. -You can setup GCP cloud function webhook using `dlt` as follows: +You can setup GCP cloud function webhook using `dlt` as follows: ## 1. **Initialize deployment** 1. Sign in to your GCP account and enable the Cloud Functions API. 2. Go to the Cloud Functions section and click Create Function. Set up the environment and select the region. -3. Configure the trigger type, you can use any trigger but for this example we will use HTTP and select "Allow unauthenticated invocations". +3. Configure the trigger type, you can use any trigger but for this example we will use HTTP and select "Allow unauthenticated invocations". 4. Click "Save" and then "Next". 5. Select "Python 3.10" as the environment. 6. Use the code provided to set up the cloud function for event ingestion: - + ```py import dlt import time @@ -24,31 +24,31 @@ You can setup GCP cloud function webhook using `dlt` as follows: def your_webhook(request): # Extract relevant data from the request payload data = request.get_json() - + Event = [data] - + pipeline = dlt.pipeline( pipeline_name='platform_to_bigquery', destination='bigquery', dataset_name='webhooks', ) - + pipeline.run(Event, table_name='webhook') #table_name can be customized return 'Event received and processed successfully.' ``` - + 7. Set the function name as "your_webhook" in the Entry point field. 8. In the requirements.txt file, specify the necessary packages: - + ```text # Function dependencies, for example: # package>=version dlt dlt[bigquery] ``` - + 9. Click on "Deploy" to complete the setup. - + > You can now use this cloud function as a webhook for event ingestion on various platforms such as Slack, Discord, Stripe, PayPal, and any other as per your requirement. Just remember to use the “Trigger URL” created by the cloud function when setting up the webhook. The Trigger URL can be found in the Trigger tab. @@ -58,7 +58,7 @@ To manually test the function you have created, you can send a manual POST reque ```sh import requests - + webhook_url = 'please set me up!' # Your cloud function Trigger URL message = { 'text': 'Hello, Slack!', @@ -72,9 +72,6 @@ if response.status_code == 200: else: print('Failed to send message. Error:', response.text) ``` - -> Replace the webhook_url with the Trigger URL for the cloud function created. -Now after setting up the webhook using cloud functions, every time an event occurs, the data will be ingested into your specified destination. - -That’s it! 
Enjoy deploying `DLT` GCP cloud function as webhook! +> Replace the webhook_url with the Trigger URL for the cloud function created. +Now after setting up the webhook using cloud functions, every time an event occurs, the data will be ingested into your specified destination. diff --git a/tests/cases.py b/tests/cases.py index f92c3ac5de..83814845a7 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -19,7 +19,7 @@ ) from dlt.common.schema import TColumnSchema, TTableSchemaColumns -from tests.utils import TArrowFormat, TestDataItemFormat, arrow_item_from_pandas +from tests.utils import TPythonTableFormat, TestDataItemFormat, arrow_item_from_pandas # _UUID = "c8209ee7-ee95-4b90-8c9f-f7a0f8b51014" JSON_TYPED_DICT: StrAny = { diff --git a/tests/common/storages/test_load_package.py b/tests/common/storages/test_load_package.py index 68396a76c8..ecbc5d296d 100644 --- a/tests/common/storages/test_load_package.py +++ b/tests/common/storages/test_load_package.py @@ -16,6 +16,7 @@ from dlt.common.configuration.container import Container from dlt.common.storages.load_package import ( LoadPackageStateInjectableContext, + create_load_id, destination_state, load_package, commit_load_package_state, @@ -69,7 +70,35 @@ def test_save_load_schema(load_storage: LoadStorage) -> None: assert schema.stored_version == schema_copy.stored_version -def test_create_and_update_loadpackage_state(load_storage: LoadStorage) -> None: +def test_create_package(load_storage: LoadStorage) -> None: + package_storage = load_storage.new_packages + # create package without initial state + load_id = create_load_id() + package_storage.create_package(load_id) + # get state, created at must be == load_id + state = package_storage.get_load_package_state(load_id) + assert state["created_at"] == pendulum.from_timestamp(float(load_id)) + # assume those few lines execute in less than a second + assert pendulum.now().diff(state["created_at"]).total_seconds() < 1 + + # create package with non timestamp load id + load_id = uniq_id() + package_storage.create_package(load_id) + state = package_storage.get_load_package_state(load_id) + # still valid created at is there + # assume those few lines execute in less than a second + assert pendulum.now().diff(state["created_at"]).total_seconds() < 1 + + force_created_at = pendulum.now().subtract(days=1) + state["destination_state"] = {"destination": "custom"} + state["created_at"] = force_created_at + load_id = uniq_id() + package_storage.create_package(load_id, initial_state=state) + state_2 = package_storage.get_load_package_state(load_id) + assert state_2["created_at"] == force_created_at + + +def test_create_and_update_load_package_state(load_storage: LoadStorage) -> None: load_storage.new_packages.create_package("copy") state = load_storage.new_packages.get_load_package_state("copy") assert state["_state_version"] == 0 @@ -88,12 +117,20 @@ def test_create_and_update_loadpackage_state(load_storage: LoadStorage) -> None: assert state["created_at"] == old_state["created_at"] # check timestamp - time = pendulum.parse(state["created_at"]) + created_at = state["created_at"] now = pendulum.now() - assert (now - time).in_seconds() < 2 # type: ignore + assert (now - created_at).in_seconds() < 2 -def test_loadpackage_state_injectable_context(load_storage: LoadStorage) -> None: +def test_create_load_id() -> None: + # must increase over time + load_id_1 = create_load_id() + sleep(0.1) + load_id_2 = create_load_id() + assert load_id_2 > load_id_1 + + +def 
test_load_package_state_injectable_context(load_storage: LoadStorage) -> None: load_storage.new_packages.create_package("copy") container = Container() diff --git a/tests/common/storages/test_load_storage.py b/tests/common/storages/test_load_storage.py index a70242001d..e8686ac2f9 100644 --- a/tests/common/storages/test_load_storage.py +++ b/tests/common/storages/test_load_storage.py @@ -6,6 +6,8 @@ from dlt.common.storages import PackageStorage, LoadStorage from dlt.common.storages.exceptions import LoadPackageNotFound, NoMigrationPathException +from dlt.common.storages.file_storage import FileStorage +from dlt.common.storages.load_package import create_load_id from tests.common.storages.utils import start_loading_file, assert_package_info, load_storage from tests.utils import write_version, autouse_test_storage @@ -158,6 +160,25 @@ def test_get_unknown_package_info(load_storage: LoadStorage) -> None: load_storage.get_load_package_info("UNKNOWN LOAD ID") +def test_import_extracted_package(load_storage: LoadStorage) -> None: + # create extracted package + extracted = PackageStorage( + FileStorage(os.path.join(load_storage.config.load_volume_path, "extracted")), "new" + ) + load_id = create_load_id() + extracted.create_package(load_id) + extracted_state = extracted.get_load_package_state(load_id) + load_storage.import_extracted_package(load_id, extracted) + # make sure state was imported + assert extracted_state == load_storage.new_packages.get_load_package_state(load_id) + # move to normalized + load_storage.commit_new_load_package(load_id) + assert extracted_state == load_storage.normalized_packages.get_load_package_state(load_id) + # move to loaded + load_storage.complete_load_package(load_id, aborted=False) + assert extracted_state == load_storage.loaded_packages.get_load_package_state(load_id) + + def test_full_migration_path() -> None: # create directory structure s = LoadStorage(True, LoadStorage.ALL_SUPPORTED_FILE_FORMATS) diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 6ff1a0bf5f..5dd3d6c3ca 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -1317,6 +1317,28 @@ def empty_gen(): "primary_key": True, "merge_key": True, } + # test SCD2 write disposition hint + empty_r.apply_hints( + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "validity_column_names": ["from", "to"], + } + ) + assert empty_r._hints["write_disposition"] == { + "disposition": "merge", + "strategy": "scd2", + "validity_column_names": ["from", "to"], + } + assert "from" not in empty_r._hints["columns"] + assert "to" not in empty_r._hints["columns"] + table = empty_r.compute_table_schema() + assert table["write_disposition"] == "merge" + assert table["x-merge-strategy"] == "scd2" + assert "from" in table["columns"] + assert "x-valid-from" in table["columns"]["from"] + assert "to" in table["columns"] + assert "x-valid-to" in table["columns"]["to"] def test_apply_dynamic_hints() -> None: diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index 82ccb24bf1..b239899bce 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -14,7 +14,12 @@ from tests.load.utils import destinations_configs, DestinationTestConfiguration from tests.load.pipeline.utils import select_data from tests.pipeline.utils import assert_load_info -from tests.utils import TestDataItemFormat, arrow_item_from_pandas, preserve_environ, TArrowFormat +from tests.utils import ( + 
TestDataItemFormat, + arrow_item_from_pandas, + preserve_environ, + TPythonTableFormat, +) from tests.cases import arrow_table_all_data_types # mark all tests as essential, do not remove @@ -148,7 +153,7 @@ def some_data(): ) @pytest.mark.parametrize("item_type", ["arrow-table", "pandas", "arrow-batch"]) def test_parquet_column_names_are_normalized( - item_type: TArrowFormat, destination_config: DestinationTestConfiguration + item_type: TPythonTableFormat, destination_config: DestinationTestConfiguration ) -> None: """Test normalizing of parquet columns in all destinations""" # Create df with column names with inconsistent naming conventions diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index a9a82e39f7..bfcdccfba4 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -11,6 +11,7 @@ from dlt.common.configuration.container import Container from dlt.common.pipeline import StateInjectableContext from dlt.common.schema.utils import has_table_seen_data +from dlt.common.schema.exceptions import SchemaException from dlt.common.typing import StrAny from dlt.common.utils import digest128 from dlt.extract import DltResource @@ -946,3 +947,19 @@ def r(): ) with pytest.raises(PipelineStepFailed): info = p.run(r(), loader_file_format=destination_config.file_format) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) +def test_invalid_merge_strategy(destination_config: DestinationTestConfiguration) -> None: + @dlt.resource(write_disposition={"disposition": "merge", "strategy": "foo"}) # type: ignore[call-overload] + def r(): + yield {"foo": "bar"} + + p = destination_config.setup_pipeline("abstract", full_refresh=True) + with pytest.raises(PipelineStepFailed) as pip_ex: + p.run(r()) + assert isinstance(pip_ex.value.__context__, SchemaException) diff --git a/tests/load/pipeline/test_scd2.py b/tests/load/pipeline/test_scd2.py new file mode 100644 index 0000000000..cf313eaa61 --- /dev/null +++ b/tests/load/pipeline/test_scd2.py @@ -0,0 +1,583 @@ +# timezone is removed from all datetime objects in these tests to simplify comparison + +import pytest +from typing import List, Dict, Any +from datetime import datetime, timezone # noqa: I251 + +import dlt +from dlt.common.pipeline import LoadInfo +from dlt.common.schema.exceptions import ColumnNameConflictException +from dlt.common.schema.typing import DEFAULT_VALIDITY_COLUMN_NAMES +from dlt.common.normalizers.json.relational import DataItemNormalizer +from dlt.common.normalizers.naming.snake_case import NamingConvention as SnakeCaseNamingConvention +from dlt.common.time import ensure_pendulum_datetime, reduce_pendulum_datetime_precision +from dlt.common.typing import TDataItem +from dlt.destinations.sql_jobs import HIGH_TS +from dlt.extract.resource import DltResource +from dlt.pipeline.exceptions import PipelineStepFailed + +from tests.cases import arrow_table_all_data_types +from tests.pipeline.utils import assert_load_info, load_table_counts +from tests.load.pipeline.utils import ( + destinations_configs, + DestinationTestConfiguration, + load_tables_to_dicts, +) +from tests.utils import TPythonTableFormat + +get_row_hash = DataItemNormalizer.get_row_hash + + +def get_active_ts(pipeline: dlt.Pipeline) -> datetime: + caps = pipeline._get_destination_capabilities() + active_ts = HIGH_TS.in_timezone(tz="UTC").replace(tzinfo=None) + return 
reduce_pendulum_datetime_precision(active_ts, caps.timestamp_precision) + + +def get_load_package_created_at(pipeline: dlt.Pipeline, load_info: LoadInfo) -> datetime: + """Returns `created_at` property of load package state.""" + load_id = load_info.asdict()["loads_ids"][0] + created_at = ( + pipeline.get_load_package_state(load_id)["created_at"] + .in_timezone(tz="UTC") + .replace(tzinfo=None) + ) + caps = pipeline._get_destination_capabilities() + return reduce_pendulum_datetime_precision(created_at, caps.timestamp_precision) + + +def strip_timezone(ts: datetime) -> datetime: + """Converts timezone of datetime object to UTC and removes timezone awareness.""" + ts = ensure_pendulum_datetime(ts) + if ts.replace(tzinfo=None) == HIGH_TS: + return ts.replace(tzinfo=None) + else: + return ts.astimezone(tz=timezone.utc).replace(tzinfo=None) + + +def get_table( + pipeline: dlt.Pipeline, table_name: str, sort_column: str, include_root_id: bool = True +) -> List[Dict[str, Any]]: + """Returns destination table contents as list of dictionaries.""" + return sorted( + [ + { + k: strip_timezone(v) if isinstance(v, datetime) else v + for k, v in r.items() + if not k.startswith("_dlt") + or k in DEFAULT_VALIDITY_COLUMN_NAMES + or (k == "_dlt_root_id" if include_root_id else False) + } + for r in load_tables_to_dicts(pipeline, table_name)[table_name] + ], + key=lambda d: d[sort_column], + ) + + +def assert_records_as_set(actual: List[Dict[str, Any]], expected: List[Dict[str, Any]]) -> None: + """Compares two lists of dicts regardless of order""" + actual_set = set(frozenset(dict_.items()) for dict_ in actual) + expected_set = set(frozenset(dict_.items()) for dict_ in expected) + assert actual_set == expected_set + + +@pytest.mark.parametrize( + "destination_config,simple,validity_column_names", + [ # test basic case for alle SQL destinations supporting merge + (dconf, True, None) + for dconf in destinations_configs(default_sql_configs=True, supports_merge=True) + ] + + [ # test nested columns and validity column name configuration only for postgres + ( + dconf, + False, + ["from", "to"], + ) # "from" is a SQL keyword, so this also tests if columns are escaped + for dconf in destinations_configs(default_sql_configs=True, subset=["postgres", "duckdb"]) + ] + + [ + (dconf, False, ["ValidFrom", "ValidTo"]) + for dconf in destinations_configs(default_sql_configs=True, subset=["postgres", "duckdb"]) + ], + ids=lambda x: ( + x.name + if isinstance(x, DestinationTestConfiguration) + else (x[0] + "-" + x[1] if isinstance(x, list) else x) + ), +) +def test_core_functionality( + destination_config: DestinationTestConfiguration, + simple: bool, + validity_column_names: List[str], +) -> None: + p = destination_config.setup_pipeline("abstract", full_refresh=True) + + @dlt.resource( + table_name="dim_test", + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "validity_column_names": validity_column_names, + }, + ) + def r(data): + yield data + + # get validity column names + from_, to = ( + DEFAULT_VALIDITY_COLUMN_NAMES + if validity_column_names is None + else map(SnakeCaseNamingConvention().normalize_identifier, validity_column_names) + ) + + # load 1 — initial load + dim_snap = [ + {"nk": 1, "c1": "foo", "c2": "foo" if simple else {"nc1": "foo"}}, + {"nk": 2, "c1": "bar", "c2": "bar" if simple else {"nc1": "bar"}}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + # assert x-hints + table = p.default_schema.get_table("dim_test") + assert 
table["x-merge-strategy"] == "scd2" # type: ignore[typeddict-item] + assert table["columns"][from_]["x-valid-from"] # type: ignore[typeddict-item] + assert table["columns"][to]["x-valid-to"] # type: ignore[typeddict-item] + assert table["columns"]["_dlt_id"]["x-row-version"] # type: ignore[typeddict-item] + # _dlt_id is still unique + assert table["columns"]["_dlt_id"]["unique"] + + # assert load results + ts_1 = get_load_package_created_at(p, info) + assert_load_info(info) + cname = "c2" if simple else "c2__nc1" + assert get_table(p, "dim_test", cname) == [ + {from_: ts_1, to: get_active_ts(p), "nk": 2, "c1": "bar", cname: "bar"}, + {from_: ts_1, to: get_active_ts(p), "nk": 1, "c1": "foo", cname: "foo"}, + ] + + # load 2 — update a record + dim_snap = [ + {"nk": 1, "c1": "foo", "c2": "foo_updated" if simple else {"nc1": "foo_updated"}}, + {"nk": 2, "c1": "bar", "c2": "bar" if simple else {"nc1": "bar"}}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_2 = get_load_package_created_at(p, info) + assert_load_info(info) + assert get_table(p, "dim_test", cname) == [ + {from_: ts_1, to: get_active_ts(p), "nk": 2, "c1": "bar", cname: "bar"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", cname: "foo"}, + {from_: ts_2, to: get_active_ts(p), "nk": 1, "c1": "foo", cname: "foo_updated"}, + ] + + # load 3 — delete a record + dim_snap = [ + {"nk": 1, "c1": "foo", "c2": "foo_updated" if simple else {"nc1": "foo_updated"}}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_3 = get_load_package_created_at(p, info) + assert_load_info(info) + assert get_table(p, "dim_test", cname) == [ + {from_: ts_1, to: ts_3, "nk": 2, "c1": "bar", cname: "bar"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", cname: "foo"}, + {from_: ts_2, to: get_active_ts(p), "nk": 1, "c1": "foo", cname: "foo_updated"}, + ] + + # load 4 — insert a record + dim_snap = [ + {"nk": 1, "c1": "foo", "c2": "foo_updated" if simple else {"nc1": "foo_updated"}}, + {"nk": 3, "c1": "baz", "c2": "baz" if simple else {"nc1": "baz"}}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_4 = get_load_package_created_at(p, info) + assert_load_info(info) + assert get_table(p, "dim_test", cname) == [ + {from_: ts_1, to: ts_3, "nk": 2, "c1": "bar", cname: "bar"}, + {from_: ts_4, to: get_active_ts(p), "nk": 3, "c1": "baz", cname: "baz"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", cname: "foo"}, + {from_: ts_2, to: get_active_ts(p), "nk": 1, "c1": "foo", cname: "foo_updated"}, + ] + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("simple", [True, False]) +def test_child_table(destination_config: DestinationTestConfiguration, simple: bool) -> None: + p = destination_config.setup_pipeline("abstract", full_refresh=True) + + @dlt.resource( + table_name="dim_test", write_disposition={"disposition": "merge", "strategy": "scd2"} + ) + def r(data): + yield data + + # get validity column names + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + + # load 1 — initial load + dim_snap: List[Dict[str, Any]] = [ + l1_1 := {"nk": 1, "c1": "foo", "c2": [1] if simple else [{"cc1": 1}]}, + l1_2 := {"nk": 2, "c1": "bar", "c2": [2, 3] if simple else [{"cc1": 2}, {"cc1": 3}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_1 = get_load_package_created_at(p, info) + assert_load_info(info) + assert 
get_table(p, "dim_test", "c1") == [ + {from_: ts_1, to: get_active_ts(p), "nk": 2, "c1": "bar"}, + {from_: ts_1, to: get_active_ts(p), "nk": 1, "c1": "foo"}, + ] + cname = "value" if simple else "cc1" + assert get_table(p, "dim_test__c2", cname) == [ + {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, + {"_dlt_root_id": get_row_hash(l1_2), cname: 2}, + {"_dlt_root_id": get_row_hash(l1_2), cname: 3}, + ] + + # load 2 — update a record — change not in complex column + dim_snap = [ + l2_1 := {"nk": 1, "c1": "foo_updated", "c2": [1] if simple else [{"cc1": 1}]}, + {"nk": 2, "c1": "bar", "c2": [2, 3] if simple else [{"cc1": 2}, {"cc1": 3}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_2 = get_load_package_created_at(p, info) + assert_load_info(info) + assert get_table(p, "dim_test", "c1") == [ + {from_: ts_1, to: get_active_ts(p), "nk": 2, "c1": "bar"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, # updated + {from_: ts_2, to: get_active_ts(p), "nk": 1, "c1": "foo_updated"}, # new + ] + assert_records_as_set( + get_table(p, "dim_test__c2", cname), + [ + {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, + {"_dlt_root_id": get_row_hash(l2_1), cname: 1}, # new + {"_dlt_root_id": get_row_hash(l1_2), cname: 2}, + {"_dlt_root_id": get_row_hash(l1_2), cname: 3}, + ], + ) + + # load 3 — update a record — change in complex column + dim_snap = [ + l3_1 := { + "nk": 1, + "c1": "foo_updated", + "c2": [1, 2] if simple else [{"cc1": 1}, {"cc1": 2}], + }, + {"nk": 2, "c1": "bar", "c2": [2, 3] if simple else [{"cc1": 2}, {"cc1": 3}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_3 = get_load_package_created_at(p, info) + assert_load_info(info) + assert_records_as_set( + get_table(p, "dim_test", "c1"), + [ + {from_: ts_1, to: get_active_ts(p), "nk": 2, "c1": "bar"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, + {from_: ts_2, to: ts_3, "nk": 1, "c1": "foo_updated"}, # updated + {from_: ts_3, to: get_active_ts(p), "nk": 1, "c1": "foo_updated"}, # new + ], + ) + exp_3 = [ + {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, + {"_dlt_root_id": get_row_hash(l2_1), cname: 1}, + {"_dlt_root_id": get_row_hash(l3_1), cname: 1}, # new + {"_dlt_root_id": get_row_hash(l1_2), cname: 2}, + {"_dlt_root_id": get_row_hash(l3_1), cname: 2}, # new + {"_dlt_root_id": get_row_hash(l1_2), cname: 3}, + ] + assert_records_as_set(get_table(p, "dim_test__c2", cname), exp_3) + + # load 4 — delete a record + dim_snap = [ + {"nk": 1, "c1": "foo_updated", "c2": [1, 2] if simple else [{"cc1": 1}, {"cc1": 2}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_4 = get_load_package_created_at(p, info) + assert_load_info(info) + assert_records_as_set( + get_table(p, "dim_test", "c1"), + [ + {from_: ts_1, to: ts_4, "nk": 2, "c1": "bar"}, # updated + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, + {from_: ts_2, to: ts_3, "nk": 1, "c1": "foo_updated"}, + {from_: ts_3, to: get_active_ts(p), "nk": 1, "c1": "foo_updated"}, + ], + ) + assert_records_as_set( + get_table(p, "dim_test__c2", cname), exp_3 + ) # deletes should not alter child tables + + # load 5 — insert a record + dim_snap = [ + {"nk": 1, "c1": "foo_updated", "c2": [1, 2] if simple else [{"cc1": 1}, {"cc1": 2}]}, + l5_3 := {"nk": 3, "c1": "baz", "c2": [1, 2] if simple else [{"cc1": 1}, {"cc1": 2}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + ts_5 = get_load_package_created_at(p, info) + assert_load_info(info) + 
assert_records_as_set( + get_table(p, "dim_test", "c1"), + [ + {from_: ts_1, to: ts_4, "nk": 2, "c1": "bar"}, + {from_: ts_5, to: get_active_ts(p), "nk": 3, "c1": "baz"}, # new + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo"}, + {from_: ts_2, to: ts_3, "nk": 1, "c1": "foo_updated"}, + {from_: ts_3, to: get_active_ts(p), "nk": 1, "c1": "foo_updated"}, + ], + ) + assert_records_as_set( + get_table(p, "dim_test__c2", cname), + [ + {"_dlt_root_id": get_row_hash(l1_1), cname: 1}, + {"_dlt_root_id": get_row_hash(l2_1), cname: 1}, + {"_dlt_root_id": get_row_hash(l3_1), cname: 1}, + {"_dlt_root_id": get_row_hash(l5_3), cname: 1}, # new + {"_dlt_root_id": get_row_hash(l1_2), cname: 2}, + {"_dlt_root_id": get_row_hash(l3_1), cname: 2}, + {"_dlt_root_id": get_row_hash(l5_3), cname: 2}, # new + {"_dlt_root_id": get_row_hash(l1_2), cname: 3}, + ], + ) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, supports_merge=True), + ids=lambda x: x.name, +) +def test_grandchild_table(destination_config: DestinationTestConfiguration) -> None: + p = destination_config.setup_pipeline("abstract", full_refresh=True) + + @dlt.resource( + table_name="dim_test", write_disposition={"disposition": "merge", "strategy": "scd2"} + ) + def r(data): + yield data + + # load 1 — initial load + dim_snap = [ + l1_1 := {"nk": 1, "c1": "foo", "c2": [{"cc1": [1]}]}, + l1_2 := {"nk": 2, "c1": "bar", "c2": [{"cc1": [1, 2]}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + assert_records_as_set( + get_table(p, "dim_test__c2__cc1", "value"), + [ + {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, + {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, + {"_dlt_root_id": get_row_hash(l1_2), "value": 2}, + ], + ) + + # load 2 — update a record — change not in complex column + dim_snap = [ + l2_1 := {"nk": 1, "c1": "foo_updated", "c2": [{"cc1": [1]}]}, + l1_2 := {"nk": 2, "c1": "bar", "c2": [{"cc1": [1, 2]}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + assert_records_as_set( + (get_table(p, "dim_test__c2__cc1", "value")), + [ + {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, + {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, + {"_dlt_root_id": get_row_hash(l2_1), "value": 1}, # new + {"_dlt_root_id": get_row_hash(l1_2), "value": 2}, + ], + ) + + # load 3 — update a record — change in complex column + dim_snap = [ + l3_1 := {"nk": 1, "c1": "foo_updated", "c2": [{"cc1": [1, 2]}]}, + {"nk": 2, "c1": "bar", "c2": [{"cc1": [1, 2]}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + exp_3 = [ + {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, + {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, + {"_dlt_root_id": get_row_hash(l2_1), "value": 1}, + {"_dlt_root_id": get_row_hash(l3_1), "value": 1}, # new + {"_dlt_root_id": get_row_hash(l1_2), "value": 2}, + {"_dlt_root_id": get_row_hash(l3_1), "value": 2}, # new + ] + assert_records_as_set(get_table(p, "dim_test__c2__cc1", "value"), exp_3) + + # load 4 — delete a record + dim_snap = [ + {"nk": 1, "c1": "foo_updated", "c2": [{"cc1": [1, 2]}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + assert_records_as_set(get_table(p, "dim_test__c2__cc1", "value"), exp_3) + + # load 5 — insert a record + dim_snap = [ + {"nk": 1, "c1": "foo_updated", "c2": [{"cc1": [1, 2]}]}, + l5_3 := {"nk": 3, "c1": "baz", 
"c2": [{"cc1": [1]}]}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + assert_records_as_set( + get_table(p, "dim_test__c2__cc1", "value"), + [ + {"_dlt_root_id": get_row_hash(l1_1), "value": 1}, + {"_dlt_root_id": get_row_hash(l1_2), "value": 1}, + {"_dlt_root_id": get_row_hash(l2_1), "value": 1}, + {"_dlt_root_id": get_row_hash(l3_1), "value": 1}, + {"_dlt_root_id": get_row_hash(l5_3), "value": 1}, # new + {"_dlt_root_id": get_row_hash(l1_2), "value": 2}, + {"_dlt_root_id": get_row_hash(l3_1), "value": 2}, + ], + ) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) +def test_validity_column_name_conflict(destination_config: DestinationTestConfiguration) -> None: + p = destination_config.setup_pipeline("abstract", full_refresh=True) + + @dlt.resource( + table_name="dim_test", + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "validity_column_names": ["from", "to"], + }, + ) + def r(data): + yield data + + # configuring a validity column name that appears in the data should cause an exception + dim_snap = {"nk": 1, "foo": 1, "from": 1} # conflict on "from" column + with pytest.raises(PipelineStepFailed) as pip_ex: + p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert isinstance(pip_ex.value.__context__.__context__, ColumnNameConflictException) + dim_snap = {"nk": 1, "foo": 1, "to": 1} # conflict on "to" column + with pytest.raises(PipelineStepFailed): + p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert isinstance(pip_ex.value.__context__.__context__, ColumnNameConflictException) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("item_type", ["pandas", "arrow-table", "arrow-batch"]) +def test_arrow_custom_hash( + destination_config: DestinationTestConfiguration, item_type: TPythonTableFormat +) -> None: + table, _, _ = arrow_table_all_data_types(item_type, num_rows=100, include_json=False) + orig_table: Any = None + if item_type == "pandas": + orig_table = table.copy(deep=True) + + from dlt.sources.helpers.transform import add_row_hash_to_table + + def _make_scd2_r(table_: Any) -> DltResource: + return dlt.resource( + table_, + name="tabular", + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "row_version_column_name": "row_hash", + }, + ).add_map(add_row_hash_to_table("row_hash")) + + p = destination_config.setup_pipeline("abstract", full_refresh=True) + info = p.run(_make_scd2_r(table), loader_file_format=destination_config.file_format) + assert_load_info(info) + # make sure we have scd2 columns in schema + table_schema = p.default_schema.get_table("tabular") + assert table_schema["x-merge-strategy"] == "scd2" # type: ignore[typeddict-item] + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + assert table_schema["columns"][from_]["x-valid-from"] # type: ignore[typeddict-item] + assert table_schema["columns"][to]["x-valid-to"] # type: ignore[typeddict-item] + assert table_schema["columns"]["row_hash"]["x-row-version"] # type: ignore[typeddict-item] + # 100 items in destination + assert load_table_counts(p, "tabular")["tabular"] == 100 + + # modify in place (pandas only) + if item_type == "pandas": + table = orig_table + orig_table = table.copy(deep=True) + info = p.run(_make_scd2_r(table), 
loader_file_format=destination_config.file_format) + assert_load_info(info) + # no changes (hopefully hash is deterministic) + assert load_table_counts(p, "tabular")["tabular"] == 100 + + # change single row + orig_table.iloc[0, 0] = "Duck 🦆!" + info = p.run(_make_scd2_r(orig_table), loader_file_format=destination_config.file_format) + assert_load_info(info) + # on row changed + assert load_table_counts(p, "tabular")["tabular"] == 101 + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) +def test_user_provided_row_hash(destination_config: DestinationTestConfiguration) -> None: + p = destination_config.setup_pipeline("abstract", full_refresh=True) + + @dlt.resource( + table_name="dim_test", + write_disposition={ + "disposition": "merge", + "strategy": "scd2", + "row_version_column_name": "row_hash", + }, + ) + def r(data): + yield data + + # load 1 — initial load + dim_snap: List[Dict[str, Any]] = [ + {"nk": 1, "c1": "foo", "c2": [1], "row_hash": "mocked_hash_1"}, + {"nk": 2, "c1": "bar", "c2": [2, 3], "row_hash": "mocked_hash_2"}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + ts_1 = get_load_package_created_at(p, info) + table = p.default_schema.get_table("dim_test") + assert table["columns"]["row_hash"]["x-row-version"] # type: ignore[typeddict-item] + assert "x-row-version" not in table["columns"]["_dlt_id"] + + # load 2 — update and delete a record + dim_snap = [ + {"nk": 1, "c1": "foo_upd", "c2": [1], "row_hash": "mocked_hash_1_upd"}, + ] + info = p.run(r(dim_snap), loader_file_format=destination_config.file_format) + assert_load_info(info) + ts_2 = get_load_package_created_at(p, info) + + # assert load results + from_, to = DEFAULT_VALIDITY_COLUMN_NAMES + assert get_table(p, "dim_test", "c1") == [ + {from_: ts_1, to: ts_2, "nk": 2, "c1": "bar", "row_hash": "mocked_hash_2"}, + {from_: ts_1, to: ts_2, "nk": 1, "c1": "foo", "row_hash": "mocked_hash_1"}, + { + from_: ts_2, + to: get_active_ts(p), + "nk": 1, + "c1": "foo_upd", + "row_hash": "mocked_hash_1_upd", + }, + ] + # root id is not deterministic when a user provided row hash is used + assert get_table(p, "dim_test__c2", "value", include_root_id=False) == [ + {"value": 1}, + {"value": 1}, + {"value": 2}, + {"value": 3}, + ] diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index d9930c19ee..667f26476b 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -19,7 +19,7 @@ ) from tests.utils import ( preserve_environ, - TArrowFormat, + TPythonTableFormat, arrow_item_from_pandas, arrow_item_from_table, ) @@ -36,7 +36,7 @@ ("arrow-batch", True), ], ) -def test_extract_and_normalize(item_type: TArrowFormat, is_list: bool): +def test_extract_and_normalize(item_type: TPythonTableFormat, is_list: bool): item, records, data = arrow_table_all_data_types(item_type) pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="filesystem") @@ -121,7 +121,7 @@ def some_data(): ("arrow-batch", True), ], ) -def test_normalize_jsonl(item_type: TArrowFormat, is_list: bool): +def test_normalize_jsonl(item_type: TPythonTableFormat, is_list: bool): os.environ["DUMMY__LOADER_FILE_FORMAT"] = "jsonl" item, records, _ = arrow_table_all_data_types(item_type, tz="Europe/Berlin") @@ -154,7 +154,7 @@ def some_data(): @pytest.mark.parametrize("item_type", ["arrow-table", "arrow-batch"]) -def test_add_map(item_type: 
TArrowFormat): +def test_add_map(item_type: TPythonTableFormat): item, _, _ = arrow_table_all_data_types(item_type, num_rows=200) @dlt.resource @@ -176,7 +176,7 @@ def map_func(item): @pytest.mark.parametrize("item_type", ["pandas", "arrow-table", "arrow-batch"]) -def test_extract_normalize_file_rotation(item_type: TArrowFormat) -> None: +def test_extract_normalize_file_rotation(item_type: TPythonTableFormat) -> None: # do not extract state os.environ["RESTORE_FROM_DESTINATION"] = "False" # use parquet for dummy @@ -208,7 +208,7 @@ def data_frames(): @pytest.mark.parametrize("item_type", ["pandas", "arrow-table", "arrow-batch"]) -def test_arrow_clashing_names(item_type: TArrowFormat) -> None: +def test_arrow_clashing_names(item_type: TPythonTableFormat) -> None: # # use parquet for dummy os.environ["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" pipeline_name = "arrow_" + uniq_id() @@ -227,7 +227,7 @@ def data_frames(): @pytest.mark.parametrize("item_type", ["arrow-table", "arrow-batch"]) -def test_load_arrow_vary_schema(item_type: TArrowFormat) -> None: +def test_load_arrow_vary_schema(item_type: TPythonTableFormat) -> None: pipeline_name = "arrow_" + uniq_id() pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb") @@ -246,7 +246,7 @@ def test_load_arrow_vary_schema(item_type: TArrowFormat) -> None: @pytest.mark.parametrize("item_type", ["pandas", "arrow-table", "arrow-batch"]) -def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: +def test_arrow_as_data_loading(item_type: TPythonTableFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" os.environ["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" @@ -264,7 +264,7 @@ def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: @pytest.mark.parametrize("item_type", ["arrow-table"]) # , "pandas", "arrow-batch" -def test_normalize_with_dlt_columns(item_type: TArrowFormat): +def test_normalize_with_dlt_columns(item_type: TPythonTableFormat): item, records, _ = arrow_table_all_data_types(item_type, num_rows=5432) os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID"] = "True" os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_ID"] = "True" @@ -330,7 +330,7 @@ def some_data(): @pytest.mark.parametrize("item_type", ["arrow-table", "pandas", "arrow-batch"]) -def test_normalize_reorder_columns_separate_packages(item_type: TArrowFormat) -> None: +def test_normalize_reorder_columns_separate_packages(item_type: TPythonTableFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" table, shuffled_table, shuffled_removed_column = prepare_shuffled_tables() @@ -381,7 +381,7 @@ def _to_item(table: Any) -> Any: @pytest.mark.parametrize("item_type", ["arrow-table", "pandas", "arrow-batch"]) -def test_normalize_reorder_columns_single_package(item_type: TArrowFormat) -> None: +def test_normalize_reorder_columns_single_package(item_type: TPythonTableFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" # we do not want to rotate buffer os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "100000" @@ -423,7 +423,7 @@ def _to_item(table: Any) -> Any: @pytest.mark.parametrize("item_type", ["arrow-table", "pandas", "arrow-batch"]) -def test_normalize_reorder_columns_single_batch(item_type: TArrowFormat) -> None: +def test_normalize_reorder_columns_single_batch(item_type: TPythonTableFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" # we do not want to rotate buffer os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "100000" @@ -475,7 +475,7 @@ def _to_item(table: Any) -> Any: 
@pytest.mark.parametrize("item_type", ["pandas", "arrow-table", "arrow-batch"]) -def test_empty_arrow(item_type: TArrowFormat) -> None: +def test_empty_arrow(item_type: TPythonTableFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" os.environ["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 8f736e13d9..c4e1f5314b 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -171,6 +171,7 @@ def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[D for table_name in table_names: table_rows = [] columns = p.default_schema.get_table_columns(table_name).keys() + query_columns = ",".join(map(p.sql_client().capabilities.escape_identifier, columns)) with p.sql_client() as c: query_columns = ",".join(map(c.escape_column_name, columns)) diff --git a/tests/utils.py b/tests/utils.py index 45aa29a416..1ccb7fc5e4 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -77,7 +77,7 @@ for destination in ACTIVE_DESTINATIONS: assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown active destination {destination}" -TArrowFormat = Literal["pandas", "arrow-table", "arrow-batch"] +TPythonTableFormat = Literal["pandas", "arrow-table", "arrow-batch"] """Possible arrow item formats""" TestDataItemFormat = Literal["object", "pandas", "arrow-table", "arrow-batch"] @@ -229,7 +229,7 @@ def data_item_length(data: TDataItem) -> int: def arrow_item_from_pandas( df: Any, - object_format: TArrowFormat, + object_format: TPythonTableFormat, ) -> Any: from dlt.common.libs.pyarrow import pyarrow as pa @@ -244,7 +244,7 @@ def arrow_item_from_pandas( def arrow_item_from_table( table: Any, - object_format: TArrowFormat, + object_format: TPythonTableFormat, ) -> Any: if object_format == "pandas": return table.to_pandas() diff --git a/tox.ini b/tox.ini index 9469001572..ed6c69c585 100644 --- a/tox.ini +++ b/tox.ini @@ -5,4 +5,5 @@ banned-modules = datetime = use dlt.common.pendulum decimal = use dlt.common.decimal decimal.Decimal = use dlt.common.Decimal open = use dlt.common.open + pendulum = use dlt.common.pendulum extend-immutable-calls = dlt.sources.incremental
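The `banned-modules` block in `tox.ini` above looks like flake8-tidy-imports configuration (an assumption based on the option name). The new `pendulum` entry flags direct `import pendulum` statements and points contributors at `dlt.common.pendulum` instead; a minimal sketch of what the rule allows versus rejects:

```py
# rejected by the rule "pendulum = use dlt.common.pendulum":
# import pendulum

# allowed: import the wrapped instance through dlt's common module
from dlt.common.pendulum import pendulum

created_at = pendulum.now()
print(created_at.isoformat())
```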