From 28dbba6e8e4dde3da8aa5516091487ed01119613 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Tue, 21 Nov 2023 19:18:59 +0100 Subject: [PATCH 01/10] source and schema changes (#769) * add schema ancestors * remove name attribute and init arg from dltsource * fix 2 tests * fix statekey related errors * pr fixes * revert changes on validate dict * fix one test --- dlt/common/schema/schema.py | 13 +- dlt/common/schema/typing.py | 3 +- dlt/common/schema/utils.py | 15 +- dlt/common/validation.py | 2 + dlt/extract/decorators.py | 9 +- dlt/extract/source.py | 19 +- dlt/pipeline/pipeline.py | 6 +- .../cases/schemas/eth/ethereum_schema_v8.yml | 461 ++++++++++++++++++ tests/common/schema/test_schema.py | 25 +- tests/common/schema/test_versioning.py | 41 +- tests/common/storages/test_schema_storage.py | 23 +- tests/common/test_validation.py | 4 +- tests/common/utils.py | 2 +- tests/extract/test_decorators.py | 4 +- tests/extract/test_extract.py | 8 +- tests/extract/test_incremental.py | 6 +- tests/extract/test_sources.py | 18 +- tests/load/pipeline/test_restore_state.py | 4 +- tests/load/weaviate/test_naming.py | 20 +- tests/pipeline/test_dlt_versions.py | 6 +- tests/pipeline/test_pipeline.py | 14 +- tests/pipeline/test_pipeline_trace.py | 10 +- 22 files changed, 620 insertions(+), 93 deletions(-) create mode 100644 tests/common/cases/schemas/eth/ethereum_schema_v8.yml diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 67ae345845..bcfba11c61 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -44,6 +44,7 @@ class Schema: _dlt_tables_prefix: str _stored_version: int # version at load/creation time _stored_version_hash: str # version hash at load/creation time + _stored_previous_hashes: Optional[List[str]] # list of ancestor hashes of the schema _imported_version_hash: str # version hash of recently imported schema _schema_description: str # optional schema description _schema_tables: TSchemaTables @@ -101,7 +102,8 @@ def to_dict(self, remove_defaults: bool = False, bump_version: bool = True) -> T "name": self._schema_name, "tables": self._schema_tables, "settings": self._settings, - "normalizers": self._normalizers_config + "normalizers": self._normalizers_config, + "previous_hashes": self._stored_previous_hashes } if self._imported_version_hash and not remove_defaults: stored_schema["imported_version_hash"] = self._imported_version_hash @@ -353,7 +355,7 @@ def bump_version(self) -> Tuple[int, str]: Returns: Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple """ - self._stored_version, self._stored_version_hash, _ = utils.bump_version_if_modified(self.to_dict(bump_version=False)) + self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified(self.to_dict(bump_version=False)) return self._stored_version, self._stored_version_hash def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: @@ -475,6 +477,11 @@ def version_hash(self) -> str: """Current version hash of the schema, recomputed from the actual content""" return utils.bump_version_if_modified(self.to_dict())[1] + @property + def previous_hashes(self) -> List[str]: + """Current version hash of the schema, recomputed from the actual content""" + return utils.bump_version_if_modified(self.to_dict())[3] + @property def stored_version_hash(self) -> str: """Version hash of the schema content form the time of schema loading/creation.""" @@ -663,6 +670,7 @@ def _reset_schema(self, name: str, normalizers: 
TNormalizersConfig = None) -> No self._stored_version_hash: str = None self._imported_version_hash: str = None self._schema_description: str = None + self._stored_previous_hashes: List[str] = [] self._settings: TSchemaSettings = {} self._compiled_preferred_types: List[Tuple[REPattern, TDataType]] = [] @@ -701,6 +709,7 @@ def _from_stored_schema(self, stored_schema: TStoredSchema) -> None: self._imported_version_hash = stored_schema.get("imported_version_hash") self._schema_description = stored_schema.get("description") self._settings = stored_schema.get("settings") or {} + self._stored_previous_hashes = stored_schema.get("previous_hashes") self._compile_settings() def _set_schema_name(self, name: str) -> None: diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 720313b57b..1b6ef31800 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -11,7 +11,7 @@ # current version of schema engine -SCHEMA_ENGINE_VERSION = 7 +SCHEMA_ENGINE_VERSION = 8 # dlt tables VERSION_TABLE_NAME = "_dlt_version" @@ -123,6 +123,7 @@ class TStoredSchema(TypedDict, total=False): """TypeDict defining the schema representation in storage""" version: int version_hash: str + previous_hashes: List[str] imported_version_hash: Optional[str] engine_version: int name: str diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index 9b4e8fb047..b6a3cca0e2 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -134,7 +134,7 @@ def add_column_defaults(column: TColumnSchemaBase) -> TColumnSchema: # return copy(column) # type: ignore -def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, str]: +def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, str, List[str]]: """Bumps the `stored_schema` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" hash_ = generate_version_hash(stored_schema) previous_hash = stored_schema.get("version_hash") @@ -143,8 +143,13 @@ def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, st pass elif hash_ != previous_hash: stored_schema["version"] += 1 + # unshift previous hash to previous_hashes and limit array to 10 entries + if previous_hash not in stored_schema["previous_hashes"]: + stored_schema["previous_hashes"].insert(0, previous_hash) + stored_schema["previous_hashes"] = stored_schema["previous_hashes"][:10] + stored_schema["version_hash"] = hash_ - return stored_schema["version"], hash_, previous_hash + return stored_schema["version"], hash_, previous_hash, stored_schema["previous_hashes"] def generate_version_hash(stored_schema: TStoredSchema) -> str: @@ -153,6 +158,7 @@ def generate_version_hash(stored_schema: TStoredSchema) -> str: schema_copy.pop("version") schema_copy.pop("version_hash", None) schema_copy.pop("imported_version_hash", None) + schema_copy.pop("previous_hashes", None) # ignore order of elements when computing the hash content = json.dumps(schema_copy, sort_keys=True) h = hashlib.sha3_256(content.encode("utf-8")) @@ -240,6 +246,7 @@ def compile_simple_regexes(r: Iterable[TSimpleRegex]) -> REPattern: def validate_stored_schema(stored_schema: TStoredSchema) -> None: + # use lambda to verify only non extra fields validate_dict_ignoring_xkeys( spec=TStoredSchema, @@ -256,6 +263,7 @@ def validate_stored_schema(stored_schema: TStoredSchema) -> None: def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> TStoredSchema: + if from_engine == to_engine: return 
cast(TStoredSchema, schema_dict) @@ -349,6 +357,9 @@ def migrate_filters(group: str, filters: List[str]) -> None: if not table.get("parent"): table["schema_contract"] = {} from_engine = 7 + if from_engine == 7 and to_engine > 7: + schema_dict["previous_hashes"] = [] + from_engine = 8 schema_dict["engine_version"] = from_engine if from_engine != to_engine: diff --git a/dlt/common/validation.py b/dlt/common/validation.py index b746fda361..312371bbf1 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -22,6 +22,8 @@ def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFil validator_f (TCustomValidator, optional): A function to perform additional validation for types not covered by this function. It should return `True` if the validation passes. Defaults to a function that rejects all such types. + filter_required (TFilterFunc, optional): A function to filter out required fields, useful + for testing historic versions of dict that might now have certain fields yet. Raises: DictValidationException: If there are missing required fields, unexpected fields, diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index b8abbc1d57..1dbfcb4350 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -150,9 +150,6 @@ def decorator(f: Callable[TSourceFunParams, Any]) -> Callable[TSourceFunParams, if name and name != schema.name: raise ExplicitSourceNameInvalid(name, schema.name) - # the name of the source must be identical to the name of the schema - name = schema.name - # wrap source extraction function in configuration with section func_module = inspect.getmodule(f) source_section = section or _get_source_section_name(func_module) @@ -167,16 +164,16 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: # configurations will be accessed in this section in the source proxy = Container()[PipelineContext] pipeline_name = None if not proxy.is_active() else proxy.pipeline().pipeline_name - with inject_section(ConfigSectionContext(pipeline_name=pipeline_name, sections=source_sections, source_state_key=name)): + with inject_section(ConfigSectionContext(pipeline_name=pipeline_name, sections=source_sections, source_state_key=schema.name)): rv = conf_f(*args, **kwargs) if rv is None: - raise SourceDataIsNone(name) + raise SourceDataIsNone(schema.name) # if generator, consume it immediately if inspect.isgenerator(rv): rv = list(rv) # convert to source - s = _impl_cls.from_data(name, source_section, schema.clone(update_normalizers=True), rv) + s = _impl_cls.from_data(schema.clone(update_normalizers=True), source_section, rv) # apply hints if max_table_nesting is not None: s.max_table_nesting = max_table_nesting diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 771e8ca0cc..0ff24d1f86 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -167,22 +167,17 @@ class DltSource(Iterable[TDataItem]): * You can use a `run` method to load the data with a default instance of dlt pipeline. 
* You can get source read only state for the currently active Pipeline instance """ - def __init__(self, name: str, section: str, schema: Schema, resources: Sequence[DltResource] = None) -> None: - self.name = name + def __init__(self, schema: Schema, section: str, resources: Sequence[DltResource] = None) -> None: self.section = section """Tells if iterator associated with a source is exhausted""" self._schema = schema self._resources: DltResourceDict = DltResourceDict(self.name, self.section) - if self.name != schema.name: - # raise ValueError(f"Schema name {schema.name} differs from source name {name}! The explicit source name argument is deprecated and will be soon removed.") - warnings.warn(f"Schema name {schema.name} differs from source name {name}! The explicit source name argument is deprecated and will be soon removed.") - if resources: self.resources.add(*resources) @classmethod - def from_data(cls, name: str, section: str, schema: Schema, data: Any) -> Self: + def from_data(cls, schema: Schema, section: str, data: Any) -> Self: """Converts any `data` supported by `dlt` `run` method into `dlt source` with a name `section`.`name` and `schema` schema.""" # creates source from various forms of data if isinstance(data, DltSource): @@ -194,10 +189,14 @@ def from_data(cls, name: str, section: str, schema: Schema, data: Any) -> Self: else: resources = [DltResource.from_data(data)] - return cls(name, section, schema, resources) + return cls(schema, section, resources) - # TODO: 4 properties below must go somewhere else ie. into RelationalSchema which is Schema + Relational normalizer. + @property + def name(self) -> str: + return self._schema.name + + # TODO: 4 properties below must go somewhere else ie. into RelationalSchema which is Schema + Relational normalizer. @property def max_table_nesting(self) -> int: """A schema hint that sets the maximum depth of nested table above which the remaining nodes are loaded as structs or JSON.""" @@ -328,7 +327,7 @@ def state(self) -> StrAny: def clone(self) -> "DltSource": """Creates a deep copy of the source where copies of schema, resources and pipes are created""" # mind that resources and pipes are cloned when added to the DltResourcesDict in the source constructor - return DltSource(self.name, self.section, self.schema.clone(), list(self._resources.values())) + return DltSource(self.schema.clone(), self.section, list(self._resources.values())) def __iter__(self) -> Iterator[TDataItem]: """Opens iterator that yields the data items from all the resources within the source in the same order as in Pipeline class. diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index b9eb958027..c893fd4e75 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -860,7 +860,7 @@ def append_data(data_item: Any) -> None: # do not set section to prevent source that represent a standalone resource # to overwrite other standalone resources (ie. 
parents) in that source sources.append( - DltSource(effective_schema.name, "", effective_schema, [data_item]) + DltSource(effective_schema, "", [data_item]) ) else: # iterator/iterable/generator @@ -881,7 +881,7 @@ def append_data(data_item: Any) -> None: # add all the appended resources in one source if resources: - sources.append(DltSource(effective_schema.name, self.pipeline_name, effective_schema, resources)) + sources.append(DltSource(effective_schema, self.pipeline_name, resources)) # apply hints and settings for source in sources: @@ -1293,7 +1293,7 @@ def _save_state(self, state: TPipelineState) -> None: def _extract_state(self, state: TPipelineState) -> TPipelineState: # this will extract the state into current load package and update the schema with the _dlt_pipeline_state table # note: the schema will be persisted because the schema saving decorator is over the state manager decorator for extract - state_source = DltSource(self.default_schema.name, self.pipeline_name, self.default_schema, [state_resource(state)]) + state_source = DltSource(self.default_schema, self.pipeline_name, [state_resource(state)]) storage = ExtractorStorage(self._normalize_storage_config) extract_id = extract_with_schema(storage, state_source, _NULL_COLLECTOR, 1, 1) storage.commit_extract_files(extract_id) diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v8.yml b/tests/common/cases/schemas/eth/ethereum_schema_v8.yml new file mode 100644 index 0000000000..928c9a3e54 --- /dev/null +++ b/tests/common/cases/schemas/eth/ethereum_schema_v8.yml @@ -0,0 +1,461 @@ +version: 16 +version_hash: C5An8WClbavalXDdNSqXbdI7Swqh/mTWMcwWKCF//EE= +engine_version: 8 +name: ethereum +tables: + _dlt_loads: + columns: + load_id: + nullable: false + data_type: text + name: load_id + schema_name: + nullable: true + data_type: text + name: schema_name + status: + nullable: false + data_type: bigint + name: status + inserted_at: + nullable: false + data_type: timestamp + name: inserted_at + schema_version_hash: + nullable: true + data_type: text + name: schema_version_hash + write_disposition: skip + description: Created by DLT. Tracks completed loads + schema_contract: {} + name: _dlt_loads + resource: _dlt_loads + _dlt_version: + columns: + version: + nullable: false + data_type: bigint + name: version + engine_version: + nullable: false + data_type: bigint + name: engine_version + inserted_at: + nullable: false + data_type: timestamp + name: inserted_at + schema_name: + nullable: false + data_type: text + name: schema_name + version_hash: + nullable: false + data_type: text + name: version_hash + schema: + nullable: false + data_type: text + name: schema + write_disposition: skip + description: Created by DLT. 
Tracks schema updates + schema_contract: {} + name: _dlt_version + resource: _dlt_version + blocks: + description: Ethereum blocks + x-annotation: this will be preserved on save + write_disposition: append + filters: + includes: [] + excludes: [] + columns: + _dlt_load_id: + nullable: false + description: load id coming from the extractor + data_type: text + name: _dlt_load_id + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + number: + nullable: false + primary_key: true + data_type: bigint + name: number + parent_hash: + nullable: true + data_type: text + name: parent_hash + hash: + nullable: false + cluster: true + unique: true + data_type: text + name: hash + base_fee_per_gas: + nullable: false + data_type: wei + name: base_fee_per_gas + difficulty: + nullable: false + data_type: wei + name: difficulty + extra_data: + nullable: true + data_type: text + name: extra_data + gas_limit: + nullable: false + data_type: bigint + name: gas_limit + gas_used: + nullable: false + data_type: bigint + name: gas_used + logs_bloom: + nullable: true + data_type: binary + name: logs_bloom + miner: + nullable: true + data_type: text + name: miner + mix_hash: + nullable: true + data_type: text + name: mix_hash + nonce: + nullable: true + data_type: text + name: nonce + receipts_root: + nullable: true + data_type: text + name: receipts_root + sha3_uncles: + nullable: true + data_type: text + name: sha3_uncles + size: + nullable: true + data_type: bigint + name: size + state_root: + nullable: false + data_type: text + name: state_root + timestamp: + nullable: false + unique: true + sort: true + data_type: timestamp + name: timestamp + total_difficulty: + nullable: true + data_type: wei + name: total_difficulty + transactions_root: + nullable: false + data_type: text + name: transactions_root + schema_contract: {} + name: blocks + resource: blocks + blocks__transactions: + parent: blocks + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + block_number: + nullable: false + primary_key: true + foreign_key: true + data_type: bigint + name: block_number + transaction_index: + nullable: false + primary_key: true + data_type: bigint + name: transaction_index + hash: + nullable: false + unique: true + data_type: text + name: hash + block_hash: + nullable: false + cluster: true + data_type: text + name: block_hash + block_timestamp: + nullable: false + sort: true + data_type: timestamp + name: block_timestamp + chain_id: + nullable: true + data_type: text + name: chain_id + from: + nullable: true + data_type: text + name: from + gas: + nullable: true + data_type: bigint + name: gas + gas_price: + nullable: true + data_type: bigint + name: gas_price + input: + nullable: true + data_type: text + name: input + max_fee_per_gas: + nullable: true + data_type: wei + name: max_fee_per_gas + max_priority_fee_per_gas: + nullable: true + data_type: wei + name: max_priority_fee_per_gas + nonce: + nullable: true + data_type: bigint + name: nonce + r: + nullable: true + data_type: text + name: r + s: + nullable: true + data_type: text + name: s + status: + nullable: true + data_type: bigint + name: status + to: + nullable: true + data_type: text + name: to + type: + nullable: true + data_type: text + name: type + v: + nullable: true + data_type: bigint + name: v + value: + nullable: false + data_type: wei + name: value + eth_value: + nullable: true + data_type: decimal + name: eth_value + name: blocks__transactions + blocks__transactions__logs: + parent: 
blocks__transactions + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + address: + nullable: false + data_type: text + name: address + block_timestamp: + nullable: false + sort: true + data_type: timestamp + name: block_timestamp + block_hash: + nullable: false + cluster: true + data_type: text + name: block_hash + block_number: + nullable: false + primary_key: true + foreign_key: true + data_type: bigint + name: block_number + transaction_index: + nullable: false + primary_key: true + foreign_key: true + data_type: bigint + name: transaction_index + log_index: + nullable: false + primary_key: true + data_type: bigint + name: log_index + data: + nullable: true + data_type: text + name: data + removed: + nullable: true + data_type: bool + name: removed + transaction_hash: + nullable: false + data_type: text + name: transaction_hash + name: blocks__transactions__logs + blocks__transactions__logs__topics: + parent: blocks__transactions__logs + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + value: + nullable: true + data_type: text + name: value + name: blocks__transactions__logs__topics + blocks__transactions__access_list: + parent: blocks__transactions + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + address: + nullable: true + data_type: text + name: address + name: blocks__transactions__access_list + blocks__transactions__access_list__storage_keys: + parent: blocks__transactions__access_list + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + value: + nullable: true + data_type: text + name: value + name: blocks__transactions__access_list__storage_keys + blocks__uncles: + parent: blocks + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + value: + nullable: true + data_type: text + name: value + name: blocks__uncles +settings: + default_hints: + foreign_key: + - _dlt_parent_id + not_null: + - re:^_dlt_id$ + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + unique: + - _dlt_id + cluster: + - block_hash + partition: + - block_timestamp + root_key: + - _dlt_root_id + preferred_types: + timestamp: timestamp + block_timestamp: timestamp + schema_contract: {} +normalizers: + names: dlt.common.normalizers.names.snake_case + json: + module: dlt.common.normalizers.json.relational + config: + generate_dlt_id: true + propagation: + root: + _dlt_id: _dlt_root_id + 
tables: + blocks: + timestamp: block_timestamp + hash: block_hash +previous_hashes: +- yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE= + diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index f5f406a7a1..a42018f97b 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -132,10 +132,10 @@ def test_simple_regex_validator() -> None: def test_load_corrupted_schema() -> None: - eth_v4: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") - del eth_v4["tables"]["blocks"] + eth_v8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8") + del eth_v8["tables"]["blocks"] with pytest.raises(ParentTableNotFoundException): - utils.validate_stored_schema(eth_v4) + utils.validate_stored_schema(eth_v8) def test_column_name_validator(schema: Schema) -> None: @@ -287,21 +287,31 @@ def test_upgrade_engine_v1_schema() -> None: assert schema_dict["engine_version"] == 2 upgraded = utils.migrate_schema(schema_dict, from_engine=2, to_engine=4) assert upgraded["engine_version"] == 4 - utils.validate_stored_schema(upgraded) # upgrade 1 -> 4 schema_dict = load_json_case("schemas/ev1/event.schema") assert schema_dict["engine_version"] == 1 upgraded = utils.migrate_schema(schema_dict, from_engine=1, to_engine=4) assert upgraded["engine_version"] == 4 - utils.validate_stored_schema(upgraded) # upgrade 1 -> 6 schema_dict = load_json_case("schemas/ev1/event.schema") assert schema_dict["engine_version"] == 1 upgraded = utils.migrate_schema(schema_dict, from_engine=1, to_engine=6) assert upgraded["engine_version"] == 6 - utils.validate_stored_schema(upgraded) + + # upgrade 1 -> 7 + schema_dict = load_json_case("schemas/ev1/event.schema") + assert schema_dict["engine_version"] == 1 + upgraded = utils.migrate_schema(schema_dict, from_engine=1, to_engine=7) + assert upgraded["engine_version"] == 7 + + + # upgrade 1 -> 8 + schema_dict = load_json_case("schemas/ev1/event.schema") + assert schema_dict["engine_version"] == 1 + upgraded = utils.migrate_schema(schema_dict, from_engine=1, to_engine=8) + assert upgraded["engine_version"] == 8 def test_unknown_engine_upgrade() -> None: @@ -581,7 +591,8 @@ def assert_new_schema_values(schema: Schema) -> None: assert schema.stored_version == 1 assert schema.stored_version_hash is not None assert schema.version_hash is not None - assert schema.ENGINE_VERSION == 7 + assert schema.ENGINE_VERSION == 8 + assert schema._stored_previous_hashes == [] assert len(schema.settings["default_hints"]) > 0 # check settings assert utils.standard_type_detections() == schema.settings["detections"] == schema._type_detections diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 4e4278a539..401b463875 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -1,5 +1,6 @@ import pytest import yaml +from copy import deepcopy from dlt.common import json from dlt.common.schema import utils @@ -83,10 +84,10 @@ def test_infer_column_bumps_version() -> None: def test_preserve_version_on_load() -> None: - eth_v7: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") - version = eth_v7["version"] - version_hash = eth_v7["version_hash"] - schema = Schema.from_dict(eth_v7) # type: ignore[arg-type] + eth_v8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8") + version = eth_v8["version"] + version_hash = eth_v8["version_hash"] + schema = Schema.from_dict(eth_v8) # type: ignore[arg-type] # version should not be bumped 
assert version_hash == schema._stored_version_hash assert version_hash == schema.version_hash @@ -95,8 +96,8 @@ def test_preserve_version_on_load() -> None: @pytest.mark.parametrize("remove_defaults", [True, False]) def test_version_preserve_on_reload(remove_defaults: bool) -> None: - eth_v7: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") - schema = Schema.from_dict(eth_v7) # type: ignore[arg-type] + eth_v8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8") + schema = Schema.from_dict(eth_v8) # type: ignore[arg-type] to_save_dict = schema.to_dict(remove_defaults=remove_defaults) assert schema.stored_version == to_save_dict["version"] @@ -122,3 +123,31 @@ def test_version_preserve_on_reload(remove_defaults: bool) -> None: saved_rasa_schema = Schema.from_dict(yaml.safe_load(rasa_yml)) assert saved_rasa_schema.stored_version == rasa_schema.stored_version assert saved_rasa_schema.stored_version_hash == rasa_schema.stored_version_hash + + +def test_create_ancestry() -> None: + eth_v8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8") + schema = Schema.from_dict(eth_v8) # type: ignore[arg-type] + assert schema._stored_previous_hashes == ["yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE="] + version = schema._stored_version + + # modify save and load schema 15 times and check ancestry + expected_previous_hashes = ["yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE="] + for i in range(1,15): + # keep expected previous_hashes + expected_previous_hashes.insert(0, schema._stored_version_hash) + + # update schema + row = {f"float{i}": 78172.128} + _, new_table = schema.coerce_row("event_user", None, row) + schema.update_table(new_table) + schema_dict = schema.to_dict() + schema = Schema.from_stored_schema(schema_dict) + + assert schema._stored_previous_hashes == expected_previous_hashes[:10] + assert schema._stored_version == version + i + + # we never have more than 10 previous_hashes + assert len(schema._stored_previous_hashes) == i + 1 if i + 1 <= 10 else 10 + + assert len(schema._stored_previous_hashes) == 10 \ No newline at end of file diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index a4b6c5c89f..401c22f0bc 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -11,7 +11,7 @@ from dlt.common.storages import SchemaStorageConfiguration, SchemaStorage, LiveSchemaStorage, FileStorage from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT -from tests.common.utils import load_yml_case, yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V7 +from tests.common.utils import load_yml_case, yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V8 @pytest.fixture @@ -87,6 +87,7 @@ def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: Sch assert storage_schema.version == reloaded_schema.stored_version assert storage_schema.version_hash == reloaded_schema.stored_version_hash assert storage_schema._imported_version_hash == reloaded_schema._imported_version_hash + assert storage_schema.previous_hashes == reloaded_schema.previous_hashes # the import schema gets modified storage_schema.tables["_dlt_loads"]["write_disposition"] = "append" storage_schema.tables.pop("event_user") @@ -96,7 +97,11 @@ def test_skip_import_if_not_modified(synced_storage: SchemaStorage, storage: Sch # we have overwritten storage schema assert reloaded_schema.tables["_dlt_loads"]["write_disposition"] == "append" assert "event_user" 
not in reloaded_schema.tables + + # hash and ancestry stay the same assert reloaded_schema._imported_version_hash == storage_schema.version_hash + assert storage_schema.previous_hashes == reloaded_schema.previous_hashes + # but original version has increased assert reloaded_schema.stored_version == storage_schema.version + 1 @@ -194,12 +199,13 @@ def test_save_store_schema_over_import(ie_storage: SchemaStorage) -> None: ie_storage.save_schema(schema) assert schema.version_hash == schema_hash # we linked schema to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V8 # load schema and make sure our new schema is here schema = ie_storage.load_schema("ethereum") - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V8 assert schema._stored_version_hash == schema_hash assert schema.version_hash == schema_hash + assert schema.previous_hashes == [] # we have simple schema in export folder fs = FileStorage(ie_storage.config.export_schema_path) exported_name = ie_storage._file_name_in_store("ethereum", "yaml") @@ -213,12 +219,13 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No schema = Schema("ethereum") schema_hash = schema.version_hash synced_storage.save_schema(schema) - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V8 # import schema is overwritten fs = FileStorage(synced_storage.config.import_schema_path) exported_name = synced_storage._file_name_in_store("ethereum", "yaml") exported_schema = yaml.safe_load(fs.load(exported_name)) assert schema.version_hash == exported_schema["version_hash"] == schema_hash + assert schema.previous_hashes == [] # when it is loaded we will import schema again which is identical to the current one but the import link # will be set to itself schema = synced_storage.load_schema("ethereum") @@ -269,18 +276,18 @@ def test_schema_from_file() -> None: def prepare_import_folder(storage: SchemaStorage) -> None: - shutil.copy(yml_case_path("schemas/eth/ethereum_schema_v7"), os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml")) + shutil.copy(yml_case_path("schemas/eth/ethereum_schema_v8"), os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml")) def assert_schema_imported(synced_storage: SchemaStorage, storage: SchemaStorage) -> Schema: prepare_import_folder(synced_storage) - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") + eth_V8: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v8") schema = synced_storage.load_schema("ethereum") # is linked to imported schema - schema._imported_version_hash = eth_v6["version_hash"] + schema._imported_version_hash = eth_V8["version_hash"] # also was saved in storage assert synced_storage.has_schema("ethereum") # and has link to imported schema s well (load without import) schema = storage.load_schema("ethereum") - assert schema._imported_version_hash == eth_v6["version_hash"] + assert schema._imported_version_hash == eth_V8["version_hash"] return schema diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index 0a034dc72f..f274c82014 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -89,14 +89,14 @@ def test_doc() -> TTestRecord: def test_validate_schema_cases() -> None: - with 
open("tests/common/cases/schemas/eth/ethereum_schema_v7.yml", mode="r", encoding="utf-8") as f: + with open("tests/common/cases/schemas/eth/ethereum_schema_v8.yml", mode="r", encoding="utf-8") as f: schema_dict: TStoredSchema = yaml.safe_load(f) validate_dict_ignoring_xkeys( spec=TStoredSchema, doc=schema_dict, path=".", - validator_f=simple_regex_validator + validator_f=simple_regex_validator, ) # with open("tests/common/cases/schemas/rasa/event.schema.json") as f: diff --git a/tests/common/utils.py b/tests/common/utils.py index d612dcbdcf..db9a8318fb 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -16,7 +16,7 @@ COMMON_TEST_CASES_PATH = "./tests/common/cases/" # for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V7 = "yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE=" +IMPORTED_VERSION_HASH_ETH_V8 = "C5An8WClbavalXDdNSqXbdI7Swqh/mTWMcwWKCF//EE=" # test sentry DSN TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" # preserve secrets path to be able to restore it diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 28f3d34dcf..27cdc3d22d 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -28,7 +28,7 @@ SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable) from dlt.extract.typing import TableNameMeta -from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7 +from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V8 def test_none_returning_source() -> None: @@ -75,7 +75,7 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == s.name # the schema in the associated file has this hash - assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V7 + assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V8 def test_unbound_parametrized_transformer() -> None: diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 7ed74b41f2..a045dd4f3c 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -14,7 +14,7 @@ def test_extract_select_tables() -> None: def expect_tables(resource: DltResource) -> dlt.Schema: # delete files clean_test_storage() - source = DltSource("selectables", "module", dlt.Schema("selectables"), [resource(10)]) + source = DltSource(dlt.Schema("selectables"), "module", [resource(10)]) schema = source.discover_schema() storage = ExtractorStorage(NormalizeStorageConfiguration()) @@ -37,7 +37,7 @@ def expect_tables(resource: DltResource) -> dlt.Schema: clean_test_storage() storage = ExtractorStorage(NormalizeStorageConfiguration()) # same thing but select only odd - source = DltSource("selectables", "module", dlt.Schema("selectables"), [resource]) + source = DltSource(dlt.Schema("selectables"), "module", [resource]) source = source.with_resources(resource.name) source.selected_resources[resource.name].bind(10).select_tables("odd_table") extract_id = storage.create_extract_id() @@ -80,7 +80,7 @@ def input_gen(): yield from [1, 2, 3] input_r = DltResource.from_data(input_gen) - source = DltSource("selectables", "module", dlt.Schema("selectables"), [input_r, input_r.with_name("gen_clone")]) + source = DltSource(dlt.Schema("selectables"), "module", [input_r, input_r.with_name("gen_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() extract(extract_id, source, storage) @@ -99,7 +99,7 @@ def tx_step(item): 
input_r = DltResource.from_data(input_gen) input_tx = DltResource.from_data(tx_step, data_from=DltResource.Empty) - source = DltSource("selectables", "module", dlt.Schema("selectables"), [input_r, (input_r | input_tx).with_name("tx_clone")]) + source = DltSource(dlt.Schema("selectables"), "module", [input_r, (input_r | input_tx).with_name("tx_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() extract(extract_id, source, storage) diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index d03b125777..ec28018add 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -694,14 +694,14 @@ def child(item): # create a source where we place only child child.write_disposition = "replace" - s = DltSource("comp", "section", Schema("comp"), [child]) + s = DltSource(Schema("comp"), "section", [child]) # but extracted resources will include its parent where it derives write disposition from child extracted = s.resources.extracted assert extracted[child.name].write_disposition == "replace" assert extracted[child._pipe.parent.name].write_disposition == "replace" # create a source where we place parent explicitly - s = DltSource("comp", "section", Schema("comp"), [parent_r, child]) + s = DltSource(Schema("comp"), "section", [parent_r, child]) extracted = s.resources.extracted assert extracted[child.name].write_disposition == "replace" # now parent exists separately and has its own write disposition @@ -722,7 +722,7 @@ def child(item): # now we add child that has parent_r as parent but we add another instance of standalone_some_data explicitly # so we have a resource with the same name as child parent but the pipe instance is different - s = DltSource("comp", "section", Schema("comp"), [standalone_some_data(now), child]) + s = DltSource(Schema("comp"), "section", [standalone_some_data(now), child]) assert extracted[child.name].write_disposition == "replace" # now parent exists separately and has its own write disposition - because we search by name to identify matching resource assert extracted[child._pipe.parent.name].write_disposition == "append" diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 130e0a8d93..aae95e0a3f 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -55,7 +55,7 @@ def parametrized(p1, /, p2, *, p3 = None): # as part of the source r = DltResource.from_data(parametrized) - s = DltSource("source", "module", Schema("source"), [r]) + s = DltSource(Schema("source"), "module", [r]) with pytest.raises(ParametrizedResourceUnbound) as py_ex: list(s) @@ -1014,7 +1014,7 @@ def some_data(): yield [1, 2, 3] yield [1, 2, 3] - s = DltSource("source", "module", Schema("source"), [dlt.resource(some_data())]) + s = DltSource(Schema("source"), "module", [dlt.resource(some_data())]) assert s.exhausted is False assert list(s) == [1, 2, 3, 1, 2, 3] assert s.exhausted is True @@ -1028,19 +1028,19 @@ def test_exhausted_property() -> None: # this example will be exhausted after iteration def open_generator_data(): yield from [1, 2, 3, 4] - s = DltSource("source", "module", Schema("source"), [dlt.resource(open_generator_data())]) + s = DltSource(Schema("source"), "module", [dlt.resource(open_generator_data())]) assert s.exhausted is False assert next(iter(s)) == 1 assert s.exhausted is True # lists will not exhaust - s = DltSource("source", "module", Schema("source"), [dlt.resource([1, 2, 3, 4], table_name="table", 
name="resource")]) + s = DltSource(Schema("source"), "module", [dlt.resource([1, 2, 3, 4], table_name="table", name="resource")]) assert s.exhausted is False assert next(iter(s)) == 1 assert s.exhausted is False # iterators will not exhaust - s = DltSource("source", "module", Schema("source"), [dlt.resource(iter([1, 2, 3, 4]), table_name="table", name="resource")]) + s = DltSource(Schema("source"), "module", [dlt.resource(iter([1, 2, 3, 4]), table_name="table", name="resource")]) assert s.exhausted is False assert next(iter(s)) == 1 assert s.exhausted is False @@ -1048,7 +1048,7 @@ def open_generator_data(): # having on exhausted generator resource will make the whole source exhausted def open_generator_data(): # type: ignore[no-redef] yield from [1, 2, 3, 4] - s = DltSource("source", "module", Schema("source"), [ dlt.resource([1, 2, 3, 4], table_name="table", name="resource"), dlt.resource(open_generator_data())]) + s = DltSource(Schema("source"), "module", [ dlt.resource([1, 2, 3, 4], table_name="table", name="resource"), dlt.resource(open_generator_data())]) assert s.exhausted is False # execute the whole source @@ -1239,7 +1239,7 @@ def tx_step(item): input_r_clone = input_r.with_name("input_gen_2") # separate resources have separate pipe instances - source = DltSource("dupes", "module", Schema("dupes"), [input_r, input_r_clone]) + source = DltSource(Schema("dupes"), "module", [input_r, input_r_clone]) pipes = source.resources.pipes assert len(pipes) == 2 assert pipes[0].name == "input_gen" @@ -1250,13 +1250,13 @@ def tx_step(item): assert list(source) == [1, 2, 3, 1, 2, 3] # cloned from fresh resource - source = DltSource("dupes", "module", Schema("dupes"), [DltResource.from_data(input_gen), DltResource.from_data(input_gen).with_name("gen_2")]) + source = DltSource(Schema("dupes"), "module", [DltResource.from_data(input_gen), DltResource.from_data(input_gen).with_name("gen_2")]) assert list(source) == [1, 2, 3, 1, 2, 3] # clone transformer input_r = DltResource.from_data(input_gen) input_tx = DltResource.from_data(tx_step, data_from=DltResource.Empty) - source = DltSource("dupes", "module", Schema("dupes"), [input_r, (input_r | input_tx).with_name("tx_clone")]) + source = DltSource(Schema("dupes"), "module", [input_r, (input_r | input_tx).with_name("tx_clone")]) pipes = source.resources.pipes assert len(pipes) == 2 assert source.resources[pipes[0].name] == source.input_gen diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index c9c6c4c437..1ebb3378a6 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -18,7 +18,7 @@ from tests.utils import TEST_STORAGE_ROOT from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED -from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7, yml_case_path as common_yml_case_path +from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V8, yml_case_path as common_yml_case_path from tests.common.configuration.utils import environment from tests.load.pipeline.utils import assert_query_data, drop_active_pipeline_data from tests.load.utils import destinations_configs, DestinationTestConfiguration, get_normalized_dataset_name @@ -404,7 +404,7 @@ def test_restore_schemas_while_import_schemas_exist(destination_config: Destinat assert normalized_annotations in schema.tables # check if attached to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V8 # 
extract some data with restored pipeline p.run(["C", "D", "E"], table_name="blacklist") assert normalized_labels in schema.tables diff --git a/tests/load/weaviate/test_naming.py b/tests/load/weaviate/test_naming.py index 850f70ee19..dad7fc176f 100644 --- a/tests/load/weaviate/test_naming.py +++ b/tests/load/weaviate/test_naming.py @@ -87,13 +87,13 @@ def test_reserved_property_names() -> None: # print(schema_2.name) # print(schema_2.naming) -# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v7") -# eth_v6_schema = dlt.Schema.from_dict(eth_v6) +# eth_V8 = load_yml_case("schemas/eth/ethereum_schema_v8") +# eth_V8_schema = dlt.Schema.from_dict(eth_V8) -# pipeline.extract(s, schema=eth_v6_schema) +# pipeline.extract(s, schema=eth_V8_schema) -# print(eth_v6_schema.data_tables()) -# print(eth_v6_schema.dlt_tables()) +# print(eth_V8_schema.data_tables()) +# print(eth_V8_schema.dlt_tables()) # def test_x_schema_naming_normalize() -> None: @@ -101,14 +101,14 @@ def test_reserved_property_names() -> None: # print(pipeline.dataset_name) # s = small() -# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v7") -# eth_v6_schema = dlt.Schema.from_dict(eth_v6) +# eth_V8 = load_yml_case("schemas/eth/ethereum_schema_v8") +# eth_V8_schema = dlt.Schema.from_dict(eth_V8) -# pipeline.extract(s, schema=eth_v6_schema) -# print(eth_v6_schema.tables.keys()) +# pipeline.extract(s, schema=eth_V8_schema) +# print(eth_V8_schema.tables.keys()) # default_schema = pipeline.default_schema # print(default_schema.name) -# print(eth_v6_schema.tables.keys()) +# print(eth_V8_schema.tables.keys()) # pipeline.run(s, destination="weaviate") # print(default_schema.tables.keys()) diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 7ac7dcbb34..71f6b5b813 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -55,7 +55,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_pipeline.py")) # hash hash in schema github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) - assert github_schema["engine_version"] == 7 + assert github_schema["engine_version"] == 8 assert "schema_version_hash" in github_schema["tables"][LOADS_TABLE_NAME]["columns"] with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: rows = client.execute_sql(f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at") @@ -81,7 +81,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: pipeline.sync_destination() # print(pipeline.working_dir) # we have updated schema - assert pipeline.default_schema.ENGINE_VERSION == 7 + assert pipeline.default_schema.ENGINE_VERSION == 8 # make sure that schema hash retrieved from the destination is exactly the same as the schema hash that was in storage before the schema was wiped assert pipeline.default_schema.stored_version_hash == github_schema["version_hash"] @@ -114,6 +114,6 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) pipeline = pipeline.drop() pipeline.sync_destination() - assert pipeline.default_schema.ENGINE_VERSION == 7 + assert pipeline.default_schema.ENGINE_VERSION == 8 # schema version does not match `dlt.attach` does not update to the right schema by itself assert pipeline.default_schema.stored_version_hash != 
github_schema["version_hash"] diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 309511b95f..af21eb9f81 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -274,7 +274,7 @@ def some_data(): yield [1, 2, 3] yield [1, 2, 3] - s = DltSource("source", "module", dlt.Schema("source"), [dlt.resource(some_data())]) + s = DltSource(dlt.Schema("source"), "module", [dlt.resource(some_data())]) dlt.pipeline().extract(s) with pytest.raises(PipelineStepFailed) as py_ex: dlt.pipeline().extract(s) @@ -289,7 +289,7 @@ def test_disable_enable_state_sync(environment: Any) -> None: def some_data(): yield [1, 2, 3] - s = DltSource("default", "module", dlt.Schema("default"), [dlt.resource(some_data())]) + s = DltSource(dlt.Schema("default"), "module", [dlt.resource(some_data())]) dlt.pipeline().extract(s) storage = ExtractorStorage(p._normalize_storage_config) assert len(storage.list_files_to_normalize_sorted()) == 1 @@ -299,14 +299,14 @@ def some_data(): p.config.restore_from_destination = True # extract to different schema, state must go to default schema - s = DltSource("default_2", "module", dlt.Schema("default_2"), [dlt.resource(some_data())]) + s = DltSource(dlt.Schema("default_2"), "module", [dlt.resource(some_data())]) dlt.pipeline().extract(s) expect_extracted_file(storage, "default", s.schema.state_table_name, "***") def test_extract_multiple_sources() -> None: - s1 = DltSource("default", "module", dlt.Schema("default"), [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")]) - s2 = DltSource("default_2", "module", dlt.Schema("default_2"), [dlt.resource([6, 7, 8], name="resource_3"), dlt.resource([9, 10, 0], name="resource_4")]) + s1 = DltSource(dlt.Schema("default"), "module", [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")]) + s2 = DltSource(dlt.Schema("default_2"),"module", [dlt.resource([6, 7, 8], name="resource_3"), dlt.resource([9, 10, 0], name="resource_4")]) p = dlt.pipeline(destination="dummy") p.config.restore_from_destination = False @@ -325,8 +325,8 @@ def test_extract_multiple_sources() -> None: def i_fail(): raise NotImplementedError() - s3 = DltSource("default_3", "module", dlt.Schema("default_3"), [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")]) - s4 = DltSource("default_4", "module", dlt.Schema("default_4"), [dlt.resource([6, 7, 8], name="resource_3"), i_fail]) + s3 = DltSource(dlt.Schema("default_3"), "module", [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")]) + s4 = DltSource(dlt.Schema("default_4"), "module", [dlt.resource([6, 7, 8], name="resource_3"), i_fail]) with pytest.raises(PipelineStepFailed): # NOTE: if you swap s3 and s4 the test on list_schemas will fail: s3 will extract normally and update live schemas, s4 will break exec later diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index cd3e2444c8..5644a32d2f 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -298,14 +298,14 @@ def data(): def test_extract_data_describe() -> None: schema = Schema("test") - assert describe_extract_data(DltSource("sss_extract", "sect", schema)) == [{"name": "sss_extract", "data_type": "source"}] + assert describe_extract_data(DltSource(schema, "sect")) == [{"name": "test", "data_type": "source"}] assert describe_extract_data(DltResource(Pipe("rrr_extract"), None, False)) == [{"name": 
"rrr_extract", "data_type": "resource"}] - assert describe_extract_data([DltSource("sss_extract", "sect", schema)]) == [{"name": "sss_extract", "data_type": "source"}] + assert describe_extract_data([DltSource(schema, "sect")]) == [{"name": "test", "data_type": "source"}] assert describe_extract_data([DltResource(Pipe("rrr_extract"), None, False)]) == [{"name": "rrr_extract", "data_type": "resource"}] assert describe_extract_data( - [DltResource(Pipe("rrr_extract"), None, False), DltSource("sss_extract", "sect", schema)] + [DltResource(Pipe("rrr_extract"), None, False), DltSource(schema, "sect")] ) == [ - {"name": "rrr_extract", "data_type": "resource"}, {"name": "sss_extract", "data_type": "source"} + {"name": "rrr_extract", "data_type": "resource"}, {"name": "test", "data_type": "source"} ] assert describe_extract_data([{"a": "b"}]) == [{"name": "", "data_type": "dict"}] from pandas import DataFrame @@ -313,7 +313,7 @@ def test_extract_data_describe() -> None: assert describe_extract_data([DataFrame(), {"a": "b"}]) == [{"name": "", "data_type": "DataFrame"}] # first unnamed element in the list breaks checking info assert describe_extract_data( - [DltResource(Pipe("rrr_extract"), None, False), DataFrame(), DltSource("sss_extract", "sect", schema)] + [DltResource(Pipe("rrr_extract"), None, False), DataFrame(), DltSource(schema, "sect")] ) == [ {"name": "rrr_extract", "data_type": "resource"}, {"name": "", "data_type": "DataFrame"} ] From 35e641667bc3ddc5c803580f69c0e17229f29bb1 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 24 Aug 2023 16:46:39 +0200 Subject: [PATCH 02/10] add black and makefile commands --- Makefile | 6 ++++++ pyproject.toml | 19 ++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 66c429743b..9055170e8e 100644 --- a/Makefile +++ b/Makefile @@ -48,11 +48,17 @@ dev: has-poetry lint: ./check-package.sh + poetry run black ./ --diff --exclude=".*syntax_error.py" + poetry run isort ./ --diff poetry run mypy --config-file mypy.ini dlt tests poetry run flake8 --max-line-length=200 dlt poetry run flake8 --max-line-length=200 tests --exclude tests/reflection/module_cases # $(MAKE) lint-security +format: + poetry run black ./ --exclude=".*syntax_error.py" + poetry run isort ./ + test-and-lint-snippets: poetry run mypy --config-file mypy.ini docs/website docs/examples poetry run flake8 --max-line-length=200 docs/website docs/examples diff --git a/pyproject.toml b/pyproject.toml index 6798df3696..5bc0811b69 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,6 @@ adlfs = {version = ">=2022.4.0", optional = true} pyodbc = {version = "^4.0.39", optional = true} qdrant-client = {version = "^1.6.4", optional = true, extras = ["fastembed"]} - [tool.poetry.extras] dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community"] gcp = ["grpcio", "google-cloud-bigquery", "db-dtypes", "gcsfs"] @@ -142,6 +141,8 @@ enlighten = "^1.11.2" alive-progress = "^3.1.1" pydantic = ">2" pandas = ">2" +black = "^23.7.0" +isort = "^5.12.0" [tool.poetry.group.airflow] optional = true @@ -173,6 +174,22 @@ dbt-core=">=1.2.0" dbt-duckdb=">=1.2.0" pymongo = ">=4.3.3" +[tool.black] # https://black.readthedocs.io/en/stable/usage_and_configuration/the_basics.html#configuration-via-a-file +line-length = 100 +preview = true + +[tool.isort] # https://pycqa.github.io/isort/docs/configuration/options.html +color_output = true +line_length = 100 +profile = "black" +src_paths = ["dlt"] +multi_line_output = 3 + 
+[tool.ruff] # https://beta.ruff.rs/docs/ +line-length = 100 +ignore = ["F401"] +ignore-init-module-imports = true + [build-system] requires = ["poetry-core>=1.0.8"] build-backend = "poetry.core.masonry.api" \ No newline at end of file From 57e5b224b79642c301817c11640ccd9b6da0019e Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 22 Nov 2023 16:29:17 +0100 Subject: [PATCH 03/10] post rebase lockfile update --- poetry.lock | 7758 ++++++++++++++++++++++++--------------------------- 1 file changed, 3708 insertions(+), 4050 deletions(-) diff --git a/poetry.lock b/poetry.lock index 018c1357fe..0d05ead4b0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,18 +1,26 @@ +# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. + [[package]] name = "about-time" version = "4.2.1" description = "Easily measure timing and throughput of code blocks, with beautiful human friendly representations." -category = "dev" optional = false python-versions = ">=3.7, <4" +files = [ + {file = "about-time-4.2.1.tar.gz", hash = "sha256:6a538862d33ce67d997429d14998310e1dbfda6cb7d9bbfbf799c4709847fece"}, + {file = "about_time-4.2.1-py3-none-any.whl", hash = "sha256:8bbf4c75fe13cbd3d72f49a03b02c5c7dca32169b6d49117c257e7eb3eaee341"}, +] [[package]] name = "adlfs" version = "2023.8.0" description = "Access Azure Datalake Gen1 with fsspec and dask" -category = "main" optional = true python-versions = ">=3.8" +files = [ + {file = "adlfs-2023.8.0-py3-none-any.whl", hash = "sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e"}, + {file = "adlfs-2023.8.0.tar.gz", hash = "sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9"}, +] [package.dependencies] aiohttp = ">=3.7.0" @@ -29,9 +37,12 @@ docs = ["furo", "myst-parser", "numpydoc", "sphinx"] name = "agate" version = "1.6.3" description = "A data analysis library that is optimized for humans instead of machines." 
-category = "main" optional = false python-versions = "*" +files = [ + {file = "agate-1.6.3-py2.py3-none-any.whl", hash = "sha256:2d568fd68a8eb8b56c805a1299ba4bc30ca0434563be1bea309c9d1c1c8401f4"}, + {file = "agate-1.6.3.tar.gz", hash = "sha256:e0f2f813f7e12311a4cdccc97d6ba0a6781e9c1aa8eca0ab00d5931c0113a308"}, +] [package.dependencies] Babel = ">=2.0" @@ -50,9 +61,12 @@ test = ["PyICU (>=2.4.2)", "coverage (>=3.7.1)", "cssselect (>=0.9.1)", "lxml (> name = "aiobotocore" version = "2.5.2" description = "Async client for aws services using botocore and aiohttp" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "aiobotocore-2.5.2-py3-none-any.whl", hash = "sha256:337429ffd3cc367532572d40be809a84c7b5335f3f8eca2f23e09dfaa9a9ef90"}, + {file = "aiobotocore-2.5.2.tar.gz", hash = "sha256:e7399f21570db1c287f1c0c814dd3475dfe1c8166722e2c77ce67f172cbcfa89"}, +] [package.dependencies] aiohttp = ">=3.3.1,<4.0.0" @@ -68,9 +82,97 @@ boto3 = ["boto3 (>=1.26.161,<1.26.162)"] name = "aiohttp" version = "3.8.5" description = "Async http client/server framework (asyncio)" -category = "main" optional = false python-versions = ">=3.6" +files = [ + {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"}, + {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"}, + {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"}, + {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"}, + {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"}, + {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"}, + {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"}, + {file = 
"aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"}, + {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"}, + {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"}, + {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"}, + {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"}, + {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"}, + {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"}, + {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"}, + {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = 
"sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"}, + {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"}, + {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"}, + {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"}, + {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"}, + {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"}, + {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"}, + {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"}, + {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"}, + {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"}, + {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"}, + {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"}, + {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"}, + {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"}, + {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"}, + {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"}, + {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"}, + {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"}, + {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"}, + {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"}, + {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"}, + {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"}, +] [package.dependencies] aiosignal = ">=1.1.2" @@ -88,9 +190,12 @@ speedups = ["Brotli", "aiodns", "cchardet"] name = "aioitertools" version = "0.11.0" description = "itertools and builtins for AsyncIO and mixed iterables" -category = "main" optional = true python-versions = ">=3.6" +files = [ + {file = "aioitertools-0.11.0-py3-none-any.whl", hash = "sha256:04b95e3dab25b449def24d7df809411c10e62aab0cbe31a50ca4e68748c43394"}, + {file = "aioitertools-0.11.0.tar.gz", hash = "sha256:42c68b8dd3a69c2bf7f2233bf7df4bb58b557bca5252ac02ed5187bbc67d6831"}, +] [package.dependencies] typing_extensions = {version = ">=4.0", markers = "python_version < \"3.10\""} @@ -99,9 +204,12 @@ typing_extensions = {version = ">=4.0", markers = "python_version < \"3.10\""} name = "aiosignal" version = "1.3.1" description = "aiosignal: a list of registered asynchronous callbacks" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, + {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, +] [package.dependencies] frozenlist = ">=1.1.0" @@ -110,9 +218,12 @@ frozenlist = ">=1.1.0" name = "alembic" version = "1.12.0" description = "A database migration tool for SQLAlchemy." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "alembic-1.12.0-py3-none-any.whl", hash = "sha256:03226222f1cf943deee6c85d9464261a6c710cd19b4fe867a3ad1f25afda610f"}, + {file = "alembic-1.12.0.tar.gz", hash = "sha256:8e7645c32e4f200675e69f0745415335eb59a3663f5feb487abfa0b30c45888b"}, +] [package.dependencies] importlib-metadata = {version = "*", markers = "python_version < \"3.9\""} @@ -128,9 +239,12 @@ tz = ["python-dateutil"] name = "alive-progress" version = "3.1.4" description = "A new kind of Progress Bar, with real-time throughput, ETA, and very cool animations!" 
-category = "dev" optional = false python-versions = ">=3.7, <4" +files = [ + {file = "alive-progress-3.1.4.tar.gz", hash = "sha256:74a95d8d0d42bc99d3a3725dbd06ebb852245f1b64e301a7c375b92b22663f7b"}, + {file = "alive_progress-3.1.4-py3-none-any.whl", hash = "sha256:c80ad87ce9c1054b01135a87fae69ecebbfc2107497ae87cbe6aec7e534903db"}, +] [package.dependencies] about-time = "4.2.1" @@ -140,9 +254,12 @@ grapheme = "0.6.0" name = "annotated-types" version = "0.6.0" description = "Reusable constraint types to use with typing.Annotated" -category = "main" optional = false python-versions = ">=3.8" +files = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] [package.dependencies] typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} @@ -151,17 +268,23 @@ typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} name = "ansicon" version = "1.89.0" description = "Python wrapper for loading Jason Hood's ANSICON" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "ansicon-1.89.0-py2.py3-none-any.whl", hash = "sha256:f1def52d17f65c2c9682cf8370c03f541f410c1752d6a14029f97318e4b9dfec"}, + {file = "ansicon-1.89.0.tar.gz", hash = "sha256:e4d039def5768a47e4afec8e89e83ec3ae5a26bf00ad851f914d1240b444d2b1"}, +] [[package]] name = "anyio" version = "4.0.0" description = "High level compatibility layer for multiple asynchronous event loop implementations" -category = "main" optional = false python-versions = ">=3.8" +files = [ + {file = "anyio-4.0.0-py3-none-any.whl", hash = "sha256:cfdb2b588b9fc25ede96d8db56ed50848b0b649dca3dd1df0b11f683bb9e0b5f"}, + {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, +] [package.dependencies] exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} @@ -177,9 +300,12 @@ trio = ["trio (>=0.22)"] name = "apache-airflow" version = "2.7.2" description = "Programmatically author, schedule and monitor data pipelines" -category = "dev" optional = false python-versions = "~=3.8" +files = [ + {file = "apache-airflow-2.7.2.tar.gz", hash = "sha256:c6fab3449066867d9a7728f40b6b9e27f1ea68bca39b064a27f5c5ddc3262224"}, + {file = "apache_airflow-2.7.2-py3-none-any.whl", hash = "sha256:1bc2c022bcae24b911e49fafd5fb619b49efba87ed7bc8561a2065810d8fe899"}, +] [package.dependencies] alembic = ">=1.6.3,<2.0" @@ -382,9 +508,12 @@ zendesk = ["apache-airflow-providers-zendesk"] name = "apache-airflow-providers-common-sql" version = "1.7.1" description = "Provider for Apache Airflow. Implements apache-airflow-providers-common-sql package" -category = "dev" optional = false python-versions = "~=3.8" +files = [ + {file = "apache-airflow-providers-common-sql-1.7.1.tar.gz", hash = "sha256:ba37f795d9656a87cf4661edc381b8ecfe930272c59324b59f8a158fd0971aeb"}, + {file = "apache_airflow_providers_common_sql-1.7.1-py3-none-any.whl", hash = "sha256:36da2f51b51a64765b0ed5e6a5fece8eaa3ca173dfbff803e2fe2a0afbb90944"}, +] [package.dependencies] apache-airflow = ">=2.4.0" @@ -398,9 +527,12 @@ pandas = ["pandas (>=0.17.1)"] name = "apache-airflow-providers-ftp" version = "3.5.1" description = "Provider for Apache Airflow. 
Implements apache-airflow-providers-ftp package" -category = "dev" optional = false python-versions = "~=3.8" +files = [ + {file = "apache-airflow-providers-ftp-3.5.1.tar.gz", hash = "sha256:dc6dc524dc7454857a0812154d7540172e36db3a87e48a4a91918ebf80898bbf"}, + {file = "apache_airflow_providers_ftp-3.5.1-py3-none-any.whl", hash = "sha256:e4ea77d6276355acfe2392c12155db7b9d51be460b7673b616dc1d8bee03c1d7"}, +] [package.dependencies] apache-airflow = ">=2.4.0" @@ -412,9 +544,12 @@ openlineage = ["apache-airflow-providers-openlineage"] name = "apache-airflow-providers-http" version = "4.5.1" description = "Provider for Apache Airflow. Implements apache-airflow-providers-http package" -category = "dev" optional = false python-versions = "~=3.8" +files = [ + {file = "apache-airflow-providers-http-4.5.1.tar.gz", hash = "sha256:ec90920ff980fc264af9811dc72c37ef272bcdb3d007c7114e12366559426460"}, + {file = "apache_airflow_providers_http-4.5.1-py3-none-any.whl", hash = "sha256:702f26938bc22684eefecd297c2b0809793f9e43b8d911d807a29f21e69da179"}, +] [package.dependencies] aiohttp = "*" @@ -427,9 +562,12 @@ requests-toolbelt = "*" name = "apache-airflow-providers-imap" version = "3.3.1" description = "Provider for Apache Airflow. Implements apache-airflow-providers-imap package" -category = "dev" optional = false python-versions = "~=3.8" +files = [ + {file = "apache-airflow-providers-imap-3.3.1.tar.gz", hash = "sha256:40bac2a75e4dfbcd7d397776d90d03938facaf2707acc6cc119a8db684e53f77"}, + {file = "apache_airflow_providers_imap-3.3.1-py3-none-any.whl", hash = "sha256:adb6ef7864a5a8e245fbbd555bb4ef1eecf5b094d6d23ca0edc5f0aded50490d"}, +] [package.dependencies] apache-airflow = ">=2.4.0" @@ -438,9 +576,12 @@ apache-airflow = ">=2.4.0" name = "apache-airflow-providers-sqlite" version = "3.4.3" description = "Provider for Apache Airflow. Implements apache-airflow-providers-sqlite package" -category = "dev" optional = false python-versions = "~=3.8" +files = [ + {file = "apache-airflow-providers-sqlite-3.4.3.tar.gz", hash = "sha256:347d2db03eaa5ea9fef414666565ffa5e849935cbc30e37237edcaa822b5ced8"}, + {file = "apache_airflow_providers_sqlite-3.4.3-py3-none-any.whl", hash = "sha256:4ffa6a50f0ea1b4e51240b657dfec3fb026c87bdfa71af908a56461df6a6f2e0"}, +] [package.dependencies] apache-airflow = ">=2.4.0" @@ -453,9 +594,12 @@ common-sql = ["apache-airflow-providers-common-sql"] name = "apispec" version = "6.3.0" description = "A pluggable API specification generator. Currently supports the OpenAPI Specification (f.k.a. the Swagger specification)." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "apispec-6.3.0-py3-none-any.whl", hash = "sha256:95a0b9355785df998bb0e9b939237a30ee4c7428fd6ef97305eae3da06b9b339"}, + {file = "apispec-6.3.0.tar.gz", hash = "sha256:6cb08d92ce73ff0b3bf46cb2ea5c00d57289b0f279fb0256a3df468182ba5344"}, +] [package.dependencies] packaging = ">=21.3" @@ -474,17 +618,23 @@ yaml = ["PyYAML (>=3.10)"] name = "appdirs" version = "1.4.4" description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
-category = "dev" optional = false python-versions = "*" +files = [ + {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, + {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, +] [[package]] name = "argcomplete" version = "3.1.1" description = "Bash tab completion for argparse" -category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "argcomplete-3.1.1-py3-none-any.whl", hash = "sha256:35fa893a88deea85ea7b20d241100e64516d6af6d7b0ae2bed1d263d26f70948"}, + {file = "argcomplete-3.1.1.tar.gz", hash = "sha256:6c4c563f14f01440aaffa3eae13441c5db2357b5eec639abe7c0b15334627dff"}, +] [package.extras] test = ["coverage", "mypy", "pexpect", "ruff", "wheel"] @@ -493,9 +643,12 @@ test = ["coverage", "mypy", "pexpect", "ruff", "wheel"] name = "asgiref" version = "3.7.2" description = "ASGI specs, helper code, and adapters" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "asgiref-3.7.2-py3-none-any.whl", hash = "sha256:89b2ef2247e3b562a16eef663bc0e2e703ec6468e2fa8a5cd61cd449786d4f6e"}, + {file = "asgiref-3.7.2.tar.gz", hash = "sha256:9e0ce3aa93a819ba5b45120216b23878cf6e8525eb3848653452b4192b92afed"}, +] [package.dependencies] typing-extensions = {version = ">=4", markers = "python_version < \"3.11\""} @@ -507,17 +660,23 @@ tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"] name = "asn1crypto" version = "1.5.1" description = "Fast ASN.1 parser and serializer with definitions for private keys, public keys, certificates, CRL, OCSP, CMS, PKCS#3, PKCS#7, PKCS#8, PKCS#12, PKCS#5, X.509 and TSP" -category = "main" optional = true python-versions = "*" +files = [ + {file = "asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67"}, + {file = "asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c"}, +] [[package]] name = "astatine" version = "0.3.3" description = "Some handy helper functions for Python's AST module." 
-category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "astatine-0.3.3-py3-none-any.whl", hash = "sha256:6d8c914f01fbea252cb8f31563f2e766a9ab03c02b9bcc37d18f7d9138828401"}, + {file = "astatine-0.3.3.tar.gz", hash = "sha256:0c58a7844b5890ff16da07dbfeb187341d8324cb4378940f89d795cbebebce08"}, +] [package.dependencies] asttokens = ">=1.1" @@ -527,9 +686,12 @@ domdf-python-tools = ">=2.7.0" name = "asttokens" version = "2.3.0" description = "Annotate AST trees with source code positions" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "asttokens-2.3.0-py2.py3-none-any.whl", hash = "sha256:bef1a51bc256d349e9f94e7e40e44b705ed1162f55294220dd561d24583d9877"}, + {file = "asttokens-2.3.0.tar.gz", hash = "sha256:2552a88626aaa7f0f299f871479fc755bd4e7c11e89078965e928fb7bb9a6afe"}, +] [package.dependencies] six = ">=1.12.0" @@ -541,9 +703,12 @@ test = ["astroid", "pytest"] name = "astunparse" version = "1.6.3" description = "An AST unparser for Python" -category = "main" optional = false python-versions = "*" +files = [ + {file = "astunparse-1.6.3-py2.py3-none-any.whl", hash = "sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8"}, + {file = "astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872"}, +] [package.dependencies] six = ">=1.6.1,<2.0" @@ -553,25 +718,33 @@ wheel = ">=0.23.0,<1.0" name = "async-timeout" version = "4.0.3" description = "Timeout context manager for asyncio programs" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, + {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, +] [[package]] name = "atomicwrites" version = "1.4.1" description = "Atomic file writes." -category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, +] [[package]] name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, + {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, +] [package.extras] cov = ["attrs[tests]", "coverage[toml] (>=5.3)"] @@ -584,9 +757,12 @@ tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pyte name = "authlib" version = "1.2.1" description = "The ultimate Python library in building OAuth and OpenID Connect servers and clients." 
-category = "main" optional = true python-versions = "*" +files = [ + {file = "Authlib-1.2.1-py2.py3-none-any.whl", hash = "sha256:c88984ea00149a90e3537c964327da930779afa4564e354edfd98410bea01911"}, + {file = "Authlib-1.2.1.tar.gz", hash = "sha256:421f7c6b468d907ca2d9afede256f068f87e34d23dd221c07d13d4c234726afb"}, +] [package.dependencies] cryptography = ">=3.2" @@ -595,9 +771,12 @@ cryptography = ">=3.2" name = "azure-core" version = "1.29.3" description = "Microsoft Azure Core Library for Python" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "azure-core-1.29.3.tar.gz", hash = "sha256:c92700af982e71c8c73de9f4c20da8b3f03ce2c22d13066e4d416b4629c87903"}, + {file = "azure_core-1.29.3-py3-none-any.whl", hash = "sha256:f8b2910f92b66293d93bd00564924ad20ad48f4a1e150577cf18d1e7d4f9263c"}, +] [package.dependencies] requests = ">=2.18.4" @@ -611,9 +790,12 @@ aio = ["aiohttp (>=3.0)"] name = "azure-datalake-store" version = "0.0.53" description = "Azure Data Lake Store Filesystem Client Library for Python" -category = "main" optional = true python-versions = "*" +files = [ + {file = "azure-datalake-store-0.0.53.tar.gz", hash = "sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393"}, + {file = "azure_datalake_store-0.0.53-py2.py3-none-any.whl", hash = "sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b"}, +] [package.dependencies] cffi = "*" @@ -624,9 +806,12 @@ requests = ">=2.20.0" name = "azure-identity" version = "1.14.0" description = "Microsoft Azure Identity Library for Python" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "azure-identity-1.14.0.zip", hash = "sha256:72441799f8c5c89bfe21026965e266672a7c5d050c2c65119ef899dd5362e2b1"}, + {file = "azure_identity-1.14.0-py3-none-any.whl", hash = "sha256:edabf0e010eb85760e1dd19424d5e8f97ba2c9caff73a16e7b30ccbdbcce369b"}, +] [package.dependencies] azure-core = ">=1.11.0,<2.0.0" @@ -638,9 +823,12 @@ msal-extensions = ">=0.3.0,<2.0.0" name = "azure-storage-blob" version = "12.17.0" description = "Microsoft Azure Blob Storage Client Library for Python" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "azure-storage-blob-12.17.0.zip", hash = "sha256:c14b785a17050b30fc326a315bdae6bc4a078855f4f94a4c303ad74a48dc8c63"}, + {file = "azure_storage_blob-12.17.0-py3-none-any.whl", hash = "sha256:0016e0c549a80282d7b4920c03f2f4ba35c53e6e3c7dbcd2a4a8c8eb3882c1e7"}, +] [package.dependencies] azure-core = ">=1.28.0,<2.0.0" @@ -655,9 +843,12 @@ aio = ["azure-core[aio] (>=1.28.0,<2.0.0)"] name = "babel" version = "2.12.1" description = "Internationalization utilities" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "Babel-2.12.1-py3-none-any.whl", hash = "sha256:b4246fb7677d3b98f501a39d43396d3cafdc8eadb045f4a31be01863f655c610"}, + {file = "Babel-2.12.1.tar.gz", hash = "sha256:cc2d99999cd01d44420ae725a21c9e3711b3aadc7976d6147f622d8581963455"}, +] [package.dependencies] pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} @@ -666,17 +857,23 @@ pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} name = "backoff" version = "2.2.1" description = "Function decoration for backoff and retry" -category = "dev" optional = false python-versions = ">=3.7,<4.0" +files = [ + {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, + {file = "backoff-2.2.1.tar.gz", hash = 
"sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, +] [[package]] name = "bandit" version = "1.7.5" description = "Security oriented static analyser for python code." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "bandit-1.7.5-py3-none-any.whl", hash = "sha256:75665181dc1e0096369112541a056c59d1c5f66f9bb74a8d686c3c362b83f549"}, + {file = "bandit-1.7.5.tar.gz", hash = "sha256:bdfc739baa03b880c2d15d0431b31c658ffc348e907fe197e54e0389dd59e11e"}, +] [package.dependencies] colorama = {version = ">=0.3.9", markers = "platform_system == \"Windows\""} @@ -694,9 +891,12 @@ yaml = ["PyYAML"] name = "beautifulsoup4" version = "4.12.2" description = "Screen-scraping library" -category = "main" optional = true python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, + {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, +] [package.dependencies] soupsieve = ">1.2" @@ -709,9 +909,32 @@ lxml = ["lxml"] name = "black" version = "23.9.1" description = "The uncompromising code formatter." -category = "dev" optional = false python-versions = ">=3.8" +files = [ + {file = "black-23.9.1-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:d6bc09188020c9ac2555a498949401ab35bb6bf76d4e0f8ee251694664df6301"}, + {file = "black-23.9.1-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:13ef033794029b85dfea8032c9d3b92b42b526f1ff4bf13b2182ce4e917f5100"}, + {file = "black-23.9.1-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:75a2dc41b183d4872d3a500d2b9c9016e67ed95738a3624f4751a0cb4818fe71"}, + {file = "black-23.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13a2e4a93bb8ca74a749b6974925c27219bb3df4d42fc45e948a5d9feb5122b7"}, + {file = "black-23.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:adc3e4442eef57f99b5590b245a328aad19c99552e0bdc7f0b04db6656debd80"}, + {file = "black-23.9.1-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:8431445bf62d2a914b541da7ab3e2b4f3bc052d2ccbf157ebad18ea126efb91f"}, + {file = "black-23.9.1-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:8fc1ddcf83f996247505db6b715294eba56ea9372e107fd54963c7553f2b6dfe"}, + {file = "black-23.9.1-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:7d30ec46de88091e4316b17ae58bbbfc12b2de05e069030f6b747dfc649ad186"}, + {file = "black-23.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:031e8c69f3d3b09e1aa471a926a1eeb0b9071f80b17689a655f7885ac9325a6f"}, + {file = "black-23.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:538efb451cd50f43aba394e9ec7ad55a37598faae3348d723b59ea8e91616300"}, + {file = "black-23.9.1-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:638619a559280de0c2aa4d76f504891c9860bb8fa214267358f0a20f27c12948"}, + {file = "black-23.9.1-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:a732b82747235e0542c03bf352c126052c0fbc458d8a239a94701175b17d4855"}, + {file = "black-23.9.1-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:cf3a4d00e4cdb6734b64bf23cd4341421e8953615cba6b3670453737a72ec204"}, + {file = "black-23.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf99f3de8b3273a8317681d8194ea222f10e0133a24a7548c73ce44ea1679377"}, + {file = "black-23.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:14f04c990259576acd093871e7e9b14918eb28f1866f91968ff5524293f9c573"}, + {file = 
"black-23.9.1-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:c619f063c2d68f19b2d7270f4cf3192cb81c9ec5bc5ba02df91471d0b88c4c5c"}, + {file = "black-23.9.1-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:6a3b50e4b93f43b34a9d3ef00d9b6728b4a722c997c99ab09102fd5efdb88325"}, + {file = "black-23.9.1-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c46767e8df1b7beefb0899c4a95fb43058fa8500b6db144f4ff3ca38eb2f6393"}, + {file = "black-23.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50254ebfa56aa46a9fdd5d651f9637485068a1adf42270148cd101cdf56e0ad9"}, + {file = "black-23.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:403397c033adbc45c2bd41747da1f7fc7eaa44efbee256b53842470d4ac5a70f"}, + {file = "black-23.9.1-py3-none-any.whl", hash = "sha256:6ccd59584cc834b6d127628713e4b6b968e5f79572da66284532525a042549f9"}, + {file = "black-23.9.1.tar.gz", hash = "sha256:24b6b3ff5c6d9ea08a8888f6977eae858e1f340d7260cf56d70a49823236b62d"}, +] [package.dependencies] click = ">=8.0.0" @@ -732,9 +955,12 @@ uvloop = ["uvloop (>=0.15.2)"] name = "blessed" version = "1.20.0" description = "Easy, practical library for making terminal apps, by providing an elegant, well-documented interface to Colors, Keyboard input, and screen Positioning capabilities." -category = "dev" optional = false python-versions = ">=2.7" +files = [ + {file = "blessed-1.20.0-py2.py3-none-any.whl", hash = "sha256:0c542922586a265e699188e52d5f5ac5ec0dd517e5a1041d90d2bbf23f906058"}, + {file = "blessed-1.20.0.tar.gz", hash = "sha256:2cdd67f8746e048f00df47a2880f4d6acbcdb399031b604e34ba8f71d5787680"}, +] [package.dependencies] jinxed = {version = ">=1.1.0", markers = "platform_system == \"Windows\""} @@ -745,17 +971,23 @@ wcwidth = ">=0.1.4" name = "blinker" version = "1.6.2" description = "Fast, simple object-to-object and broadcast signaling" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "blinker-1.6.2-py3-none-any.whl", hash = "sha256:c3d739772abb7bc2860abf5f2ec284223d9ad5c76da018234f6f50d6f31ab1f0"}, + {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"}, +] [[package]] name = "boto3" version = "1.26.161" description = "The AWS SDK for Python" -category = "main" optional = true python-versions = ">= 3.7" +files = [ + {file = "boto3-1.26.161-py3-none-any.whl", hash = "sha256:f66e5c9dbe7f34383bcf64fa6070771355c11a44dd75c7f1279f2f37e1c89183"}, + {file = "boto3-1.26.161.tar.gz", hash = "sha256:662731e464d14af1035f44fc6a46b0e3112ee011ac0a5ed416d205daa3e15f25"}, +] [package.dependencies] botocore = ">=1.29.161,<1.30.0" @@ -769,9 +1001,12 @@ crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] name = "boto3-stubs" version = "1.28.40" description = "Type annotations for boto3 1.28.40 generated with mypy-boto3-builder 7.18.2" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "boto3-stubs-1.28.40.tar.gz", hash = "sha256:76079a82f199087319762c931f13506e02129132e80257dab0888d3da7dc11c7"}, + {file = "boto3_stubs-1.28.40-py3-none-any.whl", hash = "sha256:bd1d1cbdcbf18902a090d4a746cdecef2a7ebe31cf9a474bbe407d57eaa79a6a"}, +] [package.dependencies] botocore-stubs = "*" @@ -1146,9 +1381,12 @@ xray = ["mypy-boto3-xray (>=1.28.0,<1.29.0)"] name = "botocore" version = "1.29.161" description = "Low-level, data-driven core of boto 3." 
-category = "main" optional = true python-versions = ">= 3.7" +files = [ + {file = "botocore-1.29.161-py3-none-any.whl", hash = "sha256:b906999dd53dda2ef0ef6f7f55fcc81a4b06b9f1c8a9f65c546e0b981f959f5f"}, + {file = "botocore-1.29.161.tar.gz", hash = "sha256:a50edd715eb510343e27849f36483804aae4b871590db4d4996aa53368dcac40"}, +] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" @@ -1162,9 +1400,12 @@ crt = ["awscrt (==0.16.9)"] name = "botocore-stubs" version = "1.31.40" description = "Type annotations and code completion for botocore" -category = "main" optional = false python-versions = ">=3.7,<4.0" +files = [ + {file = "botocore_stubs-1.31.40-py3-none-any.whl", hash = "sha256:aab534d7e7949cd543bc9b2fadc1a36712033cb00e6f31e2475eefe8486d19ae"}, + {file = "botocore_stubs-1.31.40.tar.gz", hash = "sha256:2001a253daf4ae2e171e6137b9982a00a7fbfc7a53449a16856dc049e7cd5214"}, +] [package.dependencies] types-awscrt = "*" @@ -1174,25 +1415,34 @@ typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.9\""} name = "cachelib" version = "0.9.0" description = "A collection of cache libraries in the same API interface." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "cachelib-0.9.0-py3-none-any.whl", hash = "sha256:811ceeb1209d2fe51cd2b62810bd1eccf70feba5c52641532498be5c675493b3"}, + {file = "cachelib-0.9.0.tar.gz", hash = "sha256:38222cc7c1b79a23606de5c2607f4925779e37cdcea1c2ad21b8bae94b5425a5"}, +] [[package]] name = "cachetools" version = "5.3.1" description = "Extensible memoizing collections and decorators" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, + {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, +] [[package]] name = "cattrs" version = "23.1.2" description = "Composable complex class support for attrs and dataclasses." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"}, + {file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"}, +] [package.dependencies] attrs = ">=20" @@ -1212,44 +1462,194 @@ ujson = ["ujson (>=5.4.0,<6.0.0)"] name = "certifi" version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." -category = "main" optional = false python-versions = ">=3.6" +files = [ + {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, + {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, +] [[package]] name = "cffi" version = "1.15.1" description = "Foreign Function Interface for Python calling C code." 
-category = "main" optional = false python-versions = "*" - -[package.dependencies] -pycparser = "*" - +files = [ + {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, + {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, + {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, + {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, + {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, + {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, + {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, + {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, + {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, + {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, + {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, + {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, + {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, + {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, + {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, + {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, + {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, + {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, + {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, +] + +[package.dependencies] +pycparser = "*" + [[package]] name = "chardet" version = "5.2.0" description = "Universal encoding detector for Python 3" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] [[package]] name = "charset-normalizer" version = "3.2.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
-category = "main" optional = false python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win32.whl", hash = "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96"}, + {file = "charset_normalizer-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a"}, + {file = 
"charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win32.whl", hash = "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1"}, + {file = "charset_normalizer-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592"}, + {file = 
"charset_normalizer-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1"}, + {file = "charset_normalizer-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706"}, + {file = "charset_normalizer-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win32.whl", hash = "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9"}, + {file = "charset_normalizer-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80"}, + {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, +] [[package]] name = "click" version = "8.1.7" description = "Composable command line interface toolkit" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} @@ -1258,9 +1658,12 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} name = "clickclick" version = "20.10.2" description = "Click utility functions" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "clickclick-20.10.2-py2.py3-none-any.whl", hash = "sha256:c8f33e6d9ec83f68416dd2136a7950125bd256ec39ccc9a85c6e280a16be2bb5"}, + {file = "clickclick-20.10.2.tar.gz", hash = "sha256:4efb13e62353e34c5eef7ed6582c4920b418d7dedc86d819e22ee089ba01802c"}, +] [package.dependencies] click = ">=4.0" @@ -1270,17 +1673,23 @@ PyYAML = ">=3.11" name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
-category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] [[package]] name = "coloredlogs" version = "15.0.1" description = "Colored terminal output for Python's logging module" -category = "main" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, + {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, +] [package.dependencies] humanfriendly = ">=9.1" @@ -1292,9 +1701,12 @@ cron = ["capturer (>=2.4)"] name = "colorlog" version = "4.8.0" description = "Log formatting with colors!" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "colorlog-4.8.0-py2.py3-none-any.whl", hash = "sha256:3dd15cb27e8119a24c1a7b5c93f9f3b455855e0f73993b1c25921b2f646f1dcd"}, + {file = "colorlog-4.8.0.tar.gz", hash = "sha256:59b53160c60902c405cdec28d38356e09d40686659048893e026ecbd589516b1"}, +] [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -1303,9 +1715,12 @@ colorama = {version = "*", markers = "sys_platform == \"win32\""} name = "configupdater" version = "3.1.1" description = "Parser like ConfigParser but for updating configuration files" -category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "ConfigUpdater-3.1.1-py2.py3-none-any.whl", hash = "sha256:805986dbeba317886c7a8d348b2e34986dc9e3128cd3761ecc35decbd372b286"}, + {file = "ConfigUpdater-3.1.1.tar.gz", hash = "sha256:46f0c74d73efa723776764b43c9739f68052495dd3d734319c1d0eb58511f15b"}, +] [package.extras] testing = ["flake8", "pytest", "pytest-cov", "pytest-virtualenv", "pytest-xdist", "sphinx"] @@ -1314,17 +1729,37 @@ testing = ["flake8", "pytest", "pytest-cov", "pytest-virtualenv", "pytest-xdist" name = "connectorx" version = "0.3.1" description = "" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "connectorx-0.3.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:719750045e7c3b94c199271fbfe6aef47944768e711f27bcc606b498707e0054"}, + {file = "connectorx-0.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aed31b08acebeb3ebbe53c0df846c686e7c27c4242bff3a75b72cf517d070257"}, + {file = "connectorx-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71d2c2678339fb01f89469bbe22e66e75cabcf727a52ed72d576fef5744ebc58"}, + {file = "connectorx-0.3.1-cp310-none-win_amd64.whl", hash = "sha256:92e576ef9610b59f8e5456c12d22e5b0752d0207f586df82701987657909888b"}, + {file = "connectorx-0.3.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:36c28cc59220998928e7b283eecf404e17e077dc3e525570096d0968b192cc64"}, + {file = "connectorx-0.3.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:c5173e7252f593c46787627a46561b0d949eb80ab23321e045bbf6bd5131945c"}, + {file = "connectorx-0.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3c8411631750d24c12e5e296720637909b8515d5faa3b5eaf7bb86c582d02667"}, + {file = "connectorx-0.3.1-cp37-none-win_amd64.whl", hash = "sha256:0674b6389f8f2ba62155ac2f718df18f76f9de5c50d9911a5fefe7485e1c598e"}, 
+ {file = "connectorx-0.3.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:324c5075e8aa6698db8c877cb847f0d86172784db88ac0f3e6762aa9852330f3"}, + {file = "connectorx-0.3.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:027a3880629a7b33ae0c7a80ab4fa53286957a253af2dfe34f19adfea6b79b91"}, + {file = "connectorx-0.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a666b967958fcf9fc0444a7b3603483ee23a2fe39f0da3d545ff199f376f7e4b"}, + {file = "connectorx-0.3.1-cp38-none-win_amd64.whl", hash = "sha256:3c5dedfd75cf44898c17cc84a1dd0ab6ed0fa54de0461f2d6aa4bcb2c2b0dc1d"}, + {file = "connectorx-0.3.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:354c4126bcd7a9efbb8879feac92e1e7b0d0712f7e98665c392af663805491f8"}, + {file = "connectorx-0.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3011e1f9a27fd2a7b12c6a45bc29f6e7577a27418a3f607adaf54b301ff09068"}, + {file = "connectorx-0.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1efb6ed547acc5837c2211e3d65d22948019d1653e7b30e522a4a4bd6d25fa8"}, + {file = "connectorx-0.3.1-cp39-none-win_amd64.whl", hash = "sha256:001b473e600b6d25af83b32674f98dccf49705a59bd6df724b5ba9beb236a0e0"}, +] [[package]] name = "connexion" version = "2.14.1" description = "Connexion - API first applications with OpenAPI/Swagger and Flask" -category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "connexion-2.14.1-py2.py3-none-any.whl", hash = "sha256:f343717241b4c4802a694c38fee66fb1693c897fe4ea5a957fa9b3b07caf6394"}, + {file = "connexion-2.14.1.tar.gz", hash = "sha256:99aa5781e70a7b94f8ffae8cf89f309d49cdb811bbd65a8e2f2546f3b19a01e6"}, +] [package.dependencies] clickclick = ">=1.2,<21" @@ -1348,9 +1783,11 @@ tests = ["MarkupSafe (>=0.23)", "aiohttp (>=2.3.10,<4)", "aiohttp-jinja2 (>=0.14 name = "cron-descriptor" version = "1.4.0" description = "A Python library that converts cron expressions into human readable strings." -category = "main" optional = false python-versions = "*" +files = [ + {file = "cron_descriptor-1.4.0.tar.gz", hash = "sha256:b6ff4e3a988d7ca04a4ab150248e9f166fb7a5c828a85090e75bcc25aa93b4dd"}, +] [package.extras] dev = ["polib"] @@ -1359,9 +1796,12 @@ dev = ["polib"] name = "croniter" version = "1.4.1" description = "croniter provides iteration for datetime object with cron like format" -category = "dev" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "croniter-1.4.1-py2.py3-none-any.whl", hash = "sha256:9595da48af37ea06ec3a9f899738f1b2c1c13da3c38cea606ef7cd03ea421128"}, + {file = "croniter-1.4.1.tar.gz", hash = "sha256:1a6df60eacec3b7a0aa52a8f2ef251ae3dd2a7c7c8b9874e73e791636d55a361"}, +] [package.dependencies] python-dateutil = "*" @@ -1370,9 +1810,33 @@ python-dateutil = "*" name = "cryptography" version = "41.0.3" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
-category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"}, + {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"}, + {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"}, + {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"}, + {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"}, + {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"}, + {file = "cryptography-41.0.3.tar.gz", hash = 
"sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"}, +] [package.dependencies] cffi = ">=1.12" @@ -1391,9 +1855,12 @@ test-randomorder = ["pytest-randomly"] name = "databind-core" version = "4.4.0" description = "Databind is a library inspired by jackson-databind to de-/serialize Python dataclasses. Compatible with Python 3.7 and newer." -category = "dev" optional = false python-versions = ">=3.6.3,<4.0.0" +files = [ + {file = "databind.core-4.4.0-py3-none-any.whl", hash = "sha256:3c8a4d9abc93e158af9931d8cec389ddfc0514e02aec03b397948d243db11881"}, + {file = "databind.core-4.4.0.tar.gz", hash = "sha256:715d485e934c073f819f0250bbfcaf59c1319f83427365bc7cfd4c347f87576d"}, +] [package.dependencies] Deprecated = ">=1.2.12,<2.0.0" @@ -1406,9 +1873,12 @@ typing-extensions = ">=3.10.0" name = "databind-json" version = "4.4.0" description = "De-/serialize Python dataclasses to or from JSON payloads. Compatible with Python 3.7 and newer." -category = "dev" optional = false python-versions = ">=3.6.3,<4.0.0" +files = [ + {file = "databind.json-4.4.0-py3-none-any.whl", hash = "sha256:df8874118cfba6fd0e77ec3d41a87e04e26034bd545230cab0db1fe904bf1b09"}, + {file = "databind.json-4.4.0.tar.gz", hash = "sha256:4356afdf0aeefcc053eda0888650c59cc558be2686f08a58324d675ccd023586"}, +] [package.dependencies] "databind.core" = ">=4.4.0,<5.0.0" @@ -1420,9 +1890,12 @@ typing-extensions = ">=3.10.0" name = "dbt-athena-community" version = "1.5.2" description = "The athena adapter plugin for dbt (data build tool)" -category = "main" optional = true python-versions = "*" +files = [ + {file = "dbt-athena-community-1.5.2.tar.gz", hash = "sha256:9acd333ddf33514769189a7a0b6219e13966d370098211cb1d022fa32e64671a"}, + {file = "dbt_athena_community-1.5.2-py3-none-any.whl", hash = "sha256:c9f0f8425500211a1c1deddce5aff5ed24fe08530f0ffad38e63de9c9b9f3ee6"}, +] [package.dependencies] boto3 = ">=1.26,<2.0" @@ -1436,9 +1909,12 @@ tenacity = ">=8.2,<9.0" name = "dbt-bigquery" version = "1.5.6" description = "The Bigquery adapter plugin for dbt" -category = "main" optional = true python-versions = ">=3.8" +files = [ + {file = "dbt-bigquery-1.5.6.tar.gz", hash = "sha256:4655cf2ee0acda986b80e6c5d55cae57871bef22d868dfe29d8d4a5bca98a1ba"}, + {file = "dbt_bigquery-1.5.6-py3-none-any.whl", hash = "sha256:3f37544716880cbd17b32bc0c9728a0407b5615b2cd08e1bb904a7a83c46eb6c"}, +] [package.dependencies] agate = ">=1.6.3,<1.7.0" @@ -1451,9 +1927,12 @@ google-cloud-storage = ">=2.4,<3.0" name = "dbt-core" version = "1.5.6" description = "With dbt, data analysts and engineers can build analytics the way engineers build applications." 
-category = "main" optional = false python-versions = ">=3.7.2" +files = [ + {file = "dbt-core-1.5.6.tar.gz", hash = "sha256:af3c03cd4a1fc92481362888014ca1ffed2ffef0b0e0d98463ad0f26c49ef458"}, + {file = "dbt_core-1.5.6-py3-none-any.whl", hash = "sha256:030d2179f9efbf8ccea079296d0c79278d963bb2475c0bcce9ca4bbb0d8c393c"}, +] [package.dependencies] agate = ">=1.6,<1.7.1" @@ -1483,9 +1962,12 @@ werkzeug = ">=1,<3" name = "dbt-duckdb" version = "1.5.2" description = "The duckdb adapter plugin for dbt (data build tool)" -category = "main" optional = false python-versions = "*" +files = [ + {file = "dbt-duckdb-1.5.2.tar.gz", hash = "sha256:3407216c21bf78fd128dccfcff3ec4bf260fb145e633432015bc7d0f123e8e4b"}, + {file = "dbt_duckdb-1.5.2-py3-none-any.whl", hash = "sha256:5d18254807bbc3e61daf4f360208ad886adf44b8525e1998168290fbe73a5cbb"}, +] [package.dependencies] dbt-core = ">=1.5.0,<1.6.0" @@ -1498,17 +1980,37 @@ glue = ["boto3", "mypy-boto3-glue"] name = "dbt-extractor" version = "0.4.1" description = "A tool to analyze and extract information from Jinja used in dbt projects." -category = "main" optional = false python-versions = ">=3.6.1" +files = [ + {file = "dbt_extractor-0.4.1-cp36-abi3-macosx_10_7_x86_64.whl", hash = "sha256:4dc715bd740e418d8dc1dd418fea508e79208a24cf5ab110b0092a3cbe96bf71"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:bc9e0050e3a2f4ea9fe58e8794bc808e6709a0c688ed710fc7c5b6ef3e5623ec"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76872cdee659075d6ce2df92dc62e59a74ba571be62acab2e297ca478b49d766"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:81435841610be1b07806d72cd89b1956c6e2a84c360b9ceb3f949c62a546d569"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:7c291f9f483eae4f60dd5859097d7ba51d5cb6c4725f08973ebd18cdea89d758"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:822b1e911db230e1b9701c99896578e711232001027b518c44c32f79a46fa3f9"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:554d27741a54599c39e5c0b7dbcab77400d83f908caba284a3e960db812e5814"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a805d51a25317f53cbff951c79b9cf75421cf48e4b3e1dfb3e9e8de6d824b76c"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cad90ddc708cb4182dc16fe2c87b1f088a1679877b93e641af068eb68a25d582"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:34783d788b133f223844e280e37b3f5244f2fb60acc457aa75c2667e418d5442"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:9da211869a1220ea55c5552c1567a3ea5233a6c52fa89ca87a22465481c37bc9"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:7d7c47774dc051b8c18690281a55e2e3d3320e823b17e04b06bc3ff81b1874ba"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:037907a7c7ae0391045d81338ca77ddaef899a91d80f09958f09fe374594e19b"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-win32.whl", hash = "sha256:3fe8d8e28a7bd3e0884896147269ca0202ca432d8733113386bdc84c824561bf"}, + {file = "dbt_extractor-0.4.1-cp36-abi3-win_amd64.whl", hash = 
"sha256:35265a0ae0a250623b0c2e3308b2738dc8212e40e0aa88407849e9ea090bb312"}, + {file = "dbt_extractor-0.4.1.tar.gz", hash = "sha256:75b1c665699ec0f1ffce1ba3d776f7dfce802156f22e70a7b9c8f0b4d7e80f42"}, +] [[package]] name = "dbt-postgres" version = "1.5.6" description = "The postgres adapter plugin for dbt (data build tool)" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "dbt-postgres-1.5.6.tar.gz", hash = "sha256:b74e471dc661819a3d4bda2d11497935661ac2e25786c8a5b7314d8241b18582"}, + {file = "dbt_postgres-1.5.6-py3-none-any.whl", hash = "sha256:bc5711c9ab0ec4b57ab814b2c4e4c973554c8374b7da94b06814ac81c91f67ef"}, +] [package.dependencies] dbt-core = "1.5.6" @@ -1518,9 +2020,12 @@ psycopg2-binary = ">=2.8,<3.0" name = "dbt-redshift" version = "1.5.10" description = "The Redshift adapter plugin for dbt" -category = "main" optional = true python-versions = ">=3.8" +files = [ + {file = "dbt-redshift-1.5.10.tar.gz", hash = "sha256:2b9ae1a7d05349e208b0937cd7cc920ea427341ef96096021b18e4070e927f5c"}, + {file = "dbt_redshift-1.5.10-py3-none-any.whl", hash = "sha256:b7689b043535b6b0d217c2abfe924db2336beaae71f3f36ab9aa1e920d2bb2e0"}, +] [package.dependencies] agate = "*" @@ -1533,9 +2038,12 @@ redshift-connector = "2.0.913" name = "dbt-snowflake" version = "1.5.3" description = "The Snowflake adapter plugin for dbt" -category = "main" optional = true python-versions = ">=3.8" +files = [ + {file = "dbt-snowflake-1.5.3.tar.gz", hash = "sha256:cf42772d2c2f1e29a2a64b039c66d80a8593f52a2dd711a144d43b4175802f9a"}, + {file = "dbt_snowflake-1.5.3-py3-none-any.whl", hash = "sha256:8aaa939d834798e5bb10a3ba4f52fc32a53e6e5568d6c0e8b3ac644f099972ff"}, +] [package.dependencies] dbt-core = ">=1.5.0,<1.6.0" @@ -1545,9 +2053,12 @@ snowflake-connector-python = {version = ">=3.0,<4.0", extras = ["secure-local-st name = "decopatch" version = "1.4.10" description = "Create decorators easily in python." -category = "dev" optional = false python-versions = "*" +files = [ + {file = "decopatch-1.4.10-py2.py3-none-any.whl", hash = "sha256:e151f7f93de2b1b3fd3f3272dcc7cefd1a69f68ec1c2d8e288ecd9deb36dc5f7"}, + {file = "decopatch-1.4.10.tar.gz", hash = "sha256:957f49c93f4150182c23f8fb51d13bb3213e0f17a79e09c8cca7057598b55720"}, +] [package.dependencies] makefun = ">=1.5.0" @@ -1556,17 +2067,23 @@ makefun = ">=1.5.0" name = "decorator" version = "5.1.1" description = "Decorators for Humans" -category = "main" optional = false python-versions = ">=3.5" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, + {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] [[package]] name = "deprecated" version = "1.2.14" description = "Python @deprecated decorator to deprecate old python classes, functions or methods." 
-category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, + {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, +] [package.dependencies] wrapt = ">=1.10,<2" @@ -1578,9 +2095,12 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] name = "diff-cover" version = "7.7.0" description = "Run coverage and linting reports on diffs" -category = "dev" optional = false python-versions = ">=3.7.2,<4.0.0" +files = [ + {file = "diff_cover-7.7.0-py3-none-any.whl", hash = "sha256:bf86f32ec999f9a9e79bf24969f7127ea7b4e55c3ef3cd9300feb13188c89736"}, + {file = "diff_cover-7.7.0.tar.gz", hash = "sha256:60614cf7e722cf7fb1bde497afac0b514294e1e26534449622dac4da296123fb"}, +] [package.dependencies] chardet = ">=3.0.0" @@ -1595,9 +2115,12 @@ toml = ["tomli (>=1.2.1)"] name = "dill" version = "0.3.7" description = "serialize all of Python" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "dill-0.3.7-py3-none-any.whl", hash = "sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e"}, + {file = "dill-0.3.7.tar.gz", hash = "sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03"}, +] [package.extras] graph = ["objgraph (>=1.7.2)"] @@ -1606,9 +2129,12 @@ graph = ["objgraph (>=1.7.2)"] name = "dnspython" version = "2.4.2" description = "DNS toolkit" -category = "dev" optional = false python-versions = ">=3.8,<4.0" +files = [ + {file = "dnspython-2.4.2-py3-none-any.whl", hash = "sha256:57c6fbaaeaaf39c891292012060beb141791735dbb4004798328fc2c467402d8"}, + {file = "dnspython-2.4.2.tar.gz", hash = "sha256:8dcfae8c7460a2f84b4072e26f1c9f4101ca20c071649cb7c34e8b6a93d58984"}, +] [package.extras] dnssec = ["cryptography (>=2.6,<42.0)"] @@ -1622,9 +2148,12 @@ wmi = ["wmi (>=1.5.1,<2.0.0)"] name = "docspec" version = "2.2.1" description = "Docspec is a JSON object specification for representing API documentation of programming languages." -category = "dev" optional = false python-versions = ">=3.7,<4.0" +files = [ + {file = "docspec-2.2.1-py3-none-any.whl", hash = "sha256:7538f750095a9688c6980ff9a4e029a823a500f64bd00b6b4bdb27951feb31cb"}, + {file = "docspec-2.2.1.tar.gz", hash = "sha256:4854e77edc0e2de40e785e57e95880f7095a05fe978f8b54cef7a269586e15ff"}, +] [package.dependencies] "databind.core" = ">=4.2.6,<5.0.0" @@ -1635,9 +2164,12 @@ Deprecated = ">=1.2.12,<2.0.0" name = "docspec-python" version = "2.2.1" description = "A parser based on lib2to3 producing docspec data from Python source code." 
-category = "dev" optional = false python-versions = ">=3.7,<4.0" +files = [ + {file = "docspec_python-2.2.1-py3-none-any.whl", hash = "sha256:76ac41d35a8face35b2d766c2e8a416fb8832359785d396f0d53bcb00f178e54"}, + {file = "docspec_python-2.2.1.tar.gz", hash = "sha256:c41b850b4d6f4de30999ea6f82c9cdb9183d9bcba45559ee9173d3dab7281559"}, +] [package.dependencies] black = ">=23.1.0,<24.0.0" @@ -1648,9 +2180,11 @@ docspec = ">=2.2.1,<3.0.0" name = "docstring-parser" version = "0.11" description = "\"Parse Python docstrings in reST, Google and Numpydoc format\"" -category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "docstring_parser-0.11.tar.gz", hash = "sha256:93b3f8f481c7d24e37c5d9f30293c89e2933fa209421c8abd731dd3ef0715ecb"}, +] [package.extras] test = ["black", "pytest"] @@ -1659,17 +2193,23 @@ test = ["black", "pytest"] name = "docutils" version = "0.20.1" description = "Docutils -- Python Documentation Utilities" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, + {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, +] [[package]] name = "domdf-python-tools" version = "3.6.1" description = "Helpful functions for Python 🐍 🛠️" -category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "domdf_python_tools-3.6.1-py3-none-any.whl", hash = "sha256:e18158460850957f18e740eb94ede56f580ddb0cb162ab9d9834ed8bbb1b6431"}, + {file = "domdf_python_tools-3.6.1.tar.gz", hash = "sha256:acc04563d23bce4d437dd08af6b9bea788328c412772a044d8ca428a7ad861be"}, +] [package.dependencies] importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.9\""} @@ -1684,17 +2224,60 @@ dates = ["pytz (>=2019.1)"] name = "duckdb" version = "0.9.1" description = "DuckDB embedded database" -category = "main" optional = false python-versions = ">=3.7.0" +files = [ + {file = "duckdb-0.9.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6c724e105ecd78c8d86b3c03639b24e1df982392fc836705eb007e4b1b488864"}, + {file = "duckdb-0.9.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:75f12c5a3086079fb6440122565f1762ef1a610a954f2d8081014c1dd0646e1a"}, + {file = "duckdb-0.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:151f5410c32f8f8fe03bf23462b9604349bc0b4bd3a51049bbf5e6a482a435e8"}, + {file = "duckdb-0.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c1d066fdae22b9b711b1603541651a378017645f9fbc4adc9764b2f3c9e9e4a"}, + {file = "duckdb-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1de56d8b7bd7a7653428c1bd4b8948316df488626d27e9c388194f2e0d1428d4"}, + {file = "duckdb-0.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1fb6cd590b1bb4e31fde8efd25fedfbfa19a86fa72789fa5b31a71da0d95bce4"}, + {file = "duckdb-0.9.1-cp310-cp310-win32.whl", hash = "sha256:1039e073714d668cef9069bb02c2a6756c7969cedda0bff1332520c4462951c8"}, + {file = "duckdb-0.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:7e6ac4c28918e1d278a89ff26fd528882aa823868ed530df69d6c8a193ae4e41"}, + {file = "duckdb-0.9.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5eb750f2ee44397a61343f32ee9d9e8c8b5d053fa27ba4185d0e31507157f130"}, + {file = "duckdb-0.9.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aea2a46881d75dc069a242cb164642d7a4f792889010fb98210953ab7ff48849"}, + {file = 
"duckdb-0.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed3dcedfc7a9449b6d73f9a2715c730180056e0ba837123e7967be1cd3935081"}, + {file = "duckdb-0.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c55397bed0087ec4445b96f8d55f924680f6d40fbaa7f2e35468c54367214a5"}, + {file = "duckdb-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3261696130f1cfb955735647c93297b4a6241753fb0de26c05d96d50986c6347"}, + {file = "duckdb-0.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:64c04b1728e3e37cf93748829b5d1e028227deea75115bb5ead01c608ece44b1"}, + {file = "duckdb-0.9.1-cp311-cp311-win32.whl", hash = "sha256:12cf9fb441a32702e31534330a7b4d569083d46a91bf185e0c9415000a978789"}, + {file = "duckdb-0.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:fdfd85575ce9540e593d5d25c9d32050bd636c27786afd7b776aae0f6432b55e"}, + {file = "duckdb-0.9.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:704700a4b469e3bb1a7e85ac12e58037daaf2b555ef64a3fe2913ffef7bd585b"}, + {file = "duckdb-0.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf55b303b7b1a8c2165a96e609eb30484bc47481d94a5fb1e23123e728df0a74"}, + {file = "duckdb-0.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b70e23c14746904ca5de316436e43a685eb769c67fe3dbfaacbd3cce996c5045"}, + {file = "duckdb-0.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:77379f7f1f8b4dc98e01f8f6f8f15a0858cf456e2385e22507f3cb93348a88f9"}, + {file = "duckdb-0.9.1-cp37-cp37m-win32.whl", hash = "sha256:92c8f738489838666cae9ef41703f8b16f660bb146970d1eba8b2c06cb3afa39"}, + {file = "duckdb-0.9.1-cp37-cp37m-win_amd64.whl", hash = "sha256:08c5484ac06ab714f745526d791141f547e2f5ac92f97a0a1b37dfbb3ea1bd13"}, + {file = "duckdb-0.9.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f66d3c07c7f6938d3277294677eb7dad75165e7c57c8dd505503fc5ef10f67ad"}, + {file = "duckdb-0.9.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c38044e5f78c0c7b58e9f937dcc6c34de17e9ca6be42f9f8f1a5a239f7a847a5"}, + {file = "duckdb-0.9.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73bc0d715b79566b3ede00c367235cfcce67be0eddda06e17665c7a233d6854a"}, + {file = "duckdb-0.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d26622c3b4ea6a8328d95882059e3cc646cdc62d267d48d09e55988a3bba0165"}, + {file = "duckdb-0.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3367d10096ff2b7919cedddcf60d308d22d6e53e72ee2702f6e6ca03d361004a"}, + {file = "duckdb-0.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d88a119f1cb41911a22f08a6f084d061a8c864e28b9433435beb50a56b0d06bb"}, + {file = "duckdb-0.9.1-cp38-cp38-win32.whl", hash = "sha256:99567496e45b55c67427133dc916013e8eb20a811fc7079213f5f03b2a4f5fc0"}, + {file = "duckdb-0.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:5b3da4da73422a3235c3500b3fb541ac546adb3e35642ef1119dbcd9cc7f68b8"}, + {file = "duckdb-0.9.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eca00c0c2062c0265c6c0e78ca2f6a30611b28f3afef062036610e9fc9d4a67d"}, + {file = "duckdb-0.9.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eb5af8e89d40fc4baab1515787ea1520a6c6cf6aa40ab9f107df6c3a75686ce1"}, + {file = "duckdb-0.9.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fae3d4f83ebcb47995f6acad7c6d57d003a9b6f0e1b31f79a3edd6feb377443"}, + {file = "duckdb-0.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16b9a7efc745bc3c5d1018c3a2f58d9e6ce49c0446819a9600fdba5f78e54c47"}, + {file = 
"duckdb-0.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b0b60167f5537772e9f5af940e69dcf50e66f5247732b8bb84a493a9af6055"}, + {file = "duckdb-0.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4f27f5e94c47df6c4ccddf18e3277b7464eea3db07356d2c4bf033b5c88359b8"}, + {file = "duckdb-0.9.1-cp39-cp39-win32.whl", hash = "sha256:d43cd7e6f783006b59dcc5e40fcf157d21ee3d0c8dfced35278091209e9974d7"}, + {file = "duckdb-0.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:e666795887d9cf1d6b6f6cbb9d487270680e5ff6205ebc54b2308151f13b8cff"}, + {file = "duckdb-0.9.1.tar.gz", hash = "sha256:603a878746015a3f2363a65eb48bcbec816261b6ee8d71eee53061117f6eef9d"}, +] [[package]] name = "email-validator" version = "1.3.1" description = "A robust email address syntax and deliverability validation library." -category = "dev" optional = false python-versions = ">=3.5" +files = [ + {file = "email_validator-1.3.1-py2.py3-none-any.whl", hash = "sha256:49a72f5fa6ed26be1c964f0567d931d10bf3fdeeacdf97bc26ef1cd2a44e0bda"}, + {file = "email_validator-1.3.1.tar.gz", hash = "sha256:d178c5c6fa6c6824e9b04f199cf23e79ac15756786573c190d2ad13089411ad2"}, +] [package.dependencies] dnspython = ">=1.15.0" @@ -1704,9 +2287,12 @@ idna = ">=2.0.0" name = "enlighten" version = "1.11.2" description = "Enlighten Progress Bar" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "enlighten-1.11.2-py2.py3-none-any.whl", hash = "sha256:98c9eb20e022b6a57f1c8d4f17e16760780b6881e6d658c40f52d21255ea45f3"}, + {file = "enlighten-1.11.2.tar.gz", hash = "sha256:9284861dee5a272e0e1a3758cd3f3b7180b1bd1754875da76876f2a7f46ccb61"}, +] [package.dependencies] blessed = ">=1.17.7" @@ -1716,9 +2302,12 @@ prefixed = ">=0.3.2" name = "exceptiongroup" version = "1.1.3" description = "Backport of PEP 654 (exception groups)" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, + {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, +] [package.extras] test = ["pytest (>=6)"] @@ -1727,9 +2316,12 @@ test = ["pytest (>=6)"] name = "fastembed" version = "0.1.1" description = "Fast, light, accurate library built for retrieval embedding generation" -category = "main" optional = true python-versions = ">=3.8.0,<3.12" +files = [ + {file = "fastembed-0.1.1-py3-none-any.whl", hash = "sha256:131413ae52cd72f4c8cced7a675f8269dbfd1a852abade3c815e265114bcc05a"}, + {file = "fastembed-0.1.1.tar.gz", hash = "sha256:f7e524ee4f74bb8aad16be5b687d1f77f608d40e96e292c87881dc36baf8f4c7"}, +] [package.dependencies] onnx = ">=1.11,<2.0" @@ -1742,9 +2334,12 @@ tqdm = ">=4.65,<5.0" name = "filelock" version = "3.12.3" description = "A platform independent file lock." 
-category = "main" optional = true python-versions = ">=3.8" +files = [ + {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, + {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"}, +] [package.dependencies] typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""} @@ -1757,9 +2352,12 @@ testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pyt name = "flake8" version = "5.0.4" description = "the modular source code checker: pep8 pyflakes and co" -category = "dev" optional = false python-versions = ">=3.6.1" +files = [ + {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, + {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, +] [package.dependencies] mccabe = ">=0.7.0,<0.8.0" @@ -1770,9 +2368,12 @@ pyflakes = ">=2.5.0,<2.6.0" name = "flake8-bugbear" version = "22.12.6" description = "A plugin for flake8 finding likely bugs and design problems in your program. Contains warnings that don't belong in pyflakes and pycodestyle." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "flake8-bugbear-22.12.6.tar.gz", hash = "sha256:4cdb2c06e229971104443ae293e75e64c6107798229202fbe4f4091427a30ac0"}, + {file = "flake8_bugbear-22.12.6-py3-none-any.whl", hash = "sha256:b69a510634f8a9c298dfda2b18a8036455e6b19ecac4fe582e4d7a0abfa50a30"}, +] [package.dependencies] attrs = ">=19.2.0" @@ -1785,9 +2386,12 @@ dev = ["coverage", "hypothesis", "hypothesmith (>=0.2)", "pre-commit", "tox"] name = "flake8-builtins" version = "1.5.3" description = "Check for python builtins being used as variables or parameters." -category = "dev" optional = false python-versions = "*" +files = [ + {file = "flake8-builtins-1.5.3.tar.gz", hash = "sha256:09998853b2405e98e61d2ff3027c47033adbdc17f9fe44ca58443d876eb00f3b"}, + {file = "flake8_builtins-1.5.3-py2.py3-none-any.whl", hash = "sha256:7706babee43879320376861897e5d1468e396a40b8918ed7bccf70e5f90b8687"}, +] [package.dependencies] flake8 = "*" @@ -1799,9 +2403,12 @@ test = ["coverage", "coveralls", "mock", "pytest", "pytest-cov"] name = "flake8-encodings" version = "0.5.0.post1" description = "A Flake8 plugin to identify incorrect use of encodings." -category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "flake8_encodings-0.5.0.post1-py3-none-any.whl", hash = "sha256:d2fecca0e89ba09c86e5d61cf6bdb1b337f0d74746aac67bbcf0c517b4cb6cba"}, + {file = "flake8_encodings-0.5.0.post1.tar.gz", hash = "sha256:082c0163325c85b438a8106e876283b5ed3cbfc53e68d89130d70be8be4c9977"}, +] [package.dependencies] astatine = ">=0.3.1" @@ -1817,9 +2424,12 @@ classes = ["jedi (>=0.18.0)"] name = "flake8-helper" version = "0.2.1" description = "A helper library for Flake8 plugins." -category = "dev" optional = false python-versions = ">=3.6" +files = [ + {file = "flake8_helper-0.2.1-py3-none-any.whl", hash = "sha256:9123cdf351ad32ee8a51b85036052302c478122d62fb512c0773e111b3d05241"}, + {file = "flake8_helper-0.2.1.tar.gz", hash = "sha256:479f86d1c52df8e49ff876ecd3873242699f93eeece7e6675cdca9c37c9b0a16"}, +] [package.dependencies] flake8 = ">=3.8.4" @@ -1828,9 +2438,12 @@ flake8 = ">=3.8.4" name = "flake8-tidy-imports" version = "4.10.0" description = "A flake8 plugin that helps you write tidier imports." 
-category = "dev" optional = false python-versions = ">=3.8" +files = [ + {file = "flake8_tidy_imports-4.10.0-py3-none-any.whl", hash = "sha256:b0387fb2ea200441bd142309e716fb7b8f4b0937bdf5f8b7c0c118a5f5e2b8ed"}, + {file = "flake8_tidy_imports-4.10.0.tar.gz", hash = "sha256:bd6cf86465402d2b86903009b748d85a628e599e17b76e810c9857e3a2815173"}, +] [package.dependencies] flake8 = ">=3.8.0" @@ -1839,9 +2452,12 @@ flake8 = ">=3.8.0" name = "flask" version = "2.2.5" description = "A simple framework for building complex web applications." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, + {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, +] [package.dependencies] click = ">=8.0" @@ -1858,9 +2474,12 @@ dotenv = ["python-dotenv"] name = "flask-appbuilder" version = "4.3.6" description = "Simple and rapid application development framework, built on top of Flask. includes detailed security, auto CRUD generation for your models, google charts and much more." -category = "dev" optional = false python-versions = "~=3.7" +files = [ + {file = "Flask-AppBuilder-4.3.6.tar.gz", hash = "sha256:8ca9710fa7d2704747d195e11b487d45a571f40559d8399d9d5dfa42ea1f3c78"}, + {file = "Flask_AppBuilder-4.3.6-py3-none-any.whl", hash = "sha256:840480dfd43134bebf78f3c7dc909e324c2689d2d9f27aeb1880a8a25466bc8d"}, +] [package.dependencies] apispec = {version = ">=6.0.0,<7", extras = ["yaml"]} @@ -1894,9 +2513,12 @@ talisman = ["flask-talisman (>=1.0.0,<2.0)"] name = "flask-babel" version = "2.0.0" description = "Adds i18n/l10n support to Flask applications" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "Flask-Babel-2.0.0.tar.gz", hash = "sha256:f9faf45cdb2e1a32ea2ec14403587d4295108f35017a7821a2b1acb8cfd9257d"}, + {file = "Flask_Babel-2.0.0-py3-none-any.whl", hash = "sha256:e6820a052a8d344e178cdd36dd4bb8aea09b4bda3d5f9fa9f008df2c7f2f5468"}, +] [package.dependencies] Babel = ">=2.3" @@ -1911,9 +2533,12 @@ dev = ["Pallets-Sphinx-Themes", "bumpversion", "ghp-import", "pytest", "pytest-m name = "flask-caching" version = "2.0.2" description = "Adds caching support to Flask applications." 
-category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "Flask-Caching-2.0.2.tar.gz", hash = "sha256:24b60c552d59a9605cc1b6a42c56cdb39a82a28dab4532bbedb9222ae54ecb4e"}, + {file = "Flask_Caching-2.0.2-py3-none-any.whl", hash = "sha256:19571f2570e9b8dd9dd9d2f49d7cbee69c14ebe8cc001100b1eb98c379dd80ad"}, +] [package.dependencies] cachelib = ">=0.9.0,<0.10.0" @@ -1923,9 +2548,12 @@ Flask = "<3" name = "flask-jwt-extended" version = "4.5.2" description = "Extended JWT integration with Flask" -category = "dev" optional = false python-versions = ">=3.7,<4" +files = [ + {file = "Flask-JWT-Extended-4.5.2.tar.gz", hash = "sha256:ba56245ba43b71c8ae936784b867625dce8b9956faeedec2953222e57942fb0b"}, + {file = "Flask_JWT_Extended-4.5.2-py2.py3-none-any.whl", hash = "sha256:e0ef23d8c863746bd141046167073699e1a7b03c97169cbba70f05b8d9cd6b9e"}, +] [package.dependencies] Flask = ">=2.0,<3.0" @@ -1939,9 +2567,12 @@ asymmetric-crypto = ["cryptography (>=3.3.1)"] name = "flask-limiter" version = "3.5.0" description = "Rate limiting for flask applications" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "Flask-Limiter-3.5.0.tar.gz", hash = "sha256:13a3491b994c49f7cb4706587a38ca47e8162b576530472df38be68104f299c0"}, + {file = "Flask_Limiter-3.5.0-py3-none-any.whl", hash = "sha256:dbda4174f44e6cb858c6eb75e7488186f2977dd5d33d7028ba1aabf179de1bee"}, +] [package.dependencies] Flask = ">=2" @@ -1959,9 +2590,12 @@ redis = ["limits[redis]"] name = "flask-login" version = "0.6.2" description = "User authentication and session management for Flask." -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "Flask-Login-0.6.2.tar.gz", hash = "sha256:c0a7baa9fdc448cdd3dd6f0939df72eec5177b2f7abe6cb82fc934d29caac9c3"}, + {file = "Flask_Login-0.6.2-py3-none-any.whl", hash = "sha256:1ef79843f5eddd0f143c2cd994c1b05ac83c0401dc6234c143495af9a939613f"}, +] [package.dependencies] Flask = ">=1.0.4" @@ -1971,9 +2605,12 @@ Werkzeug = ">=1.0.1" name = "flask-session" version = "0.5.0" description = "Server-side session support for Flask" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "Flask-Session-0.5.0.tar.gz", hash = "sha256:190875e6aebf2953c6803d42379ef3b934bc209ef8ef006f97aecb08f5aaeb86"}, + {file = "flask_session-0.5.0-py3-none-any.whl", hash = "sha256:1619bcbc16f04f64e90f8e0b17145ba5c9700090bb1294e889956c1282d58631"}, +] [package.dependencies] cachelib = "*" @@ -1983,9 +2620,12 @@ flask = ">=2.2" name = "flask-sqlalchemy" version = "2.5.1" description = "Adds SQLAlchemy support to your Flask application." -category = "dev" optional = false python-versions = ">= 2.7, != 3.0.*, != 3.1.*, != 3.2.*, != 3.3.*" +files = [ + {file = "Flask-SQLAlchemy-2.5.1.tar.gz", hash = "sha256:2bda44b43e7cacb15d4e05ff3cc1f8bc97936cc464623424102bfc2c35e95912"}, + {file = "Flask_SQLAlchemy-2.5.1-py2.py3-none-any.whl", hash = "sha256:f12c3d4cc5cc7fdcc148b9527ea05671718c3ea45d50c7e732cceb33f574b390"}, +] [package.dependencies] Flask = ">=0.10" @@ -1995,9 +2635,12 @@ SQLAlchemy = ">=0.8.0" name = "flask-wtf" version = "1.1.1" description = "Form rendering, validation, and CSRF protection for Flask with WTForms." 
-category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "Flask-WTF-1.1.1.tar.gz", hash = "sha256:41c4244e9ae626d63bed42ae4785b90667b885b1535d5a4095e1f63060d12aa9"}, + {file = "Flask_WTF-1.1.1-py3-none-any.whl", hash = "sha256:7887d6f1ebb3e17bf648647422f0944c9a469d0fcf63e3b66fb9a83037e38b2c"}, +] [package.dependencies] Flask = "*" @@ -2011,25 +2654,93 @@ email = ["email-validator"] name = "flatbuffers" version = "23.5.26" description = "The FlatBuffers serialization format for Python" -category = "main" optional = true python-versions = "*" +files = [ + {file = "flatbuffers-23.5.26-py2.py3-none-any.whl", hash = "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"}, + {file = "flatbuffers-23.5.26.tar.gz", hash = "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89"}, +] [[package]] name = "frozenlist" version = "1.4.0" description = "A list-like structure which implements collections.abc.MutableSequence" -category = "main" optional = false python-versions = ">=3.8" +files = [ + {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"}, + {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"}, + {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"}, + {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"}, + {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"}, + {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = 
"sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"}, + {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"}, + {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"}, + {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"}, + {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"}, + {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"}, + {file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"}, + {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"}, + {file = 
"frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"}, + {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"}, + {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"}, + {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"}, + {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"}, + {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"}, + {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"}, + {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"}, + {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = 
"sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"}, + {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, +] [[package]] name = "fsspec" version = "2023.6.0" description = "File-system specification" -category = "main" optional = false python-versions = ">=3.8" +files = [ + {file = "fsspec-2023.6.0-py3-none-any.whl", hash = "sha256:1cbad1faef3e391fba6dc005ae9b5bdcbf43005c9167ce78c915549c352c869a"}, + {file = "fsspec-2023.6.0.tar.gz", hash = "sha256:d0b2f935446169753e7a5c5c55681c54ea91996cc67be93c39a154fb3a2742af"}, +] [package.extras] abfs = ["adlfs"] @@ -2059,17 +2770,22 @@ tqdm = ["tqdm"] name = "future" version = "0.18.3" description = "Clean single-source support for Python 3 and 2" -category = "main" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "future-0.18.3.tar.gz", hash = "sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"}, +] [[package]] name = "gcsfs" version = "2023.6.0" description = "Convenient Filesystem interface over GCS" -category = "main" optional = true python-versions = ">=3.8" +files = [ + {file = "gcsfs-2023.6.0-py2.py3-none-any.whl", hash = "sha256:3b3c7d8eddd4ec1380f3b49fbb861ee1e974adb223564401f10884b6260d406f"}, + {file = "gcsfs-2023.6.0.tar.gz", hash = "sha256:30b14fccadb3b7f0d99b2cd03bd8507c40f3a9a7d05847edca571f642bedbdff"}, +] [package.dependencies] aiohttp = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1" @@ -2088,9 +2804,12 @@ gcsfuse = ["fusepy"] name = "gitdb" version = "4.0.10" description = "Git Object Database" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "gitdb-4.0.10-py3-none-any.whl", hash = "sha256:c286cf298426064079ed96a9e4a9d39e7f3e9bf15ba60701e95f5492f28415c7"}, + {file = "gitdb-4.0.10.tar.gz", hash = "sha256:6eb990b69df4e15bad899ea868dc46572c3f75339735663b81de79b06f17eb9a"}, +] [package.dependencies] smmap = ">=3.0.1,<6" @@ -2099,9 +2818,12 @@ smmap = ">=3.0.1,<6" name = "gitpython" version = "3.1.34" description = "GitPython is a Python library used to interact with Git repositories" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "GitPython-3.1.34-py3-none-any.whl", hash = "sha256:5d3802b98a3bae1c2b8ae0e1ff2e4aa16bcdf02c145da34d092324f599f01395"}, + {file = "GitPython-3.1.34.tar.gz", hash = "sha256:85f7d365d1f6bf677ae51039c1ef67ca59091c7ebd5a3509aa399d4eda02d6dd"}, +] [package.dependencies] gitdb = ">=4.0.1,<5" @@ -2110,28 +2832,34 @@ gitdb = ">=4.0.1,<5" name = "giturlparse" version = "0.11.1" description = "A Git URL parsing module (supports parsing and rewriting)" -category = "main" optional = false python-versions = ">=3.8" +files = [ + {file = "giturlparse-0.11.1-py2.py3-none-any.whl", hash = "sha256:6422f25c8ca563e1a3cb6b85862e48614be804cd1334e6d84be5630eb26b343f"}, + {file = "giturlparse-0.11.1.tar.gz", hash = "sha256:cdbe0c062096c69e00f08397826dddebc1f73bc15b793994579c13aafc70c990"}, +] [[package]] name = "google-api-core" version = "2.11.1" description = "Google API client core library" -category = "main" optional = false python-versions = ">=3.7" +files = [ + {file = "google-api-core-2.11.1.tar.gz", hash = "sha256:25d29e05a0058ed5f19c61c0a78b1b53adea4d9364b464d014fbda941f6d1c9a"}, + {file = "google_api_core-2.11.1-py3-none-any.whl", hash = "sha256:d92a5a92dc36dd4f4b9ee4e55528a90e432b059f93aee6ad857f9de8cc7ae94a"}, +] [package.dependencies] google-auth = ">=2.14.1,<3.0.dev0" googleapis-common-protos = 
">=1.56.2,<2.0.dev0" grpcio = [ - {version = ">=1.33.2,<2.0dev", optional = true, markers = "extra == \"grpc\""}, - {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\""}, + {version = ">=1.33.2,<2.0dev", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, + {version = ">=1.49.1,<2.0dev", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] grpcio-status = [ - {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "extra == \"grpc\""}, - {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\""}, + {version = ">=1.33.2,<2.0.dev0", optional = true, markers = "python_version < \"3.11\" and extra == \"grpc\""}, + {version = ">=1.49.1,<2.0.dev0", optional = true, markers = "python_version >= \"3.11\" and extra == \"grpc\""}, ] protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0" requests = ">=2.18.0,<3.0.0.dev0" @@ -2145,12 +2873,15 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] name = "google-api-python-client" version = "2.97.0" description = "Google API Client Library for Python" -category = "dev" optional = false python-versions = ">=3.7" +files = [ + {file = "google-api-python-client-2.97.0.tar.gz", hash = "sha256:48277291894876a1ca7ed4127e055e81f81e6343ced1b544a7200ae2c119dcd7"}, + {file = "google_api_python_client-2.97.0-py2.py3-none-any.whl", hash = "sha256:5215f4cd577753fc4192ccfbe0bb8b55d4bb5fd68fa6268ac5cf271b6305de31"}, +] [package.dependencies] -google-api-core = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0.dev0" +google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0.dev0" google-auth = ">=1.19.0,<3.0.0.dev0" google-auth-httplib2 = ">=0.1.0" httplib2 = ">=0.15.0,<1.dev0" @@ -2160,9 +2891,12 @@ uritemplate = ">=3.0.1,<5" name = "google-auth" version = "2.22.0" description = "Google Authentication Library" -category = "main" optional = false python-versions = ">=3.6" +files = [ + {file = "google-auth-2.22.0.tar.gz", hash = "sha256:164cba9af4e6e4e40c3a4f90a1a6c12ee56f14c0b4868d1ca91b32826ab334ce"}, + {file = "google_auth-2.22.0-py2.py3-none-any.whl", hash = "sha256:d61d1b40897407b574da67da1a833bdc10d5a11642566e506565d1b1a46ba873"}, +] [package.dependencies] cachetools = ">=2.0.0,<6.0" @@ -2182,9 +2916,12 @@ requests = ["requests (>=2.20.0,<3.0.0.dev0)"] name = "google-auth-httplib2" version = "0.1.0" description = "Google Authentication Library: httplib2 transport" -category = "dev" optional = false python-versions = "*" +files = [ + {file = "google-auth-httplib2-0.1.0.tar.gz", hash = "sha256:a07c39fd632becacd3f07718dfd6021bf396978f03ad3ce4321d060015cc30ac"}, + {file = "google_auth_httplib2-0.1.0-py2.py3-none-any.whl", hash = "sha256:31e49c36c6b5643b57e82617cb3e021e3e1d2df9da63af67252c02fa9c1f4a10"}, +] [package.dependencies] google-auth = "*" @@ -2195,9 +2932,12 @@ six = "*" name = "google-auth-oauthlib" version = "1.0.0" description = "Google Authentication Library" -category = "main" optional = false python-versions = ">=3.6" +files = [ + {file = "google-auth-oauthlib-1.0.0.tar.gz", hash = "sha256:e375064964820b47221a7e1b7ee1fd77051b6323c3f9e3e19785f78ab67ecfc5"}, + {file = "google_auth_oauthlib-1.0.0-py2.py3-none-any.whl", hash = "sha256:95880ca704928c300f48194d1770cf5b1462835b6e49db61445a520f793fd5fb"}, +] [package.dependencies] google-auth = ">=2.15.0" @@ -2210,12 +2950,15 @@ tool = ["click (>=6.0.0)"] name = 
"google-cloud-bigquery" version = "3.11.4" description = "Google BigQuery API client library" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "google-cloud-bigquery-3.11.4.tar.gz", hash = "sha256:697df117241a2283bcbb93b21e10badc14e51c9a90800d2a7e1a3e1c7d842974"}, + {file = "google_cloud_bigquery-3.11.4-py2.py3-none-any.whl", hash = "sha256:5fa7897743a0ed949ade25a0942fc9e7557d8fce307c6f8a76d1b604cf27f1b1"}, +] [package.dependencies] -google-api-core = {version = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0dev", extras = ["grpc"]} +google-api-core = {version = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev", extras = ["grpc"]} google-cloud-core = ">=1.6.0,<3.0.0dev" google-resumable-media = ">=0.6.0,<3.0dev" grpcio = [ @@ -2242,12 +2985,15 @@ tqdm = ["tqdm (>=4.7.4,<5.0.0dev)"] name = "google-cloud-core" version = "2.3.3" description = "Google Cloud API client core library" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "google-cloud-core-2.3.3.tar.gz", hash = "sha256:37b80273c8d7eee1ae816b3a20ae43585ea50506cb0e60f3cf5be5f87f1373cb"}, + {file = "google_cloud_core-2.3.3-py2.py3-none-any.whl", hash = "sha256:fbd11cad3e98a7e5b0343dc07cb1039a5ffd7a5bb96e1f1e27cee4bda4a90863"}, +] [package.dependencies] -google-api-core = ">=1.31.6,<2.0.0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.6,<2.0.dev0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" [package.extras] @@ -2257,12 +3003,15 @@ grpc = ["grpcio (>=1.38.0,<2.0dev)"] name = "google-cloud-dataproc" version = "5.4.3" description = "Google Cloud Dataproc API client library" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "google-cloud-dataproc-5.4.3.tar.gz", hash = "sha256:d9c77c52aa5ddf52ae657736dbfb5312402933f72bab8480fc2d2afe98697402"}, + {file = "google_cloud_dataproc-5.4.3-py2.py3-none-any.whl", hash = "sha256:9cfff56cb53621cdffd0a3d6b10701e886e0a8ad54891e6c223eb67c0ff753ad"}, +] [package.dependencies] -google-api-core = {version = ">=1.34.0,<2.0.0 || >=2.11.0,<3.0.0dev", extras = ["grpc"]} +google-api-core = {version = ">=1.34.0,<2.0.dev0 || >=2.11.dev0,<3.0.0dev", extras = ["grpc"]} grpc-google-iam-v1 = ">=0.12.4,<1.0.0dev" proto-plus = [ {version = ">=1.22.0,<2.0.0dev", markers = "python_version < \"3.11\""}, @@ -2274,12 +3023,15 @@ protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4 name = "google-cloud-storage" version = "2.10.0" description = "Google Cloud Storage API client library" -category = "main" optional = true python-versions = ">=3.7" +files = [ + {file = "google-cloud-storage-2.10.0.tar.gz", hash = "sha256:934b31ead5f3994e5360f9ff5750982c5b6b11604dc072bc452c25965e076dc7"}, + {file = "google_cloud_storage-2.10.0-py2.py3-none-any.whl", hash = "sha256:9433cf28801671de1c80434238fb1e7e4a1ba3087470e90f70c928ea77c2b9d7"}, +] [package.dependencies] -google-api-core = ">=1.31.5,<2.0.0 || >2.3.0,<3.0.0dev" +google-api-core = ">=1.31.5,<2.0.dev0 || >2.3.0,<3.0.0dev" google-auth = ">=1.25.0,<3.0dev" google-cloud-core = ">=2.3.0,<3.0dev" google-resumable-media = ">=2.3.2" @@ -2292,3686 +3044,9 @@ protobuf = ["protobuf (<5.0.0dev)"] name = "google-crc32c" version = "1.5.0" description = "A python wrapper of the C library 'Google CRC32C'" -category = "main" optional = true python-versions = ">=3.7" - -[package.extras] -testing = ["pytest"] - -[[package]] -name = "google-re2" -version = "1.1" -description = "RE2 Python bindings" -category = "dev" -optional = false -python-versions = "~=3.8" - -[[package]] -name = 
"google-resumable-media" -version = "2.5.0" -description = "Utilities for Google Media Downloads and Resumable Uploads" -category = "main" -optional = true -python-versions = ">= 3.7" - -[package.dependencies] -google-crc32c = ">=1.0,<2.0dev" - -[package.extras] -aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)"] -requests = ["requests (>=2.18.0,<3.0.0dev)"] - -[[package]] -name = "googleapis-common-protos" -version = "1.60.0" -description = "Common protobufs used in Google APIs" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -grpcio = {version = ">=1.44.0,<2.0.0.dev0", optional = true, markers = "extra == \"grpc\""} -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0" - -[package.extras] -grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] - -[[package]] -name = "grapheme" -version = "0.6.0" -description = "Unicode grapheme helpers" -category = "dev" -optional = false -python-versions = "*" - -[package.extras] -test = ["pytest", "sphinx", "sphinx-autobuild", "twine", "wheel"] - -[[package]] -name = "graphviz" -version = "0.20.1" -description = "Simple Python interface for Graphviz" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -dev = ["flake8", "pep8-naming", "tox (>=3)", "twine", "wheel"] -docs = ["sphinx (>=5)", "sphinx-autodoc-typehints", "sphinx-rtd-theme"] -test = ["coverage", "mock (>=4)", "pytest (>=7)", "pytest-cov", "pytest-mock (>=3)"] - -[[package]] -name = "greenlet" -version = "2.0.2" -description = "Lightweight in-process concurrent programming" -category = "main" -optional = false -python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*" - -[package.extras] -docs = ["Sphinx", "docutils (<0.18)"] -test = ["objgraph", "psutil"] - -[[package]] -name = "grpc-google-iam-v1" -version = "0.12.6" -description = "IAM API client library" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -googleapis-common-protos = {version = ">=1.56.0,<2.0.0dev", extras = ["grpc"]} -grpcio = ">=1.44.0,<2.0.0dev" -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" - -[[package]] -name = "grpcio" -version = "1.57.0" -description = "HTTP/2-based RPC framework" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -protobuf = ["grpcio-tools (>=1.57.0)"] - -[[package]] -name = "grpcio-status" -version = "1.57.0" -description = "Status proto mapping for gRPC" -category = "main" -optional = true -python-versions = ">=3.6" - -[package.dependencies] -googleapis-common-protos = ">=1.5.5" -grpcio = ">=1.57.0" -protobuf = ">=4.21.6" - -[[package]] -name = "grpcio-tools" -version = "1.57.0" -description = "Protobuf code generator for gRPC" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -grpcio = ">=1.57.0" -protobuf = ">=4.21.6,<5.0dev" -setuptools = "*" - -[[package]] -name = "gunicorn" -version = "21.2.0" -description = "WSGI HTTP Server for UNIX" -category = "dev" -optional = false -python-versions = ">=3.5" - -[package.dependencies] -packaging = "*" - -[package.extras] -eventlet = ["eventlet (>=0.24.1)"] -gevent = ["gevent (>=1.4.0)"] -setproctitle = ["setproctitle"] -tornado = ["tornado (>=0.2)"] - -[[package]] -name = "h11" -version = "0.14.0" -description = "A pure-Python, bring-your-own-I/O 
implementation of HTTP/1.1" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "h2" -version = "4.1.0" -description = "HTTP/2 State-Machine based protocol implementation" -category = "main" -optional = true -python-versions = ">=3.6.1" - -[package.dependencies] -hpack = ">=4.0,<5" -hyperframe = ">=6.0,<7" - -[[package]] -name = "hexbytes" -version = "0.3.1" -description = "hexbytes: Python `bytes` subclass that decodes hex, with a readable console output" -category = "main" -optional = false -python-versions = ">=3.7, <4" - -[package.extras] -dev = ["black (>=22)", "bumpversion (>=0.5.3)", "eth-utils (>=1.0.1,<3)", "flake8 (==6.0.0)", "flake8-bugbear (==23.3.23)", "hypothesis (>=3.44.24,<=6.31.6)", "ipython", "isort (>=5.10.1)", "mypy (==0.971)", "pydocstyle (>=5.0.0)", "pytest (>=7.0.0)", "pytest-watch (>=4.1.0)", "pytest-xdist (>=2.4.0)", "sphinx (>=5.0.0)", "sphinx-rtd-theme (>=1.0.0)", "towncrier (>=21,<22)", "tox (>=4.0.0)", "twine", "wheel"] -doc = ["sphinx (>=5.0.0)", "sphinx-rtd-theme (>=1.0.0)", "towncrier (>=21,<22)"] -lint = ["black (>=22)", "flake8 (==6.0.0)", "flake8-bugbear (==23.3.23)", "isort (>=5.10.1)", "mypy (==0.971)", "pydocstyle (>=5.0.0)"] -test = ["eth-utils (>=1.0.1,<3)", "hypothesis (>=3.44.24,<=6.31.6)", "pytest (>=7.0.0)", "pytest-xdist (>=2.4.0)"] - -[[package]] -name = "hologram" -version = "0.0.16" -description = "JSON schema generation from dataclasses" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -jsonschema = ">=3.0" -python-dateutil = ">=2.8,<2.9" - -[[package]] -name = "hpack" -version = "4.0.0" -description = "Pure-Python HPACK header compression" -category = "main" -optional = true -python-versions = ">=3.6.1" - -[[package]] -name = "httpcore" -version = "0.17.3" -description = "A minimal low-level HTTP client." -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -anyio = ">=3.0,<5.0" -certifi = "*" -h11 = ">=0.13,<0.15" -sniffio = ">=1.0.0,<2.0.0" - -[package.extras] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] - -[[package]] -name = "httplib2" -version = "0.22.0" -description = "A comprehensive HTTP client library." -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[package.dependencies] -pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""} - -[[package]] -name = "httpx" -version = "0.24.1" -description = "The next generation HTTP client." 
-category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -certifi = "*" -h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""} -httpcore = ">=0.15.0,<0.18.0" -idna = "*" -sniffio = "*" - -[package.extras] -brotli = ["brotli", "brotlicffi"] -cli = ["click (>=8.0.0,<9.0.0)", "pygments (>=2.0.0,<3.0.0)", "rich (>=10,<14)"] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (>=1.0.0,<2.0.0)"] - -[[package]] -name = "humanfriendly" -version = "10.0" -description = "Human friendly output for text interfaces using Python" -category = "main" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[package.dependencies] -pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} - -[[package]] -name = "humanize" -version = "4.8.0" -description = "Python humanize utilities" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.extras] -tests = ["freezegun", "pytest", "pytest-cov"] - -[[package]] -name = "hyperframe" -version = "6.0.1" -description = "HTTP/2 framing layer for Python" -category = "main" -optional = true -python-versions = ">=3.6.1" - -[[package]] -name = "idna" -version = "3.4" -description = "Internationalized Domain Names in Applications (IDNA)" -category = "main" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "importlib-metadata" -version = "4.13.0" -description = "Read metadata from Python packages" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] -perf = ["ipython"] -testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] - -[[package]] -name = "importlib-resources" -version = "6.0.1" -description = "Read resources from Python packages" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] - -[[package]] -name = "inflection" -version = "0.5.1" -description = "A port of Ruby on Rails inflector to Python" -category = "dev" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "isodate" -version = "0.6.1" -description = "An ISO 8601 date/time/duration parser and formatter" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -six = "*" - -[[package]] -name = "itsdangerous" -version = "2.1.2" -description = "Safely pass data to untrusted environments and back." 
-category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "jaraco-classes" -version = "3.3.0" -description = "Utility functions for Python class constructs" -category = "main" -optional = true -python-versions = ">=3.8" - -[package.dependencies] -more-itertools = "*" - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] - -[[package]] -name = "jeepney" -version = "0.8.0" -description = "Low-level, pure Python DBus protocol wrapper." -category = "main" -optional = true -python-versions = ">=3.7" - -[package.extras] -test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] -trio = ["async_generator", "trio"] - -[[package]] -name = "jinja2" -version = "3.1.2" -description = "A very fast and expressive template engine." -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -MarkupSafe = ">=2.0" - -[package.extras] -i18n = ["Babel (>=2.7)"] - -[[package]] -name = "jinxed" -version = "1.2.0" -description = "Jinxed Terminal Library" -category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -ansicon = {version = "*", markers = "platform_system == \"Windows\""} - -[[package]] -name = "jmespath" -version = "1.0.1" -description = "JSON Matching Expressions" -category = "main" -optional = true -python-versions = ">=3.7" - -[[package]] -name = "jsonpath-ng" -version = "1.5.3" -description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -decorator = "*" -ply = "*" -six = "*" - -[[package]] -name = "jsonschema" -version = "4.19.0" -description = "An implementation of JSON Schema validation for Python" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -attrs = ">=22.2.0" -importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} -jsonschema-specifications = ">=2023.03.6" -pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} -referencing = ">=0.28.4" -rpds-py = ">=0.7.1" - -[package.extras] -format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] -format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] - -[[package]] -name = "jsonschema-specifications" -version = "2023.7.1" -description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} -referencing = ">=0.28.0" - -[[package]] -name = "keyring" -version = "24.2.0" -description = "Store and access your passwords safely." 
-category = "main" -optional = true -python-versions = ">=3.8" - -[package.dependencies] -importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""} -importlib-resources = {version = "*", markers = "python_version < \"3.9\""} -"jaraco.classes" = "*" -jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} -pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""} -SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} - -[package.extras] -completion = ["shtab"] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] - -[[package]] -name = "lazy-object-proxy" -version = "1.9.0" -description = "A fast and thorough lazy object proxy." -category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "leather" -version = "0.3.4" -description = "Python charting for 80% of humans." -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -six = ">=1.6.1" - -[[package]] -name = "limits" -version = "3.6.0" -description = "Rate limiting utilities" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -deprecated = ">=1.2" -importlib-resources = ">=1.3" -packaging = ">=21,<24" -typing-extensions = "*" - -[package.extras] -all = ["aetcd", "coredis (>=3.4.0,<5)", "emcache (>=0.6.1)", "emcache (>=1)", "etcd3", "motor (>=3,<4)", "pymemcache (>3,<5.0.0)", "pymongo (>4.1,<5)", "redis (>3,!=4.5.2,!=4.5.3,<6.0.0)", "redis (>=4.2.0,!=4.5.2,!=4.5.3)"] -async-etcd = ["aetcd"] -async-memcached = ["emcache (>=0.6.1)", "emcache (>=1)"] -async-mongodb = ["motor (>=3,<4)"] -async-redis = ["coredis (>=3.4.0,<5)"] -etcd = ["etcd3"] -memcached = ["pymemcache (>3,<5.0.0)"] -mongodb = ["pymongo (>4.1,<5)"] -redis = ["redis (>3,!=4.5.2,!=4.5.3,<6.0.0)"] -rediscluster = ["redis (>=4.2.0,!=4.5.2,!=4.5.3)"] - -[[package]] -name = "linkify-it-py" -version = "2.0.2" -description = "Links recognition library with FULL unicode support." -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -uc-micro-py = "*" - -[package.extras] -benchmark = ["pytest", "pytest-benchmark"] -dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] -doc = ["myst-parser", "sphinx", "sphinx-book-theme"] -test = ["coverage", "pytest", "pytest-cov"] - -[[package]] -name = "lockfile" -version = "0.12.2" -description = "Platform-independent file locking module" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "logbook" -version = "1.5.3" -description = "A logging replacement for Python" -category = "main" -optional = false -python-versions = "*" - -[package.extras] -all = ["Jinja2", "brotli", "cython", "execnet (>=1.0.9)", "pytest (>4.0)", "pytest-cov (>=2.6)", "pyzmq", "redis", "sqlalchemy"] -compression = ["brotli"] -dev = ["cython", "pytest (>4.0)", "pytest-cov (>=2.6)"] -execnet = ["execnet (>=1.0.9)"] -jinja = ["Jinja2"] -redis = ["redis"] -sqlalchemy = ["sqlalchemy"] -test = ["pytest (>4.0)", "pytest-cov (>=2.6)"] -zmq = ["pyzmq"] - -[[package]] -name = "lxml" -version = "4.9.3" -description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
-category = "main" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" - -[package.extras] -cssselect = ["cssselect (>=0.7)"] -html5 = ["html5lib"] -htmlsoup = ["BeautifulSoup4"] -source = ["Cython (>=0.29.35)"] - -[[package]] -name = "makefun" -version = "1.15.1" -description = "Small library to dynamically create python functions." -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "mako" -version = "1.2.4" -description = "A super-fast templating language that borrows the best ideas from the existing templating languages." -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -MarkupSafe = ">=0.9.2" - -[package.extras] -babel = ["Babel"] -lingua = ["lingua"] -testing = ["pytest"] - -[[package]] -name = "markdown" -version = "3.4.4" -description = "Python implementation of John Gruber's Markdown." -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} - -[package.extras] -docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.0)", "mkdocs-nature (>=0.4)"] -testing = ["coverage", "pyyaml"] - -[[package]] -name = "markdown-it-py" -version = "3.0.0" -description = "Python port of markdown-it. Markdown parsing, done right!" -category = "dev" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -mdurl = ">=0.1,<1.0" - -[package.extras] -benchmarking = ["psutil", "pytest", "pytest-benchmark"] -code-style = ["pre-commit (>=3.0,<4.0)"] -compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] -linkify = ["linkify-it-py (>=1,<3)"] -plugins = ["mdit-py-plugins"] -profiling = ["gprof2dot"] -rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] - -[[package]] -name = "markupsafe" -version = "2.1.3" -description = "Safely add untrusted strings to HTML/XML markup." -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "marshmallow" -version = "3.20.1" -description = "A lightweight library for converting complex datatypes to and from native Python datatypes." 
-category = "dev" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -packaging = ">=17.0" - -[package.extras] -dev = ["flake8 (==6.0.0)", "flake8-bugbear (==23.7.10)", "mypy (==1.4.1)", "pre-commit (>=2.4,<4.0)", "pytest", "pytz", "simplejson", "tox"] -docs = ["alabaster (==0.7.13)", "autodocsumm (==0.2.11)", "sphinx (==7.0.1)", "sphinx-issues (==3.0.1)", "sphinx-version-warning (==1.1.2)"] -lint = ["flake8 (==6.0.0)", "flake8-bugbear (==23.7.10)", "mypy (==1.4.1)", "pre-commit (>=2.4,<4.0)"] -tests = ["pytest", "pytz", "simplejson"] - -[[package]] -name = "marshmallow-oneofschema" -version = "3.0.1" -description = "marshmallow multiplexing schema" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -marshmallow = ">=3.0.0,<4.0.0" - -[package.extras] -dev = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "mock", "pre-commit (>=2.7,<3.0)", "pytest", "tox"] -lint = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "pre-commit (>=2.7,<3.0)"] -tests = ["mock", "pytest"] - -[[package]] -name = "marshmallow-sqlalchemy" -version = "0.26.1" -description = "SQLAlchemy integration with the marshmallow (de)serialization library" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -marshmallow = ">=3.0.0" -SQLAlchemy = ">=1.2.0" - -[package.extras] -dev = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "pre-commit (>=2.0,<3.0)", "pytest", "pytest-lazy-fixture", "tox"] -docs = ["alabaster (==0.7.12)", "sphinx (==4.0.2)", "sphinx-issues (==1.2.0)"] -lint = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "pre-commit (>=2.0,<3.0)"] -tests = ["pytest", "pytest-lazy-fixture"] - -[[package]] -name = "mashumaro" -version = "3.6" -description = "Fast serialization library on top of dataclasses" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -msgpack = {version = ">=0.5.6", optional = true, markers = "extra == \"msgpack\""} -typing-extensions = ">=4.1.0" - -[package.extras] -msgpack = ["msgpack (>=0.5.6)"] -orjson = ["orjson"] -toml = ["tomli (>=1.1.0)", "tomli-w (>=1.0)"] -yaml = ["pyyaml (>=3.13)"] - -[[package]] -name = "mccabe" -version = "0.7.0" -description = "McCabe checker, plugin for flake8" -category = "dev" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "mdit-py-plugins" -version = "0.4.0" -description = "Collection of plugins for markdown-it-py" -category = "dev" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -markdown-it-py = ">=1.0.0,<4.0.0" - -[package.extras] -code-style = ["pre-commit"] -rtd = ["myst-parser", "sphinx-book-theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] - -[[package]] -name = "mdurl" -version = "0.1.2" -description = "Markdown URL utilities" -category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "minimal-snowplow-tracker" -version = "0.0.2" -description = "A minimal snowplow event tracker for Python. 
Add analytics to your Python and Django apps, webapps and games" -category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -requests = ">=2.2.1,<3.0" -six = ">=1.9.0,<2.0" - -[[package]] -name = "more-itertools" -version = "10.1.0" -description = "More routines for operating on iterables, beyond itertools" -category = "main" -optional = true -python-versions = ">=3.8" - -[[package]] -name = "mpmath" -version = "1.3.0" -description = "Python library for arbitrary-precision floating-point arithmetic" -category = "main" -optional = true -python-versions = "*" - -[package.extras] -develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] -docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] -tests = ["pytest (>=4.6)"] - -[[package]] -name = "msal" -version = "1.23.0" -description = "The Microsoft Authentication Library (MSAL) for Python library enables your app to access the Microsoft Cloud by supporting authentication of users with Microsoft Azure Active Directory accounts (AAD) and Microsoft Accounts (MSA) using industry standard OAuth2 and OpenID Connect." -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -cryptography = ">=0.6,<44" -PyJWT = {version = ">=1.0.0,<3", extras = ["crypto"]} -requests = ">=2.0.0,<3" - -[package.extras] -broker = ["pymsalruntime (>=0.13.2,<0.14)"] - -[[package]] -name = "msal-extensions" -version = "1.0.0" -description = "Microsoft Authentication Library extensions (MSAL EX) provides a persistence API that can save your data on disk, encrypted on Windows, macOS and Linux. Concurrent data access will be coordinated by a file lock mechanism." -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -msal = ">=0.4.1,<2.0.0" -portalocker = [ - {version = ">=1.0,<3", markers = "python_version >= \"3.5\" and platform_system != \"Windows\""}, - {version = ">=1.6,<3", markers = "python_version >= \"3.5\" and platform_system == \"Windows\""}, -] - -[[package]] -name = "msgpack" -version = "1.0.5" -description = "MessagePack serializer" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "multidict" -version = "6.0.4" -description = "multidict implementation" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "mypy" -version = "1.6.1" -description = "Optional static typing for Python" -category = "dev" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -mypy-extensions = ">=1.0.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.1.0" - -[package.extras] -dmypy = ["psutil (>=4.0)"] -install-types = ["pip"] -reports = ["lxml"] - -[[package]] -name = "mypy-boto3-athena" -version = "1.28.36" -description = "Type annotations for boto3.Athena 1.28.36 service generated with mypy-boto3-builder 7.18.0" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} - -[[package]] -name = "mypy-boto3-glue" -version = "1.28.36" -description = "Type annotations for boto3.Glue 1.28.36 service generated with mypy-boto3-builder 7.18.0" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} - -[[package]] -name = "mypy-boto3-lakeformation" -version = "1.28.36" -description = "Type annotations for boto3.LakeFormation 1.28.36 service 
generated with mypy-boto3-builder 7.18.0" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} - -[[package]] -name = "mypy-boto3-sts" -version = "1.28.37" -description = "Type annotations for boto3.STS 1.28.37 service generated with mypy-boto3-builder 7.18.2" -category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} - -[[package]] -name = "mypy-extensions" -version = "1.0.0" -description = "Type system extensions for programs checked with the mypy type checker." -category = "dev" -optional = false -python-versions = ">=3.5" - -[[package]] -name = "natsort" -version = "8.4.0" -description = "Simple yet flexible natural sorting in Python." -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -fast = ["fastnumbers (>=2.0.0)"] -icu = ["PyICU (>=1.0.0)"] - -[[package]] -name = "networkx" -version = "2.8.8" -description = "Python package for creating and manipulating graphs and networks" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.extras] -default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"] -developer = ["mypy (>=0.982)", "pre-commit (>=2.20)"] -doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.2)", "pydata-sphinx-theme (>=0.11)", "sphinx (>=5.2)", "sphinx-gallery (>=0.11)", "texext (>=0.6.6)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"] -test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] - -[[package]] -name = "nr-date" -version = "2.1.0" -description = "" -category = "dev" -optional = false -python-versions = ">=3.6,<4.0" - -[[package]] -name = "nr-stream" -version = "1.1.5" -description = "" -category = "dev" -optional = false -python-versions = ">=3.6,<4.0" - -[[package]] -name = "nr-util" -version = "0.8.12" -description = "General purpose Python utility library." 
-category = "dev" -optional = false -python-versions = ">=3.7,<4.0" - -[package.dependencies] -deprecated = ">=1.2.0,<2.0.0" -typing-extensions = ">=3.0.0" - -[[package]] -name = "numpy" -version = "1.24.4" -description = "Fundamental package for array computing in Python" -category = "main" -optional = false -python-versions = ">=3.8" - -[[package]] -name = "numpy" -version = "1.26.1" -description = "Fundamental package for array computing in Python" -category = "main" -optional = false -python-versions = "<3.13,>=3.9" - -[[package]] -name = "oauthlib" -version = "3.2.2" -description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" -category = "main" -optional = false -python-versions = ">=3.6" - -[package.extras] -rsa = ["cryptography (>=3.0.0)"] -signals = ["blinker (>=1.4.0)"] -signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] - -[[package]] -name = "onnx" -version = "1.15.0" -description = "Open Neural Network Exchange" -category = "main" -optional = true -python-versions = ">=3.8" - -[package.dependencies] -numpy = "*" -protobuf = ">=3.20.2" - -[package.extras] -reference = ["Pillow", "google-re2"] - -[[package]] -name = "onnxruntime" -version = "1.16.1" -description = "ONNX Runtime is a runtime accelerator for Machine Learning models" -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -coloredlogs = "*" -flatbuffers = "*" -numpy = ">=1.21.6" -packaging = "*" -protobuf = "*" -sympy = "*" - -[[package]] -name = "opentelemetry-api" -version = "1.15.0" -description = "OpenTelemetry Python API" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -deprecated = ">=1.2.6" -setuptools = ">=16.0" - -[[package]] -name = "opentelemetry-exporter-otlp" -version = "1.15.0" -description = "OpenTelemetry Collector Exporters" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -opentelemetry-exporter-otlp-proto-grpc = "1.15.0" -opentelemetry-exporter-otlp-proto-http = "1.15.0" - -[[package]] -name = "opentelemetry-exporter-otlp-proto-grpc" -version = "1.15.0" -description = "OpenTelemetry Collector Protobuf over gRPC Exporter" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""} -googleapis-common-protos = ">=1.52,<2.0" -grpcio = ">=1.0.0,<2.0.0" -opentelemetry-api = ">=1.12,<2.0" -opentelemetry-proto = "1.15.0" -opentelemetry-sdk = ">=1.12,<2.0" - -[package.extras] -test = ["pytest-grpc"] - -[[package]] -name = "opentelemetry-exporter-otlp-proto-http" -version = "1.15.0" -description = "OpenTelemetry Collector Protobuf over HTTP Exporter" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""} -googleapis-common-protos = ">=1.52,<2.0" -opentelemetry-api = ">=1.12,<2.0" -opentelemetry-proto = "1.15.0" -opentelemetry-sdk = ">=1.12,<2.0" -requests = ">=2.7,<3.0" - -[package.extras] -test = ["responses (==0.22.0)"] - -[[package]] -name = "opentelemetry-proto" -version = "1.15.0" -description = "OpenTelemetry Python Proto" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -protobuf = ">=3.19,<5.0" - -[[package]] -name = "opentelemetry-sdk" -version = "1.15.0" -description = "OpenTelemetry Python SDK" -category = "dev" -optional = false -python-versions = ">=3.7" - 
-[package.dependencies] -opentelemetry-api = "1.15.0" -opentelemetry-semantic-conventions = "0.36b0" -setuptools = ">=16.0" -typing-extensions = ">=3.7.4" - -[[package]] -name = "opentelemetry-semantic-conventions" -version = "0.36b0" -description = "OpenTelemetry Semantic Conventions" -category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "ordered-set" -version = "4.1.0" -description = "An OrderedSet is a custom MutableSet that remembers its order, so that every" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -dev = ["black", "mypy", "pytest"] - -[[package]] -name = "orjson" -version = "3.9.5" -description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "packaging" -version = "23.1" -description = "Core utilities for Python packages" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "pandas" -version = "2.0.3" -description = "Powerful data structures for data analysis, time series, and statistics" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -numpy = [ - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, -] -python-dateutil = ">=2.8.2" -pytz = ">=2020.1" -tzdata = ">=2022.1" - -[package.extras] -all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] -aws = ["s3fs (>=2021.08.0)"] -clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] -compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] -computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] -feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2021.07.0)"] -gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] -hdf5 = ["tables (>=3.6.1)"] -html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] -mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] -parquet = ["pyarrow (>=7.0.0)"] -performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] -plot = ["matplotlib (>=3.6.1)"] -postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] -spss = ["pyreadstat (>=1.1.2)"] -sql-other = ["SQLAlchemy (>=1.4.16)"] -test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.6.3)"] - -[[package]] -name = "parsedatetime" -version = "2.4" -description = "Parse human-readable date/time text." 
-category = "main" -optional = false -python-versions = "*" - -[package.dependencies] -future = "*" - -[[package]] -name = "pathspec" -version = "0.11.2" -description = "Utility library for gitignore style pattern matching of file paths." -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "pathvalidate" -version = "3.1.0" -description = "pathvalidate is a Python library to sanitize/validate a string such as filenames/file-paths/etc." -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["Sphinx (>=2.4)", "sphinx-rtd-theme (>=1.2.2)", "urllib3 (<2)"] -test = ["Faker (>=1.0.8)", "allpairspy (>=2)", "click (>=6.2)", "pytest (>=6.0.1)", "pytest-discord (>=0.1.2)", "pytest-md-report (>=0.3)"] - -[[package]] -name = "pbr" -version = "5.11.1" -description = "Python Build Reasonableness" -category = "dev" -optional = false -python-versions = ">=2.6" - -[[package]] -name = "pendulum" -version = "2.1.2" -description = "Python datetimes made easy" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[package.dependencies] -python-dateutil = ">=2.6,<3.0" -pytzdata = ">=2020.1" - -[[package]] -name = "pipdeptree" -version = "2.9.6" -description = "Command line utility to show dependency tree of packages." -category = "main" -optional = true -python-versions = ">=3.7" - -[package.extras] -graphviz = ["graphviz (>=0.20.1)"] -test = ["covdefaults (>=2.3)", "diff-cover (>=7.6)", "pip (>=23.1.2)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "virtualenv (>=20.23.1,<21)"] - -[[package]] -name = "pkgutil-resolve-name" -version = "1.3.10" -description = "Resolve a name to an object." -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "platformdirs" -version = "3.8.1" -description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
-category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] - -[[package]] -name = "pluggy" -version = "1.3.0" -description = "plugin and hook calling mechanisms for python" -category = "dev" -optional = false -python-versions = ">=3.8" - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[[package]] -name = "ply" -version = "3.11" -description = "Python Lex & Yacc" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "portalocker" -version = "2.7.0" -description = "Wraps the portalocker recipe for easy usage" -category = "main" -optional = true -python-versions = ">=3.5" - -[package.dependencies] -pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""} - -[package.extras] -docs = ["sphinx (>=1.7.1)"] -redis = ["redis"] -tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "pytest-timeout (>=2.1.0)", "redis", "sphinx (>=6.0.0)"] - -[[package]] -name = "prefixed" -version = "0.7.0" -description = "Prefixed alternative numeric library" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "prison" -version = "0.2.1" -description = "Rison encoder/decoder" -category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -six = "*" - -[package.extras] -dev = ["nose", "pipreqs", "twine"] - -[[package]] -name = "proto-plus" -version = "1.22.3" -description = "Beautiful, Pythonic protocol buffers." -category = "main" -optional = true -python-versions = ">=3.6" - -[package.dependencies] -protobuf = ">=3.19.0,<5.0.0dev" - -[package.extras] -testing = ["google-api-core[grpc] (>=1.31.5)"] - -[[package]] -name = "protobuf" -version = "4.24.2" -description = "" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "psutil" -version = "5.9.5" -description = "Cross-platform lib for process and system monitoring in Python." -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[package.extras] -test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] - -[[package]] -name = "psycopg2-binary" -version = "2.9.7" -description = "psycopg2 - Python-PostgreSQL Database Adapter" -category = "main" -optional = true -python-versions = ">=3.6" - -[[package]] -name = "psycopg2cffi" -version = "2.9.0" -description = ".. 
image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=master" -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -cffi = ">=1.0" -six = "*" - -[[package]] -name = "py" -version = "1.11.0" -description = "library with cross-python path, ini-parsing, io, code, log facilities" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[[package]] -name = "pyarrow" -version = "14.0.1" -description = "Python library for Apache Arrow" -category = "main" -optional = true -python-versions = ">=3.8" - -[package.dependencies] -numpy = ">=1.16.6" - -[[package]] -name = "pyasn1" -version = "0.5.0" -description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" - -[[package]] -name = "pyasn1-modules" -version = "0.3.0" -description = "A collection of ASN.1-based protocols modules" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" - -[package.dependencies] -pyasn1 = ">=0.4.6,<0.6.0" - -[[package]] -name = "pyathena" -version = "3.0.6" -description = "Python DB API 2.0 (PEP 249) client for Amazon Athena" -category = "main" -optional = true -python-versions = ">=3.8.1" - -[package.dependencies] -boto3 = ">=1.26.4" -botocore = ">=1.29.4" -fsspec = "*" -tenacity = ">=4.1.0" - -[package.extras] -arrow = ["pyarrow (>=7.0.0)"] -fastparquet = ["fastparquet (>=0.4.0)"] -pandas = ["pandas (>=1.3.0)"] -sqlalchemy = ["sqlalchemy (>=1.0.0)"] - -[[package]] -name = "pycodestyle" -version = "2.9.1" -description = "Python style guide checker" -category = "dev" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "pycparser" -version = "2.21" -description = "C parser in Python" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[[package]] -name = "pydantic" -version = "2.5.0" -description = "Data validation using Python type hints" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -annotated-types = ">=0.4.0" -pydantic-core = "2.14.1" -typing-extensions = ">=4.6.1" - -[package.extras] -email = ["email-validator (>=2.0.0)"] - -[[package]] -name = "pydantic-core" -version = "2.14.1" -description = "" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" - -[[package]] -name = "pydoc-markdown" -version = "4.8.2" -description = "Create Python API documentation in Markdown format." -category = "dev" -optional = false -python-versions = ">=3.7,<4.0" - -[package.dependencies] -click = ">=7.1,<9.0" -"databind.core" = ">=4.4.0,<5.0.0" -"databind.json" = ">=4.4.0,<5.0.0" -docspec = ">=2.2.1,<3.0.0" -docspec-python = ">=2.2.1,<3.0.0" -docstring-parser = ">=0.11,<0.12" -jinja2 = ">=3.0.0,<4.0.0" -"nr.util" = ">=0.7.5,<1.0.0" -PyYAML = ">=5.0,<7.0" -requests = ">=2.23.0,<3.0.0" -tomli = ">=2.0.0,<3.0.0" -tomli_w = ">=1.0.0,<2.0.0" -watchdog = "*" -yapf = ">=0.30.0" - -[[package]] -name = "pyflakes" -version = "2.5.0" -description = "passive checker of Python programs" -category = "dev" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "pygments" -version = "2.16.1" -description = "Pygments is a syntax highlighting package written in Python." 
-category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -plugins = ["importlib-metadata"] - -[[package]] -name = "pyjwt" -version = "2.8.0" -description = "JSON Web Token implementation in Python" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""} - -[package.extras] -crypto = ["cryptography (>=3.4.0)"] -dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] -docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] -tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] - -[[package]] -name = "pymongo" -version = "4.6.0" -description = "Python driver for MongoDB " -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -dnspython = ">=1.16.0,<3.0.0" - -[package.extras] -aws = ["pymongo-auth-aws (<2.0.0)"] -encryption = ["certifi", "pymongo[aws]", "pymongocrypt (>=1.6.0,<2.0.0)"] -gssapi = ["pykerberos", "winkerberos (>=0.5.0)"] -ocsp = ["certifi", "cryptography (>=2.5)", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] -snappy = ["python-snappy"] -test = ["pytest (>=7)"] -zstd = ["zstandard"] - -[[package]] -name = "pymysql" -version = "1.1.0" -description = "Pure Python MySQL Driver" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -ed25519 = ["PyNaCl (>=1.4.0)"] -rsa = ["cryptography"] - -[[package]] -name = "pyodbc" -version = "4.0.39" -description = "DB API Module for ODBC" -category = "main" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" - -[[package]] -name = "pyopenssl" -version = "23.2.0" -description = "Python wrapper module around the OpenSSL library" -category = "main" -optional = true -python-versions = ">=3.6" - -[package.dependencies] -cryptography = ">=38.0.0,<40.0.0 || >40.0.0,<40.0.1 || >40.0.1,<42" - -[package.extras] -docs = ["sphinx (!=5.2.0,!=5.2.0.post0)", "sphinx-rtd-theme"] -test = ["flaky", "pretend", "pytest (>=3.0.1)"] - -[[package]] -name = "pyparsing" -version = "3.1.1" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -category = "dev" -optional = false -python-versions = ">=3.6.8" - -[package.extras] -diagrams = ["jinja2", "railroad-diagrams"] - -[[package]] -name = "pypdf2" -version = "3.0.1" -description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -typing_extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} - -[package.extras] -crypto = ["PyCryptodome"] -dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] -docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] -full = ["Pillow", "PyCryptodome"] -image = ["Pillow"] - -[[package]] -name = "pyreadline3" -version = "3.4.1" -description = "A python implementation of GNU readline." 
-category = "main" -optional = true -python-versions = "*" - -[[package]] -name = "pytest" -version = "6.2.5" -description = "pytest: simple powerful testing with Python" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} -attrs = ">=19.2.0" -colorama = {version = "*", markers = "sys_platform == \"win32\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" -py = ">=1.8.2" -toml = "*" - -[package.extras] -testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] - -[[package]] -name = "pytest-cases" -version = "3.6.14" -description = "Separate test code from test cases in pytest." -category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -decopatch = "*" -makefun = ">=1.9.5" - -[[package]] -name = "pytest-console-scripts" -version = "1.4.1" -description = "Pytest plugin for testing console scripts" -category = "dev" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.10\""} -pytest = ">=4.0.0" - -[[package]] -name = "pytest-forked" -version = "1.6.0" -description = "run tests in isolated forked subprocesses" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -py = "*" -pytest = ">=3.10" - -[[package]] -name = "pytest-order" -version = "1.1.0" -description = "pytest plugin to run your tests in a specific order" -category = "dev" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -pytest = [ - {version = ">=5.0", markers = "python_version < \"3.10\""}, - {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, -] - -[[package]] -name = "pytest-pythonpath" -version = "0.7.4" -description = "pytest plugin for adding to the PYTHONPATH from command line or configs." -category = "dev" -optional = false -python-versions = ">=2.6, <4" - -[package.dependencies] -pytest = ">=2.5.2,<7" - -[[package]] -name = "python-daemon" -version = "3.0.1" -description = "Library to implement a well-behaved Unix daemon process." 
-category = "dev" -optional = false -python-versions = ">=3" - -[package.dependencies] -docutils = "*" -lockfile = ">=0.10" -setuptools = ">=62.4.0" - -[package.extras] -devel = ["coverage", "docutils", "isort", "testscenarios (>=0.4)", "testtools", "twine"] -test = ["coverage", "docutils", "testscenarios (>=0.4)", "testtools"] - -[[package]] -name = "python-dateutil" -version = "2.8.2" -description = "Extensions to the standard Python datetime module" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "python-nvd3" -version = "0.15.0" -description = "Python NVD3 - Chart Library for d3.js" -category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -Jinja2 = ">=2.8" -python-slugify = ">=1.2.5" - -[[package]] -name = "python-slugify" -version = "8.0.1" -description = "A Python slugify application that also handles Unicode" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -text-unidecode = ">=1.3" - -[package.extras] -unidecode = ["Unidecode (>=1.1.1)"] - -[[package]] -name = "pytimeparse" -version = "1.1.8" -description = "Time expression parser" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pytz" -version = "2023.3" -description = "World timezone definitions, modern and historical" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "pytzdata" -version = "2020.1" -description = "The Olson timezone database for Python." -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[[package]] -name = "pywin32" -version = "306" -description = "Python for Window Extensions" -category = "main" -optional = true -python-versions = "*" - -[[package]] -name = "pywin32-ctypes" -version = "0.2.2" -description = "A (partial) reimplementation of pywin32 using ctypes/cffi" -category = "main" -optional = true -python-versions = ">=3.6" - -[[package]] -name = "pyyaml" -version = "6.0.1" -description = "YAML parser and emitter for Python" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "qdrant-client" -version = "1.6.4" -description = "Client library for the Qdrant vector search engine" -category = "main" -optional = true -python-versions = ">=3.8,<3.13" - -[package.dependencies] -fastembed = {version = "0.1.1", optional = true, markers = "python_version < \"3.12\" and extra == \"fastembed\""} -grpcio = ">=1.41.0" -grpcio-tools = ">=1.41.0" -httpx = {version = ">=0.14.0", extras = ["http2"]} -numpy = [ - {version = ">=1.21", markers = "python_version >= \"3.8\" and python_version < \"3.12\""}, - {version = ">=1.26", markers = "python_version >= \"3.12\""}, -] -portalocker = ">=2.7.0,<3.0.0" -pydantic = ">=1.10.8" -urllib3 = ">=1.26.14,<2.0.0" - -[package.extras] -fastembed = ["fastembed (==0.1.1)"] - -[[package]] -name = "redshift-connector" -version = "2.0.913" -description = "Redshift interface library" -category = "main" -optional = true -python-versions = ">=3.6" - -[package.dependencies] -beautifulsoup4 = ">=4.7.0,<5.0.0" -boto3 = ">=1.9.201,<2.0.0" -botocore = ">=1.12.201,<2.0.0" -lxml = ">=4.6.5" -packaging = "*" -pytz = ">=2020.1" -requests = ">=2.23.0,<3.0.0" -scramp = ">=1.2.0,<1.5.0" -setuptools = "*" - -[package.extras] -full = ["numpy", "pandas"] - -[[package]] -name = "referencing" -version = "0.30.2" -description = "JSON Referencing + Python" -category = "main" -optional = false 
-python-versions = ">=3.8" - -[package.dependencies] -attrs = ">=22.2.0" -rpds-py = ">=0.7.0" - -[[package]] -name = "regex" -version = "2023.8.8" -description = "Alternative regular expression module, to replace re." -category = "dev" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "requests" -version = "2.31.0" -description = "Python HTTP for Humans." -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[[package]] -name = "requests-mock" -version = "1.11.0" -description = "Mock out responses from the requests package" -category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -requests = ">=2.3,<3" -six = "*" - -[package.extras] -fixture = ["fixtures"] -test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"] - -[[package]] -name = "requests-oauthlib" -version = "1.3.1" -description = "OAuthlib authentication support for Requests." -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[package.dependencies] -oauthlib = ">=3.0.0" -requests = ">=2.0.0" - -[package.extras] -rsa = ["oauthlib[signedtoken] (>=3.0.0)"] - -[[package]] -name = "requests-toolbelt" -version = "1.0.0" -description = "A utility belt for advanced users of python-requests" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" - -[package.dependencies] -requests = ">=2.0.1,<3.0.0" - -[[package]] -name = "requirements-parser" -version = "0.5.0" -description = "This is a small Python module for parsing Pip requirement files." 
-category = "main" -optional = false -python-versions = ">=3.6,<4.0" - -[package.dependencies] -types-setuptools = ">=57.0.0" - -[[package]] -name = "rfc3339-validator" -version = "0.1.4" -description = "A pure python RFC3339 validator" -category = "dev" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - -[package.dependencies] -six = "*" - -[[package]] -name = "rich" -version = "13.5.2" -description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -category = "dev" -optional = false -python-versions = ">=3.7.0" - -[package.dependencies] -markdown-it-py = ">=2.2.0" -pygments = ">=2.13.0,<3.0.0" -typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} - -[package.extras] -jupyter = ["ipywidgets (>=7.5.1,<9)"] - -[[package]] -name = "rich-argparse" -version = "1.3.0" -description = "Rich help formatters for argparse and optparse" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -rich = ">=11.0.0" - -[[package]] -name = "rpds-py" -version = "0.10.0" -description = "Python bindings to Rust's persistent data structures (rpds)" -category = "main" -optional = false -python-versions = ">=3.8" - -[[package]] -name = "rsa" -version = "4.9" -description = "Pure-Python RSA implementation" -category = "main" -optional = false -python-versions = ">=3.6,<4" - -[package.dependencies] -pyasn1 = ">=0.1.3" - -[[package]] -name = "s3fs" -version = "2023.6.0" -description = "Convenient Filesystem interface over S3" -category = "main" -optional = true -python-versions = ">= 3.8" - -[package.dependencies] -aiobotocore = ">=2.5.0,<2.6.0" -aiohttp = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1" -fsspec = "2023.6.0" - -[package.extras] -awscli = ["aiobotocore[awscli] (>=2.5.0,<2.6.0)"] -boto3 = ["aiobotocore[boto3] (>=2.5.0,<2.6.0)"] - -[[package]] -name = "s3transfer" -version = "0.6.2" -description = "An Amazon S3 Transfer Manager" -category = "main" -optional = true -python-versions = ">= 3.7" - -[package.dependencies] -botocore = ">=1.12.36,<2.0a.0" - -[package.extras] -crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] - -[[package]] -name = "scramp" -version = "1.4.4" -description = "An implementation of the SCRAM protocol." 
-category = "main" -optional = true -python-versions = ">=3.7" - -[package.dependencies] -asn1crypto = ">=1.5.1" - -[[package]] -name = "secretstorage" -version = "3.3.3" -description = "Python bindings to FreeDesktop.org Secret Service API" -category = "main" -optional = true -python-versions = ">=3.6" - -[package.dependencies] -cryptography = ">=2.0" -jeepney = ">=0.6" - -[[package]] -name = "semver" -version = "3.0.1" -description = "Python helper for Semantic Versioning (https://semver.org)" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "sentry-sdk" -version = "1.30.0" -description = "Python client for Sentry (https://sentry.io)" -category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -certifi = "*" -urllib3 = {version = ">=1.26.11", markers = "python_version >= \"3.6\""} - -[package.extras] -aiohttp = ["aiohttp (>=3.5)"] -arq = ["arq (>=0.23)"] -beam = ["apache-beam (>=2.12)"] -bottle = ["bottle (>=0.12.13)"] -celery = ["celery (>=3)"] -chalice = ["chalice (>=1.16.0)"] -django = ["django (>=1.8)"] -falcon = ["falcon (>=1.4)"] -fastapi = ["fastapi (>=0.79.0)"] -flask = ["blinker (>=1.1)", "flask (>=0.11)", "markupsafe"] -grpcio = ["grpcio (>=1.21.1)"] -httpx = ["httpx (>=0.16.0)"] -huey = ["huey (>=2)"] -loguru = ["loguru (>=0.5)"] -opentelemetry = ["opentelemetry-distro (>=0.35b0)"] -opentelemetry-experimental = ["opentelemetry-distro (>=0.40b0,<1.0)", "opentelemetry-instrumentation-aiohttp-client (>=0.40b0,<1.0)", "opentelemetry-instrumentation-django (>=0.40b0,<1.0)", "opentelemetry-instrumentation-fastapi (>=0.40b0,<1.0)", "opentelemetry-instrumentation-flask (>=0.40b0,<1.0)", "opentelemetry-instrumentation-requests (>=0.40b0,<1.0)", "opentelemetry-instrumentation-sqlite3 (>=0.40b0,<1.0)", "opentelemetry-instrumentation-urllib (>=0.40b0,<1.0)"] -pure-eval = ["asttokens", "executing", "pure-eval"] -pymongo = ["pymongo (>=3.1)"] -pyspark = ["pyspark (>=2.4.4)"] -quart = ["blinker (>=1.1)", "quart (>=0.16.1)"] -rq = ["rq (>=0.6)"] -sanic = ["sanic (>=0.8)"] -sqlalchemy = ["sqlalchemy (>=1.2)"] -starlette = ["starlette (>=0.19.1)"] -starlite = ["starlite (>=1.48)"] -tornado = ["tornado (>=5)"] - -[[package]] -name = "setproctitle" -version = "1.3.2" -description = "A Python module to customize the process title" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -test = ["pytest"] - -[[package]] -name = "setuptools" -version = "68.1.2" -description = "Easily download, build, install, upgrade, and uninstall Python packages" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5,<=7.1.2)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", 
"pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] - -[[package]] -name = "simplejson" -version = "3.19.1" -description = "Simple, fast, extensible JSON encoder/decoder for Python" -category = "main" -optional = false -python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "smmap" -version = "5.0.0" -description = "A pure Python implementation of a sliding window memory map manager" -category = "main" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "sniffio" -version = "1.3.0" -description = "Sniff out which async library your code is running under" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "snowflake-connector-python" -version = "3.5.0" -description = "Snowflake Connector for Python" -category = "main" -optional = true -python-versions = ">=3.8" - -[package.dependencies] -asn1crypto = ">0.24.0,<2.0.0" -certifi = ">=2017.4.17" -cffi = ">=1.9,<2.0.0" -charset-normalizer = ">=2,<4" -cryptography = ">=3.1.0,<42.0.0" -filelock = ">=3.5,<4" -idna = ">=2.5,<4" -keyring = {version = "<16.1.0 || >16.1.0,<25.0.0", optional = true, markers = "extra == \"secure-local-storage\""} -packaging = "*" -pandas = {version = ">=1.0.0,<2.1.0", optional = true, markers = "extra == \"pandas\""} -platformdirs = ">=2.6.0,<4.0.0" -pyarrow = {version = "*", optional = true, markers = "extra == \"pandas\""} -pyjwt = "<3.0.0" -pyOpenSSL = ">=16.2.0,<24.0.0" -pytz = "*" -requests = "<3.0.0" -sortedcontainers = ">=2.4.0" -tomlkit = "*" -typing-extensions = ">=4.3,<5" -urllib3 = ">=1.21.1,<2.0.0" - -[package.extras] -development = ["Cython", "coverage", "more-itertools", "numpy (<1.27.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] -pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow"] -secure-local-storage = ["keyring (!=16.1.0,<25.0.0)"] - -[[package]] -name = "sortedcontainers" -version = "2.4.0" -description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" -category = "main" -optional = true -python-versions = "*" - -[[package]] -name = "soupsieve" -version = "2.5" -description = "A modern CSS selector implementation for Beautiful Soup." 
-category = "main" -optional = true -python-versions = ">=3.8" - -[[package]] -name = "sqlalchemy" -version = "1.4.49" -description = "Database Abstraction Library" -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" - -[package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} - -[package.extras] -aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] -aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing_extensions (!=3.10.0.1)"] -asyncio = ["greenlet (!=0.4.17)"] -asyncmy = ["asyncmy (>=0.2.3,!=0.2.4)", "greenlet (!=0.4.17)"] -mariadb-connector = ["mariadb (>=1.0.1,!=1.1.2)"] -mssql = ["pyodbc"] -mssql-pymssql = ["pymssql"] -mssql-pyodbc = ["pyodbc"] -mypy = ["mypy (>=0.910)", "sqlalchemy2-stubs"] -mysql = ["mysqlclient (>=1.4.0)", "mysqlclient (>=1.4.0,<2)"] -mysql-connector = ["mysql-connector-python"] -oracle = ["cx_oracle (>=7)", "cx_oracle (>=7,<8)"] -postgresql = ["psycopg2 (>=2.7)"] -postgresql-asyncpg = ["asyncpg", "greenlet (!=0.4.17)"] -postgresql-pg8000 = ["pg8000 (>=1.16.6,!=1.29.0)"] -postgresql-psycopg2binary = ["psycopg2-binary"] -postgresql-psycopg2cffi = ["psycopg2cffi"] -pymysql = ["pymysql", "pymysql (<1)"] -sqlcipher = ["sqlcipher3_binary"] - -[[package]] -name = "sqlalchemy-jsonfield" -version = "1.0.1.post0" -description = "SQLALchemy JSONField implementation for storing dicts at SQL" -category = "dev" -optional = false -python-versions = ">=3.7.0" - -[package.dependencies] -sqlalchemy = "*" - -[[package]] -name = "sqlalchemy-utils" -version = "0.41.1" -description = "Various utility functions for SQLAlchemy." 
-category = "dev" -optional = false -python-versions = ">=3.6" - -[package.dependencies] -SQLAlchemy = ">=1.3" - -[package.extras] -arrow = ["arrow (>=0.3.4)"] -babel = ["Babel (>=1.3)"] -color = ["colour (>=0.0.4)"] -encrypted = ["cryptography (>=0.6)"] -intervals = ["intervals (>=0.7.1)"] -password = ["passlib (>=1.6,<2.0)"] -pendulum = ["pendulum (>=2.0.5)"] -phone = ["phonenumbers (>=5.9.2)"] -test = ["Jinja2 (>=2.3)", "Pygments (>=1.2)", "backports.zoneinfo", "docutils (>=0.10)", "flake8 (>=2.4.0)", "flexmock (>=0.9.7)", "isort (>=4.2.2)", "pg8000 (>=1.12.4)", "psycopg (>=3.1.8)", "psycopg2 (>=2.5.1)", "psycopg2cffi (>=2.8.1)", "pymysql", "pyodbc", "pytest (>=2.7.1)", "python-dateutil (>=2.6)", "pytz (>=2014.2)"] -test-all = ["Babel (>=1.3)", "Jinja2 (>=2.3)", "Pygments (>=1.2)", "arrow (>=0.3.4)", "backports.zoneinfo", "colour (>=0.0.4)", "cryptography (>=0.6)", "docutils (>=0.10)", "flake8 (>=2.4.0)", "flexmock (>=0.9.7)", "furl (>=0.4.1)", "intervals (>=0.7.1)", "isort (>=4.2.2)", "passlib (>=1.6,<2.0)", "pendulum (>=2.0.5)", "pg8000 (>=1.12.4)", "phonenumbers (>=5.9.2)", "psycopg (>=3.1.8)", "psycopg2 (>=2.5.1)", "psycopg2cffi (>=2.8.1)", "pymysql", "pyodbc", "pytest (>=2.7.1)", "python-dateutil", "python-dateutil (>=2.6)", "pytz (>=2014.2)"] -timezone = ["python-dateutil"] -url = ["furl (>=0.4.1)"] - -[[package]] -name = "sqlfluff" -version = "2.3.2" -description = "The SQL Linter for Humans" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -appdirs = "*" -chardet = "*" -click = "*" -colorama = ">=0.3" -diff-cover = ">=2.5.0" -importlib-resources = {version = "*", markers = "python_version < \"3.9\""} -Jinja2 = "*" -pathspec = "*" -pytest = "*" -pyyaml = ">=5.1" -regex = "*" -tblib = "*" -toml = {version = "*", markers = "python_version < \"3.11\""} -tqdm = "*" -typing-extensions = "*" - -[[package]] -name = "sqlparse" -version = "0.4.4" -description = "A non-validating SQL parser." -category = "main" -optional = false -python-versions = ">=3.5" - -[package.extras] -dev = ["build", "flake8"] -doc = ["sphinx"] -test = ["pytest", "pytest-cov"] - -[[package]] -name = "stevedore" -version = "5.1.0" -description = "Manage dynamic plugins for Python applications" -category = "dev" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -pbr = ">=2.0.0,<2.1.0 || >2.1.0" - -[[package]] -name = "sympy" -version = "1.12" -description = "Computer algebra system (CAS) in Python" -category = "main" -optional = true -python-versions = ">=3.8" - -[package.dependencies] -mpmath = ">=0.19" - -[[package]] -name = "tabulate" -version = "0.9.0" -description = "Pretty-print tabular data" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -widechars = ["wcwidth"] - -[[package]] -name = "tblib" -version = "2.0.0" -description = "Traceback serialization library." 
-category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "tenacity" -version = "8.2.3" -description = "Retry code until it succeeds" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -doc = ["reno", "sphinx", "tornado (>=4.5)"] - -[[package]] -name = "termcolor" -version = "2.3.0" -description = "ANSI color formatting for output in terminal" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -tests = ["pytest", "pytest-cov"] - -[[package]] -name = "text-unidecode" -version = "1.3" -description = "The most basic Text::Unidecode port" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "tokenizers" -version = "0.13.3" -description = "Fast and Customizable Tokenizers" -category = "main" -optional = true -python-versions = "*" - -[package.extras] -dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] -docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] -testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] - -[[package]] -name = "toml" -version = "0.10.2" -description = "Python Library for Tom's Obvious, Minimal Language" -category = "dev" -optional = false -python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" - -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "tomli-w" -version = "1.0.0" -description = "A lil' TOML writer" -category = "dev" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "tomlkit" -version = "0.12.1" -description = "Style preserving TOML library" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "tqdm" -version = "4.66.1" -description = "Fast, Extensible Progress Meter" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[package.extras] -dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] -notebook = ["ipywidgets (>=6)"] -slack = ["slack-sdk"] -telegram = ["requests"] - -[[package]] -name = "typeapi" -version = "2.1.1" -description = "" -category = "dev" -optional = false -python-versions = ">=3.6.3,<4.0.0" - -[package.dependencies] -typing-extensions = ">=3.0.0" - -[[package]] -name = "types-awscrt" -version = "0.19.1" -description = "Type annotations and code completion for awscrt" -category = "main" -optional = false -python-versions = ">=3.7,<4.0" - -[[package]] -name = "types-cachetools" -version = "5.3.0.6" -description = "Typing stubs for cachetools" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-click" -version = "7.1.8" -description = "Typing stubs for click" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-deprecated" -version = "1.2.9.3" -description = "Typing stubs for Deprecated" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-protobuf" -version = "4.24.0.1" -description = "Typing stubs for protobuf" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-psutil" -version = "5.9.5.16" -description = "Typing stubs for psutil" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-psycopg2" -version = "2.9.21.14" -description = "Typing stubs for psycopg2" -category = "dev" -optional = false -python-versions = "*" 
- -[[package]] -name = "types-python-dateutil" -version = "2.8.19.14" -description = "Typing stubs for python-dateutil" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-pyyaml" -version = "6.0.12.11" -description = "Typing stubs for PyYAML" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-requests" -version = "2.31.0.2" -description = "Typing stubs for requests" -category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -types-urllib3 = "*" - -[[package]] -name = "types-s3transfer" -version = "0.6.2" -description = "Type annotations and code completion for s3transfer" -category = "main" -optional = false -python-versions = ">=3.7,<4.0" - -[[package]] -name = "types-setuptools" -version = "68.1.0.1" -description = "Typing stubs for setuptools" -category = "main" -optional = false -python-versions = "*" - -[[package]] -name = "types-simplejson" -version = "3.19.0.2" -description = "Typing stubs for simplejson" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-sqlalchemy" -version = "1.4.53.38" -description = "Typing stubs for SQLAlchemy" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-tqdm" -version = "4.66.0.2" -description = "Typing stubs for tqdm" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "types-urllib3" -version = "1.26.25.14" -description = "Typing stubs for urllib3" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "typing-extensions" -version = "4.7.1" -description = "Backported and Experimental Type Hints for Python 3.7+" -category = "main" -optional = false -python-versions = ">=3.7" - -[[package]] -name = "tzdata" -version = "2023.3" -description = "Provider of IANA time zone data" -category = "main" -optional = false -python-versions = ">=2" - -[[package]] -name = "uc-micro-py" -version = "1.0.2" -description = "Micro subset of unicode data files for linkify-it-py projects." -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -test = ["coverage", "pytest", "pytest-cov"] - -[[package]] -name = "unicodecsv" -version = "0.14.1" -description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*." -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "uritemplate" -version = "4.1.1" -description = "Implementation of RFC 6570 URI Templates" -category = "dev" -optional = false -python-versions = ">=3.6" - -[[package]] -name = "urllib3" -version = "1.26.16" -description = "HTTP library with thread-safe connection pooling, file post, and more." 
-category = "main" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] -socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] - -[[package]] -name = "validators" -version = "0.21.0" -description = "Python Data Validation for Humans™" -category = "main" -optional = true -python-versions = ">=3.8,<4.0" - -[[package]] -name = "watchdog" -version = "3.0.0" -description = "Filesystem events monitoring" -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.extras] -watchmedo = ["PyYAML (>=3.10)"] - -[[package]] -name = "wcwidth" -version = "0.2.6" -description = "Measures the displayed width of unicode strings in a terminal" -category = "dev" -optional = false -python-versions = "*" - -[[package]] -name = "weaviate-client" -version = "3.23.2" -description = "A python native Weaviate client" -category = "main" -optional = true -python-versions = ">=3.8" - -[package.dependencies] -authlib = ">=1.1.0" -requests = ">=2.28.0,<=2.31.0" -tqdm = ">=4.59.0,<5.0.0" -validators = ">=0.18.2,<=0.21.0" - -[package.extras] -grpc = ["grpcio", "grpcio-tools"] - -[[package]] -name = "werkzeug" -version = "2.3.7" -description = "The comprehensive WSGI web application library." -category = "main" -optional = false -python-versions = ">=3.8" - -[package.dependencies] -MarkupSafe = ">=2.1.1" - -[package.extras] -watchdog = ["watchdog (>=2.3)"] - -[[package]] -name = "wheel" -version = "0.41.2" -description = "A built-package format for Python" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.extras] -test = ["pytest (>=6.0.0)", "setuptools (>=65)"] - -[[package]] -name = "wrapt" -version = "1.15.0" -description = "Module for decorators, wrappers and monkey patching." -category = "main" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" - -[[package]] -name = "wtforms" -version = "3.0.1" -description = "Form validation and rendering for Python web development." -category = "dev" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -MarkupSafe = "*" - -[package.extras] -email = ["email-validator"] - -[[package]] -name = "yapf" -version = "0.33.0" -description = "A formatter for Python code." 
-category = "dev" -optional = false -python-versions = "*" - -[package.dependencies] -tomli = ">=2.0.1" - -[[package]] -name = "yarl" -version = "1.9.2" -description = "Yet another URL library" -category = "main" -optional = false -python-versions = ">=3.7" - -[package.dependencies] -idna = ">=2.0" -multidict = ">=4.0" - -[[package]] -name = "zipp" -version = "3.16.2" -description = "Backport of pathlib-compatible object wrapper for zip files" -category = "main" -optional = false -python-versions = ">=3.8" - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] - -[extras] -athena = ["pyathena", "pyarrow", "s3fs", "botocore"] -az = ["adlfs"] -bigquery = ["grpcio", "google-cloud-bigquery", "pyarrow", "gcsfs"] -cli = ["pipdeptree", "cron-descriptor"] -dbt = ["dbt-core", "dbt-redshift", "dbt-bigquery", "dbt-duckdb", "dbt-snowflake", "dbt-athena-community"] -duckdb = ["duckdb"] -filesystem = ["s3fs", "botocore"] -gcp = ["grpcio", "google-cloud-bigquery", "gcsfs"] -gs = ["gcsfs"] -motherduck = ["duckdb", "pyarrow"] -mssql = ["pyodbc"] -parquet = ["pyarrow"] -postgres = ["psycopg2-binary", "psycopg2cffi"] -qdrant = ["qdrant-client"] -redshift = ["psycopg2-binary", "psycopg2cffi"] -s3 = ["s3fs", "botocore"] -snowflake = ["snowflake-connector-python"] -weaviate = ["weaviate-client"] - -[metadata] -lock-version = "1.1" -python-versions = ">=3.8.1,<3.13" -content-hash = "bbfaab078877deaa60ecf6bc95c0374e1967268ca24594a99b792b88c4ef270b" - -[metadata.files] -about-time = [ - {file = "about-time-4.2.1.tar.gz", hash = "sha256:6a538862d33ce67d997429d14998310e1dbfda6cb7d9bbfbf799c4709847fece"}, - {file = "about_time-4.2.1-py3-none-any.whl", hash = "sha256:8bbf4c75fe13cbd3d72f49a03b02c5c7dca32169b6d49117c257e7eb3eaee341"}, -] -adlfs = [ - {file = "adlfs-2023.8.0-py3-none-any.whl", hash = "sha256:3eb248a3c2a30b419f1147bd7676d156b5219f96ef7f11d47166afd2a3bdb07e"}, - {file = "adlfs-2023.8.0.tar.gz", hash = "sha256:07e804f6df4593acfcaf01025b162e30ac13e523d3570279c98b2d91a18026d9"}, -] -agate = [ - {file = "agate-1.6.3-py2.py3-none-any.whl", hash = "sha256:2d568fd68a8eb8b56c805a1299ba4bc30ca0434563be1bea309c9d1c1c8401f4"}, - {file = "agate-1.6.3.tar.gz", hash = "sha256:e0f2f813f7e12311a4cdccc97d6ba0a6781e9c1aa8eca0ab00d5931c0113a308"}, -] -aiobotocore = [ - {file = "aiobotocore-2.5.2-py3-none-any.whl", hash = "sha256:337429ffd3cc367532572d40be809a84c7b5335f3f8eca2f23e09dfaa9a9ef90"}, - {file = "aiobotocore-2.5.2.tar.gz", hash = "sha256:e7399f21570db1c287f1c0c814dd3475dfe1c8166722e2c77ce67f172cbcfa89"}, -] -aiohttp = [ - {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a94159871304770da4dd371f4291b20cac04e8c94f11bdea1c3478e557fbe0d8"}, - {file = "aiohttp-3.8.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:13bf85afc99ce6f9ee3567b04501f18f9f8dbbb2ea11ed1a2e079670403a7c84"}, - {file = "aiohttp-3.8.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ce2ac5708501afc4847221a521f7e4b245abf5178cf5ddae9d5b3856ddb2f3a"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96943e5dcc37a6529d18766597c491798b7eb7a61d48878611298afc1fca946c"}, - {file = 
"aiohttp-3.8.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ad5c3c4590bb3cc28b4382f031f3783f25ec223557124c68754a2231d989e2b"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0c413c633d0512df4dc7fd2373ec06cc6a815b7b6d6c2f208ada7e9e93a5061d"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:df72ac063b97837a80d80dec8d54c241af059cc9bb42c4de68bd5b61ceb37caa"}, - {file = "aiohttp-3.8.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c48c5c0271149cfe467c0ff8eb941279fd6e3f65c9a388c984e0e6cf57538e14"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:368a42363c4d70ab52c2c6420a57f190ed3dfaca6a1b19afda8165ee16416a82"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7607ec3ce4993464368505888af5beb446845a014bc676d349efec0e05085905"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0d21c684808288a98914e5aaf2a7c6a3179d4df11d249799c32d1808e79503b5"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:312fcfbacc7880a8da0ae8b6abc6cc7d752e9caa0051a53d217a650b25e9a691"}, - {file = "aiohttp-3.8.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad093e823df03bb3fd37e7dec9d4670c34f9e24aeace76808fc20a507cace825"}, - {file = "aiohttp-3.8.5-cp310-cp310-win32.whl", hash = "sha256:33279701c04351a2914e1100b62b2a7fdb9a25995c4a104259f9a5ead7ed4802"}, - {file = "aiohttp-3.8.5-cp310-cp310-win_amd64.whl", hash = "sha256:6e4a280e4b975a2e7745573e3fc9c9ba0d1194a3738ce1cbaa80626cc9b4f4df"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ae871a964e1987a943d83d6709d20ec6103ca1eaf52f7e0d36ee1b5bebb8b9b9"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:461908b2578955045efde733719d62f2b649c404189a09a632d245b445c9c975"}, - {file = "aiohttp-3.8.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72a860c215e26192379f57cae5ab12b168b75db8271f111019509a1196dfc780"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cc14be025665dba6202b6a71cfcdb53210cc498e50068bc088076624471f8bb9"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8af740fc2711ad85f1a5c034a435782fbd5b5f8314c9a3ef071424a8158d7f6b"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:841cd8233cbd2111a0ef0a522ce016357c5e3aff8a8ce92bcfa14cef890d698f"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ed1c46fb119f1b59304b5ec89f834f07124cd23ae5b74288e364477641060ff"}, - {file = "aiohttp-3.8.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:84f8ae3e09a34f35c18fa57f015cc394bd1389bce02503fb30c394d04ee6b938"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62360cb771707cb70a6fd114b9871d20d7dd2163a0feafe43fd115cfe4fe845e"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23fb25a9f0a1ca1f24c0a371523546366bb642397c94ab45ad3aedf2941cec6a"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:b0ba0d15164eae3d878260d4c4df859bbdc6466e9e6689c344a13334f988bb53"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = 
"sha256:5d20003b635fc6ae3f96d7260281dfaf1894fc3aa24d1888a9b2628e97c241e5"}, - {file = "aiohttp-3.8.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0175d745d9e85c40dcc51c8f88c74bfbaef9e7afeeeb9d03c37977270303064c"}, - {file = "aiohttp-3.8.5-cp311-cp311-win32.whl", hash = "sha256:2e1b1e51b0774408f091d268648e3d57f7260c1682e7d3a63cb00d22d71bb945"}, - {file = "aiohttp-3.8.5-cp311-cp311-win_amd64.whl", hash = "sha256:043d2299f6dfdc92f0ac5e995dfc56668e1587cea7f9aa9d8a78a1b6554e5755"}, - {file = "aiohttp-3.8.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:cae533195e8122584ec87531d6df000ad07737eaa3c81209e85c928854d2195c"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f21e83f355643c345177a5d1d8079f9f28b5133bcd154193b799d380331d5d3"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a7a75ef35f2df54ad55dbf4b73fe1da96f370e51b10c91f08b19603c64004acc"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e2e9839e14dd5308ee773c97115f1e0a1cb1d75cbeeee9f33824fa5144c7634"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44e65da1de4403d0576473e2344828ef9c4c6244d65cf4b75549bb46d40b8dd"}, - {file = "aiohttp-3.8.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:78d847e4cde6ecc19125ccbc9bfac4a7ab37c234dd88fbb3c5c524e8e14da543"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:c7a815258e5895d8900aec4454f38dca9aed71085f227537208057853f9d13f2"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:8b929b9bd7cd7c3939f8bcfffa92fae7480bd1aa425279d51a89327d600c704d"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:5db3a5b833764280ed7618393832e0853e40f3d3e9aa128ac0ba0f8278d08649"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:a0215ce6041d501f3155dc219712bc41252d0ab76474615b9700d63d4d9292af"}, - {file = "aiohttp-3.8.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:fd1ed388ea7fbed22c4968dd64bab0198de60750a25fe8c0c9d4bef5abe13824"}, - {file = "aiohttp-3.8.5-cp36-cp36m-win32.whl", hash = "sha256:6e6783bcc45f397fdebc118d772103d751b54cddf5b60fbcc958382d7dd64f3e"}, - {file = "aiohttp-3.8.5-cp36-cp36m-win_amd64.whl", hash = "sha256:b5411d82cddd212644cf9360879eb5080f0d5f7d809d03262c50dad02f01421a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:01d4c0c874aa4ddfb8098e85d10b5e875a70adc63db91f1ae65a4b04d3344cda"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5980a746d547a6ba173fd5ee85ce9077e72d118758db05d229044b469d9029a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2a482e6da906d5e6e653be079b29bc173a48e381600161c9932d89dfae5942ef"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:80bd372b8d0715c66c974cf57fe363621a02f359f1ec81cba97366948c7fc873"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1161b345c0a444ebcf46bf0a740ba5dcf50612fd3d0528883fdc0eff578006a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cd56db019015b6acfaaf92e1ac40eb8434847d9bf88b4be4efe5bfd260aee692"}, - {file = 
"aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:153c2549f6c004d2754cc60603d4668899c9895b8a89397444a9c4efa282aaf4"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:4a01951fabc4ce26ab791da5f3f24dca6d9a6f24121746eb19756416ff2d881b"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bfb9162dcf01f615462b995a516ba03e769de0789de1cadc0f916265c257e5d8"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:7dde0009408969a43b04c16cbbe252c4f5ef4574ac226bc8815cd7342d2028b6"}, - {file = "aiohttp-3.8.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:4149d34c32f9638f38f544b3977a4c24052042affa895352d3636fa8bffd030a"}, - {file = "aiohttp-3.8.5-cp37-cp37m-win32.whl", hash = "sha256:68c5a82c8779bdfc6367c967a4a1b2aa52cd3595388bf5961a62158ee8a59e22"}, - {file = "aiohttp-3.8.5-cp37-cp37m-win_amd64.whl", hash = "sha256:2cf57fb50be5f52bda004b8893e63b48530ed9f0d6c96c84620dc92fe3cd9b9d"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:eca4bf3734c541dc4f374ad6010a68ff6c6748f00451707f39857f429ca36ced"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1274477e4c71ce8cfe6c1ec2f806d57c015ebf84d83373676036e256bc55d690"}, - {file = "aiohttp-3.8.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:28c543e54710d6158fc6f439296c7865b29e0b616629767e685a7185fab4a6b9"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:910bec0c49637d213f5d9877105d26e0c4a4de2f8b1b29405ff37e9fc0ad52b8"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5443910d662db951b2e58eb70b0fbe6b6e2ae613477129a5805d0b66c54b6cb7"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2e460be6978fc24e3df83193dc0cc4de46c9909ed92dd47d349a452ef49325b7"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb1558def481d84f03b45888473fc5a1f35747b5f334ef4e7a571bc0dfcb11f8"}, - {file = "aiohttp-3.8.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:34dd0c107799dcbbf7d48b53be761a013c0adf5571bf50c4ecad5643fe9cfcd0"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aa1990247f02a54185dc0dff92a6904521172a22664c863a03ff64c42f9b5410"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:0e584a10f204a617d71d359fe383406305a4b595b333721fa50b867b4a0a1548"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:a3cf433f127efa43fee6b90ea4c6edf6c4a17109d1d037d1a52abec84d8f2e42"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:c11f5b099adafb18e65c2c997d57108b5bbeaa9eeee64a84302c0978b1ec948b"}, - {file = "aiohttp-3.8.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:84de26ddf621d7ac4c975dbea4c945860e08cccde492269db4e1538a6a6f3c35"}, - {file = "aiohttp-3.8.5-cp38-cp38-win32.whl", hash = "sha256:ab88bafedc57dd0aab55fa728ea10c1911f7e4d8b43e1d838a1739f33712921c"}, - {file = "aiohttp-3.8.5-cp38-cp38-win_amd64.whl", hash = "sha256:5798a9aad1879f626589f3df0f8b79b3608a92e9beab10e5fda02c8a2c60db2e"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a6ce61195c6a19c785df04e71a4537e29eaa2c50fe745b732aa937c0c77169f3"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = 
"sha256:773dd01706d4db536335fcfae6ea2440a70ceb03dd3e7378f3e815b03c97ab51"}, - {file = "aiohttp-3.8.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f83a552443a526ea38d064588613aca983d0ee0038801bc93c0c916428310c28"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f7372f7341fcc16f57b2caded43e81ddd18df53320b6f9f042acad41f8e049a"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea353162f249c8097ea63c2169dd1aa55de1e8fecbe63412a9bc50816e87b761"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e5d47ae48db0b2dcf70bc8a3bc72b3de86e2a590fc299fdbbb15af320d2659de"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d827176898a2b0b09694fbd1088c7a31836d1a505c243811c87ae53a3f6273c1"}, - {file = "aiohttp-3.8.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3562b06567c06439d8b447037bb655ef69786c590b1de86c7ab81efe1c9c15d8"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4e874cbf8caf8959d2adf572a78bba17cb0e9d7e51bb83d86a3697b686a0ab4d"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6809a00deaf3810e38c628e9a33271892f815b853605a936e2e9e5129762356c"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:33776e945d89b29251b33a7e7d006ce86447b2cfd66db5e5ded4e5cd0340585c"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:eaeed7abfb5d64c539e2db173f63631455f1196c37d9d8d873fc316470dfbacd"}, - {file = "aiohttp-3.8.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e91d635961bec2d8f19dfeb41a539eb94bd073f075ca6dae6c8dc0ee89ad6f91"}, - {file = "aiohttp-3.8.5-cp39-cp39-win32.whl", hash = "sha256:00ad4b6f185ec67f3e6562e8a1d2b69660be43070bd0ef6fcec5211154c7df67"}, - {file = "aiohttp-3.8.5-cp39-cp39-win_amd64.whl", hash = "sha256:c0a9034379a37ae42dea7ac1e048352d96286626251862e448933c0f59cbd79c"}, - {file = "aiohttp-3.8.5.tar.gz", hash = "sha256:b9552ec52cc147dbf1944ac7ac98af7602e51ea2dcd076ed194ca3c0d1c7d0bc"}, -] -aioitertools = [ - {file = "aioitertools-0.11.0-py3-none-any.whl", hash = "sha256:04b95e3dab25b449def24d7df809411c10e62aab0cbe31a50ca4e68748c43394"}, - {file = "aioitertools-0.11.0.tar.gz", hash = "sha256:42c68b8dd3a69c2bf7f2233bf7df4bb58b557bca5252ac02ed5187bbc67d6831"}, -] -aiosignal = [ - {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"}, - {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"}, -] -alembic = [ - {file = "alembic-1.12.0-py3-none-any.whl", hash = "sha256:03226222f1cf943deee6c85d9464261a6c710cd19b4fe867a3ad1f25afda610f"}, - {file = "alembic-1.12.0.tar.gz", hash = "sha256:8e7645c32e4f200675e69f0745415335eb59a3663f5feb487abfa0b30c45888b"}, -] -alive-progress = [ - {file = "alive-progress-3.1.4.tar.gz", hash = "sha256:74a95d8d0d42bc99d3a3725dbd06ebb852245f1b64e301a7c375b92b22663f7b"}, - {file = "alive_progress-3.1.4-py3-none-any.whl", hash = "sha256:c80ad87ce9c1054b01135a87fae69ecebbfc2107497ae87cbe6aec7e534903db"}, -] -annotated-types = [ - {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, - {file = "annotated_types-0.6.0.tar.gz", hash = 
"sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, -] -ansicon = [ - {file = "ansicon-1.89.0-py2.py3-none-any.whl", hash = "sha256:f1def52d17f65c2c9682cf8370c03f541f410c1752d6a14029f97318e4b9dfec"}, - {file = "ansicon-1.89.0.tar.gz", hash = "sha256:e4d039def5768a47e4afec8e89e83ec3ae5a26bf00ad851f914d1240b444d2b1"}, -] -anyio = [ - {file = "anyio-4.0.0-py3-none-any.whl", hash = "sha256:cfdb2b588b9fc25ede96d8db56ed50848b0b649dca3dd1df0b11f683bb9e0b5f"}, - {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, -] -apache-airflow = [ - {file = "apache-airflow-2.7.2.tar.gz", hash = "sha256:c6fab3449066867d9a7728f40b6b9e27f1ea68bca39b064a27f5c5ddc3262224"}, - {file = "apache_airflow-2.7.2-py3-none-any.whl", hash = "sha256:1bc2c022bcae24b911e49fafd5fb619b49efba87ed7bc8561a2065810d8fe899"}, -] -apache-airflow-providers-common-sql = [ - {file = "apache-airflow-providers-common-sql-1.7.1.tar.gz", hash = "sha256:ba37f795d9656a87cf4661edc381b8ecfe930272c59324b59f8a158fd0971aeb"}, - {file = "apache_airflow_providers_common_sql-1.7.1-py3-none-any.whl", hash = "sha256:36da2f51b51a64765b0ed5e6a5fece8eaa3ca173dfbff803e2fe2a0afbb90944"}, -] -apache-airflow-providers-ftp = [ - {file = "apache-airflow-providers-ftp-3.5.1.tar.gz", hash = "sha256:dc6dc524dc7454857a0812154d7540172e36db3a87e48a4a91918ebf80898bbf"}, - {file = "apache_airflow_providers_ftp-3.5.1-py3-none-any.whl", hash = "sha256:e4ea77d6276355acfe2392c12155db7b9d51be460b7673b616dc1d8bee03c1d7"}, -] -apache-airflow-providers-http = [ - {file = "apache-airflow-providers-http-4.5.1.tar.gz", hash = "sha256:ec90920ff980fc264af9811dc72c37ef272bcdb3d007c7114e12366559426460"}, - {file = "apache_airflow_providers_http-4.5.1-py3-none-any.whl", hash = "sha256:702f26938bc22684eefecd297c2b0809793f9e43b8d911d807a29f21e69da179"}, -] -apache-airflow-providers-imap = [ - {file = "apache-airflow-providers-imap-3.3.1.tar.gz", hash = "sha256:40bac2a75e4dfbcd7d397776d90d03938facaf2707acc6cc119a8db684e53f77"}, - {file = "apache_airflow_providers_imap-3.3.1-py3-none-any.whl", hash = "sha256:adb6ef7864a5a8e245fbbd555bb4ef1eecf5b094d6d23ca0edc5f0aded50490d"}, -] -apache-airflow-providers-sqlite = [ - {file = "apache-airflow-providers-sqlite-3.4.3.tar.gz", hash = "sha256:347d2db03eaa5ea9fef414666565ffa5e849935cbc30e37237edcaa822b5ced8"}, - {file = "apache_airflow_providers_sqlite-3.4.3-py3-none-any.whl", hash = "sha256:4ffa6a50f0ea1b4e51240b657dfec3fb026c87bdfa71af908a56461df6a6f2e0"}, -] -apispec = [ - {file = "apispec-6.3.0-py3-none-any.whl", hash = "sha256:95a0b9355785df998bb0e9b939237a30ee4c7428fd6ef97305eae3da06b9b339"}, - {file = "apispec-6.3.0.tar.gz", hash = "sha256:6cb08d92ce73ff0b3bf46cb2ea5c00d57289b0f279fb0256a3df468182ba5344"}, -] -appdirs = [ - {file = "appdirs-1.4.4-py2.py3-none-any.whl", hash = "sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128"}, - {file = "appdirs-1.4.4.tar.gz", hash = "sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41"}, -] -argcomplete = [ - {file = "argcomplete-3.1.1-py3-none-any.whl", hash = "sha256:35fa893a88deea85ea7b20d241100e64516d6af6d7b0ae2bed1d263d26f70948"}, - {file = "argcomplete-3.1.1.tar.gz", hash = "sha256:6c4c563f14f01440aaffa3eae13441c5db2357b5eec639abe7c0b15334627dff"}, -] -asgiref = [ - {file = "asgiref-3.7.2-py3-none-any.whl", hash = "sha256:89b2ef2247e3b562a16eef663bc0e2e703ec6468e2fa8a5cd61cd449786d4f6e"}, - {file = "asgiref-3.7.2.tar.gz", hash = 
"sha256:9e0ce3aa93a819ba5b45120216b23878cf6e8525eb3848653452b4192b92afed"}, -] -asn1crypto = [ - {file = "asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67"}, - {file = "asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c"}, -] -astatine = [ - {file = "astatine-0.3.3-py3-none-any.whl", hash = "sha256:6d8c914f01fbea252cb8f31563f2e766a9ab03c02b9bcc37d18f7d9138828401"}, - {file = "astatine-0.3.3.tar.gz", hash = "sha256:0c58a7844b5890ff16da07dbfeb187341d8324cb4378940f89d795cbebebce08"}, -] -asttokens = [ - {file = "asttokens-2.3.0-py2.py3-none-any.whl", hash = "sha256:bef1a51bc256d349e9f94e7e40e44b705ed1162f55294220dd561d24583d9877"}, - {file = "asttokens-2.3.0.tar.gz", hash = "sha256:2552a88626aaa7f0f299f871479fc755bd4e7c11e89078965e928fb7bb9a6afe"}, -] -astunparse = [ - {file = "astunparse-1.6.3-py2.py3-none-any.whl", hash = "sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8"}, - {file = "astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872"}, -] -async-timeout = [ - {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"}, - {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, -] -atomicwrites = [ - {file = "atomicwrites-1.4.1.tar.gz", hash = "sha256:81b2c9071a49367a7f770170e5eec8cb66567cfbbc8c73d20ce5ca4a8d71cf11"}, -] -attrs = [ - {file = "attrs-23.1.0-py3-none-any.whl", hash = "sha256:1f28b4522cdc2fb4256ac1a020c78acf9cba2c6b461ccd2c126f3aa8e8335d04"}, - {file = "attrs-23.1.0.tar.gz", hash = "sha256:6279836d581513a26f1bf235f9acd333bc9115683f14f7e8fae46c98fc50e015"}, -] -authlib = [ - {file = "Authlib-1.2.1-py2.py3-none-any.whl", hash = "sha256:c88984ea00149a90e3537c964327da930779afa4564e354edfd98410bea01911"}, - {file = "Authlib-1.2.1.tar.gz", hash = "sha256:421f7c6b468d907ca2d9afede256f068f87e34d23dd221c07d13d4c234726afb"}, -] -azure-core = [ - {file = "azure-core-1.29.3.tar.gz", hash = "sha256:c92700af982e71c8c73de9f4c20da8b3f03ce2c22d13066e4d416b4629c87903"}, - {file = "azure_core-1.29.3-py3-none-any.whl", hash = "sha256:f8b2910f92b66293d93bd00564924ad20ad48f4a1e150577cf18d1e7d4f9263c"}, -] -azure-datalake-store = [ - {file = "azure-datalake-store-0.0.53.tar.gz", hash = "sha256:05b6de62ee3f2a0a6e6941e6933b792b800c3e7f6ffce2fc324bc19875757393"}, - {file = "azure_datalake_store-0.0.53-py2.py3-none-any.whl", hash = "sha256:a30c902a6e360aa47d7f69f086b426729784e71c536f330b691647a51dc42b2b"}, -] -azure-identity = [ - {file = "azure-identity-1.14.0.zip", hash = "sha256:72441799f8c5c89bfe21026965e266672a7c5d050c2c65119ef899dd5362e2b1"}, - {file = "azure_identity-1.14.0-py3-none-any.whl", hash = "sha256:edabf0e010eb85760e1dd19424d5e8f97ba2c9caff73a16e7b30ccbdbcce369b"}, -] -azure-storage-blob = [ - {file = "azure-storage-blob-12.17.0.zip", hash = "sha256:c14b785a17050b30fc326a315bdae6bc4a078855f4f94a4c303ad74a48dc8c63"}, - {file = "azure_storage_blob-12.17.0-py3-none-any.whl", hash = "sha256:0016e0c549a80282d7b4920c03f2f4ba35c53e6e3c7dbcd2a4a8c8eb3882c1e7"}, -] -babel = [ - {file = "Babel-2.12.1-py3-none-any.whl", hash = "sha256:b4246fb7677d3b98f501a39d43396d3cafdc8eadb045f4a31be01863f655c610"}, - {file = "Babel-2.12.1.tar.gz", hash = "sha256:cc2d99999cd01d44420ae725a21c9e3711b3aadc7976d6147f622d8581963455"}, -] -backoff = [ - {file = 
"backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, - {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, -] -bandit = [ - {file = "bandit-1.7.5-py3-none-any.whl", hash = "sha256:75665181dc1e0096369112541a056c59d1c5f66f9bb74a8d686c3c362b83f549"}, - {file = "bandit-1.7.5.tar.gz", hash = "sha256:bdfc739baa03b880c2d15d0431b31c658ffc348e907fe197e54e0389dd59e11e"}, -] -beautifulsoup4 = [ - {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, - {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, -] -black = [ - {file = "black-23.9.1-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:d6bc09188020c9ac2555a498949401ab35bb6bf76d4e0f8ee251694664df6301"}, - {file = "black-23.9.1-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:13ef033794029b85dfea8032c9d3b92b42b526f1ff4bf13b2182ce4e917f5100"}, - {file = "black-23.9.1-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:75a2dc41b183d4872d3a500d2b9c9016e67ed95738a3624f4751a0cb4818fe71"}, - {file = "black-23.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13a2e4a93bb8ca74a749b6974925c27219bb3df4d42fc45e948a5d9feb5122b7"}, - {file = "black-23.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:adc3e4442eef57f99b5590b245a328aad19c99552e0bdc7f0b04db6656debd80"}, - {file = "black-23.9.1-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:8431445bf62d2a914b541da7ab3e2b4f3bc052d2ccbf157ebad18ea126efb91f"}, - {file = "black-23.9.1-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:8fc1ddcf83f996247505db6b715294eba56ea9372e107fd54963c7553f2b6dfe"}, - {file = "black-23.9.1-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:7d30ec46de88091e4316b17ae58bbbfc12b2de05e069030f6b747dfc649ad186"}, - {file = "black-23.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:031e8c69f3d3b09e1aa471a926a1eeb0b9071f80b17689a655f7885ac9325a6f"}, - {file = "black-23.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:538efb451cd50f43aba394e9ec7ad55a37598faae3348d723b59ea8e91616300"}, - {file = "black-23.9.1-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:638619a559280de0c2aa4d76f504891c9860bb8fa214267358f0a20f27c12948"}, - {file = "black-23.9.1-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:a732b82747235e0542c03bf352c126052c0fbc458d8a239a94701175b17d4855"}, - {file = "black-23.9.1-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:cf3a4d00e4cdb6734b64bf23cd4341421e8953615cba6b3670453737a72ec204"}, - {file = "black-23.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf99f3de8b3273a8317681d8194ea222f10e0133a24a7548c73ce44ea1679377"}, - {file = "black-23.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:14f04c990259576acd093871e7e9b14918eb28f1866f91968ff5524293f9c573"}, - {file = "black-23.9.1-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:c619f063c2d68f19b2d7270f4cf3192cb81c9ec5bc5ba02df91471d0b88c4c5c"}, - {file = "black-23.9.1-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:6a3b50e4b93f43b34a9d3ef00d9b6728b4a722c997c99ab09102fd5efdb88325"}, - {file = "black-23.9.1-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c46767e8df1b7beefb0899c4a95fb43058fa8500b6db144f4ff3ca38eb2f6393"}, - {file = "black-23.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:50254ebfa56aa46a9fdd5d651f9637485068a1adf42270148cd101cdf56e0ad9"}, - {file = "black-23.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:403397c033adbc45c2bd41747da1f7fc7eaa44efbee256b53842470d4ac5a70f"}, - {file = "black-23.9.1-py3-none-any.whl", hash = "sha256:6ccd59584cc834b6d127628713e4b6b968e5f79572da66284532525a042549f9"}, - {file = "black-23.9.1.tar.gz", hash = "sha256:24b6b3ff5c6d9ea08a8888f6977eae858e1f340d7260cf56d70a49823236b62d"}, -] -blessed = [ - {file = "blessed-1.20.0-py2.py3-none-any.whl", hash = "sha256:0c542922586a265e699188e52d5f5ac5ec0dd517e5a1041d90d2bbf23f906058"}, - {file = "blessed-1.20.0.tar.gz", hash = "sha256:2cdd67f8746e048f00df47a2880f4d6acbcdb399031b604e34ba8f71d5787680"}, -] -blinker = [ - {file = "blinker-1.6.2-py3-none-any.whl", hash = "sha256:c3d739772abb7bc2860abf5f2ec284223d9ad5c76da018234f6f50d6f31ab1f0"}, - {file = "blinker-1.6.2.tar.gz", hash = "sha256:4afd3de66ef3a9f8067559fb7a1cbe555c17dcbe15971b05d1b625c3e7abe213"}, -] -boto3 = [ - {file = "boto3-1.26.161-py3-none-any.whl", hash = "sha256:f66e5c9dbe7f34383bcf64fa6070771355c11a44dd75c7f1279f2f37e1c89183"}, - {file = "boto3-1.26.161.tar.gz", hash = "sha256:662731e464d14af1035f44fc6a46b0e3112ee011ac0a5ed416d205daa3e15f25"}, -] -boto3-stubs = [ - {file = "boto3-stubs-1.28.40.tar.gz", hash = "sha256:76079a82f199087319762c931f13506e02129132e80257dab0888d3da7dc11c7"}, - {file = "boto3_stubs-1.28.40-py3-none-any.whl", hash = "sha256:bd1d1cbdcbf18902a090d4a746cdecef2a7ebe31cf9a474bbe407d57eaa79a6a"}, -] -botocore = [ - {file = "botocore-1.29.161-py3-none-any.whl", hash = "sha256:b906999dd53dda2ef0ef6f7f55fcc81a4b06b9f1c8a9f65c546e0b981f959f5f"}, - {file = "botocore-1.29.161.tar.gz", hash = "sha256:a50edd715eb510343e27849f36483804aae4b871590db4d4996aa53368dcac40"}, -] -botocore-stubs = [ - {file = "botocore_stubs-1.31.40-py3-none-any.whl", hash = "sha256:aab534d7e7949cd543bc9b2fadc1a36712033cb00e6f31e2475eefe8486d19ae"}, - {file = "botocore_stubs-1.31.40.tar.gz", hash = "sha256:2001a253daf4ae2e171e6137b9982a00a7fbfc7a53449a16856dc049e7cd5214"}, -] -cachelib = [ - {file = "cachelib-0.9.0-py3-none-any.whl", hash = "sha256:811ceeb1209d2fe51cd2b62810bd1eccf70feba5c52641532498be5c675493b3"}, - {file = "cachelib-0.9.0.tar.gz", hash = "sha256:38222cc7c1b79a23606de5c2607f4925779e37cdcea1c2ad21b8bae94b5425a5"}, -] -cachetools = [ - {file = "cachetools-5.3.1-py3-none-any.whl", hash = "sha256:95ef631eeaea14ba2e36f06437f36463aac3a096799e876ee55e5cdccb102590"}, - {file = "cachetools-5.3.1.tar.gz", hash = "sha256:dce83f2d9b4e1f732a8cd44af8e8fab2dbe46201467fc98b3ef8f269092bf62b"}, -] -cattrs = [ - {file = "cattrs-23.1.2-py3-none-any.whl", hash = "sha256:b2bb14311ac17bed0d58785e5a60f022e5431aca3932e3fc5cc8ed8639de50a4"}, - {file = "cattrs-23.1.2.tar.gz", hash = "sha256:db1c821b8c537382b2c7c66678c3790091ca0275ac486c76f3c8f3920e83c657"}, -] -certifi = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, -] -cffi = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = 
"sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = 
"cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, -] -chardet = [ - {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, - {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, -] -charset-normalizer = [ - {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7c70087bfee18a42b4040bb9ec1ca15a08242cf5867c58726530bdf3945672ed"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a103b3a7069b62f5d4890ae1b8f0597618f628b286b03d4bc9195230b154bfa9"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94aea8eff76ee6d1cdacb07dd2123a68283cb5569e0250feab1240058f53b623"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:db901e2ac34c931d73054d9797383d0f8009991e723dab15109740a63e7f902a"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b0dac0ff919ba34d4df1b6131f59ce95b08b9065233446be7e459f95554c0dc8"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:193cbc708ea3aca45e7221ae58f0fd63f933753a9bfb498a3b474878f12caaad"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:09393e1b2a9461950b1c9a45d5fd251dc7c6f228acab64da1c9c0165d9c7765c"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:baacc6aee0b2ef6f3d308e197b5d7a81c0e70b06beae1f1fcacffdbd124fe0e3"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:bf420121d4c8dce6b889f0e8e4ec0ca34b7f40186203f06a946fa0276ba54029"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c04a46716adde8d927adb9457bbe39cf473e1e2c2f5d0a16ceb837e5d841ad4f"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:aaf63899c94de41fe3cf934601b0f7ccb6b428c6e4eeb80da72c58eab077b19a"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:d62e51710986674142526ab9f78663ca2b0726066ae26b78b22e0f5e571238dd"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-win32.whl", hash = "sha256:04e57ab9fbf9607b77f7d057974694b4f6b142da9ed4a199859d9d4d5c63fe96"}, - {file = "charset_normalizer-3.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:48021783bdf96e3d6de03a6e39a1171ed5bd7e8bb93fc84cc649d11490f87cea"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:4957669ef390f0e6719db3613ab3a7631e68424604a7b448f079bee145da6e09"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:46fb8c61d794b78ec7134a715a3e564aafc8f6b5e338417cb19fe9f57a5a9bf2"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f779d3ad205f108d14e99bb3859aa7dd8e9c68874617c72354d7ecaec2a054ac"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f25c229a6ba38a35ae6e25ca1264621cc25d4d38dca2942a7fce0b67a4efe918"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2efb1bd13885392adfda4614c33d3b68dee4921fd0ac1d3988f8cbb7d589e72a"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f30b48dd7fa1474554b0b0f3fdfdd4c13b5c737a3c6284d3cdc424ec0ffff3a"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:246de67b99b6851627d945db38147d1b209a899311b1305dd84916f2b88526c6"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bd9b3b31adcb054116447ea22caa61a285d92e94d710aa5ec97992ff5eb7cf3"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c2f5e83493748286002f9369f3e6607c565a6a90425a3a1fef5ae32a36d749d"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3170c9399da12c9dc66366e9d14da8bf7147e1e9d9ea566067bbce7bb74bd9c2"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7a4826ad2bd6b07ca615c74ab91f32f6c96d08f6fcc3902ceeedaec8cdc3bcd6"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:3b1613dd5aee995ec6d4c69f00378bbd07614702a315a2cf6c1d21461fe17c23"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9e608aafdb55eb9f255034709e20d5a83b6d60c054df0802fa9c9883d0a937aa"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-win32.whl", hash = "sha256:f2a1d0fd4242bd8643ce6f98927cf9c04540af6efa92323e9d3124f57727bfc1"}, - {file = "charset_normalizer-3.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:681eb3d7e02e3c3655d1b16059fbfb605ac464c834a0c629048a30fad2b27489"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c57921cda3a80d0f2b8aec7e25c8aa14479ea92b5b51b6876d975d925a2ea346"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41b25eaa7d15909cf3ac4c96088c1f266a9a93ec44f87f1d13d4a0e86c81b982"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f058f6963fd82eb143c692cecdc89e075fa0828db2e5b291070485390b2f1c9c"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7647ebdfb9682b7bb97e2a5e7cb6ae735b1c25008a70b906aecca294ee96cf4"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eef9df1eefada2c09a5e7a40991b9fc6ac6ef20b1372abd48d2794a316dc0449"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e03b8895a6990c9ab2cdcd0f2fe44088ca1c65ae592b8f795c3294af00a461c3"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:ee4006268ed33370957f55bf2e6f4d263eaf4dc3cfc473d1d90baff6ed36ce4a"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c4983bf937209c57240cff65906b18bb35e64ae872da6a0db937d7b4af845dd7"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:3bb7fda7260735efe66d5107fb7e6af6a7c04c7fce9b2514e04b7a74b06bf5dd"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:72814c01533f51d68702802d74f77ea026b5ec52793c791e2da806a3844a46c3"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:70c610f6cbe4b9fce272c407dd9d07e33e6bf7b4aa1b7ffb6f6ded8e634e3592"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-win32.whl", hash = "sha256:a401b4598e5d3f4a9a811f3daf42ee2291790c7f9d74b18d75d6e21dda98a1a1"}, - {file = "charset_normalizer-3.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:c0b21078a4b56965e2b12f247467b234734491897e99c1d51cee628da9786959"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95eb302ff792e12aba9a8b8f8474ab229a83c103d74a750ec0bd1c1eea32e669"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1a100c6d595a7f316f1b6f01d20815d916e75ff98c27a01ae817439ea7726329"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6339d047dab2780cc6220f46306628e04d9750f02f983ddb37439ca47ced7149"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4b749b9cc6ee664a3300bb3a273c1ca8068c46be705b6c31cf5d276f8628a94"}, - {file = 
"charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a38856a971c602f98472050165cea2cdc97709240373041b69030be15047691f"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f87f746ee241d30d6ed93969de31e5ffd09a2961a051e60ae6bddde9ec3583aa"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:89f1b185a01fe560bc8ae5f619e924407efca2191b56ce749ec84982fc59a32a"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1c8a2f4c69e08e89632defbfabec2feb8a8d99edc9f89ce33c4b9e36ab63037"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2f4ac36d8e2b4cc1aa71df3dd84ff8efbe3bfb97ac41242fbcfc053c67434f46"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a386ebe437176aab38c041de1260cd3ea459c6ce5263594399880bbc398225b2"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:ccd16eb18a849fd8dcb23e23380e2f0a354e8daa0c984b8a732d9cfaba3a776d"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:e6a5bf2cba5ae1bb80b154ed68a3cfa2fa00fde979a7f50d6598d3e17d9ac20c"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:45de3f87179c1823e6d9e32156fb14c1927fcc9aba21433f088fdfb555b77c10"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-win32.whl", hash = "sha256:1000fba1057b92a65daec275aec30586c3de2401ccdcd41f8a5c1e2c87078706"}, - {file = "charset_normalizer-3.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b2c760cfc7042b27ebdb4a43a4453bd829a5742503599144d54a032c5dc7e9e"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:855eafa5d5a2034b4621c74925d89c5efef61418570e5ef9b37717d9c796419c"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:203f0c8871d5a7987be20c72442488a0b8cfd0f43b7973771640fc593f56321f"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e857a2232ba53ae940d3456f7533ce6ca98b81917d47adc3c7fd55dad8fab858"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e86d77b090dbddbe78867a0275cb4df08ea195e660f1f7f13435a4649e954e5"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4fb39a81950ec280984b3a44f5bd12819953dc5fa3a7e6fa7a80db5ee853952"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2dee8e57f052ef5353cf608e0b4c871aee320dd1b87d351c28764fc0ca55f9f4"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8700f06d0ce6f128de3ccdbc1acaea1ee264d2caa9ca05daaf492fde7c2a7200"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1920d4ff15ce893210c1f0c0e9d19bfbecb7983c76b33f046c13a8ffbd570252"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c1c76a1743432b4b60ab3358c937a3fe1341c828ae6194108a94c69028247f22"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f7560358a6811e52e9c4d142d497f1a6e10103d3a6881f18d04dbce3729c0e2c"}, - {file = 
"charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:c8063cf17b19661471ecbdb3df1c84f24ad2e389e326ccaf89e3fb2484d8dd7e"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:cd6dbe0238f7743d0efe563ab46294f54f9bc8f4b9bcf57c3c666cc5bc9d1299"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1249cbbf3d3b04902ff081ffbb33ce3377fa6e4c7356f759f3cd076cc138d020"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-win32.whl", hash = "sha256:6c409c0deba34f147f77efaa67b8e4bb83d2f11c8806405f76397ae5b8c0d1c9"}, - {file = "charset_normalizer-3.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:7095f6fbfaa55defb6b733cfeb14efaae7a29f0b59d8cf213be4e7ca0b857b80"}, - {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, -] -click = [ - {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, - {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, -] -clickclick = [ - {file = "clickclick-20.10.2-py2.py3-none-any.whl", hash = "sha256:c8f33e6d9ec83f68416dd2136a7950125bd256ec39ccc9a85c6e280a16be2bb5"}, - {file = "clickclick-20.10.2.tar.gz", hash = "sha256:4efb13e62353e34c5eef7ed6582c4920b418d7dedc86d819e22ee089ba01802c"}, -] -colorama = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] -coloredlogs = [ - {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, - {file = "coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, -] -colorlog = [ - {file = "colorlog-4.8.0-py2.py3-none-any.whl", hash = "sha256:3dd15cb27e8119a24c1a7b5c93f9f3b455855e0f73993b1c25921b2f646f1dcd"}, - {file = "colorlog-4.8.0.tar.gz", hash = "sha256:59b53160c60902c405cdec28d38356e09d40686659048893e026ecbd589516b1"}, -] -configupdater = [ - {file = "ConfigUpdater-3.1.1-py2.py3-none-any.whl", hash = "sha256:805986dbeba317886c7a8d348b2e34986dc9e3128cd3761ecc35decbd372b286"}, - {file = "ConfigUpdater-3.1.1.tar.gz", hash = "sha256:46f0c74d73efa723776764b43c9739f68052495dd3d734319c1d0eb58511f15b"}, -] -connectorx = [ - {file = "connectorx-0.3.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:719750045e7c3b94c199271fbfe6aef47944768e711f27bcc606b498707e0054"}, - {file = "connectorx-0.3.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aed31b08acebeb3ebbe53c0df846c686e7c27c4242bff3a75b72cf517d070257"}, - {file = "connectorx-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:71d2c2678339fb01f89469bbe22e66e75cabcf727a52ed72d576fef5744ebc58"}, - {file = "connectorx-0.3.1-cp310-none-win_amd64.whl", hash = "sha256:92e576ef9610b59f8e5456c12d22e5b0752d0207f586df82701987657909888b"}, - {file = "connectorx-0.3.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:36c28cc59220998928e7b283eecf404e17e077dc3e525570096d0968b192cc64"}, - {file = "connectorx-0.3.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:c5173e7252f593c46787627a46561b0d949eb80ab23321e045bbf6bd5131945c"}, - {file = "connectorx-0.3.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:3c8411631750d24c12e5e296720637909b8515d5faa3b5eaf7bb86c582d02667"}, - {file = "connectorx-0.3.1-cp37-none-win_amd64.whl", hash = "sha256:0674b6389f8f2ba62155ac2f718df18f76f9de5c50d9911a5fefe7485e1c598e"}, - {file = "connectorx-0.3.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:324c5075e8aa6698db8c877cb847f0d86172784db88ac0f3e6762aa9852330f3"}, - {file = "connectorx-0.3.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:027a3880629a7b33ae0c7a80ab4fa53286957a253af2dfe34f19adfea6b79b91"}, - {file = "connectorx-0.3.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a666b967958fcf9fc0444a7b3603483ee23a2fe39f0da3d545ff199f376f7e4b"}, - {file = "connectorx-0.3.1-cp38-none-win_amd64.whl", hash = "sha256:3c5dedfd75cf44898c17cc84a1dd0ab6ed0fa54de0461f2d6aa4bcb2c2b0dc1d"}, - {file = "connectorx-0.3.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:354c4126bcd7a9efbb8879feac92e1e7b0d0712f7e98665c392af663805491f8"}, - {file = "connectorx-0.3.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3011e1f9a27fd2a7b12c6a45bc29f6e7577a27418a3f607adaf54b301ff09068"}, - {file = "connectorx-0.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f1efb6ed547acc5837c2211e3d65d22948019d1653e7b30e522a4a4bd6d25fa8"}, - {file = "connectorx-0.3.1-cp39-none-win_amd64.whl", hash = "sha256:001b473e600b6d25af83b32674f98dccf49705a59bd6df724b5ba9beb236a0e0"}, -] -connexion = [ - {file = "connexion-2.14.1-py2.py3-none-any.whl", hash = "sha256:f343717241b4c4802a694c38fee66fb1693c897fe4ea5a957fa9b3b07caf6394"}, - {file = "connexion-2.14.1.tar.gz", hash = "sha256:99aa5781e70a7b94f8ffae8cf89f309d49cdb811bbd65a8e2f2546f3b19a01e6"}, -] -cron-descriptor = [ - {file = "cron_descriptor-1.4.0.tar.gz", hash = "sha256:b6ff4e3a988d7ca04a4ab150248e9f166fb7a5c828a85090e75bcc25aa93b4dd"}, -] -croniter = [ - {file = "croniter-1.4.1-py2.py3-none-any.whl", hash = "sha256:9595da48af37ea06ec3a9f899738f1b2c1c13da3c38cea606ef7cd03ea421128"}, - {file = "croniter-1.4.1.tar.gz", hash = "sha256:1a6df60eacec3b7a0aa52a8f2ef251ae3dd2a7c7c8b9874e73e791636d55a361"}, -] -cryptography = [ - {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"}, - {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"}, - {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"}, - {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"}, - {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"}, - {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"}, - {file = 
"cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"}, - {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"}, - {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"}, - {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"}, - {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"}, -] -databind-core = [ - {file = "databind.core-4.4.0-py3-none-any.whl", hash = "sha256:3c8a4d9abc93e158af9931d8cec389ddfc0514e02aec03b397948d243db11881"}, - {file = "databind.core-4.4.0.tar.gz", hash = "sha256:715d485e934c073f819f0250bbfcaf59c1319f83427365bc7cfd4c347f87576d"}, -] -databind-json = [ - {file = "databind.json-4.4.0-py3-none-any.whl", hash = "sha256:df8874118cfba6fd0e77ec3d41a87e04e26034bd545230cab0db1fe904bf1b09"}, - {file = "databind.json-4.4.0.tar.gz", hash = "sha256:4356afdf0aeefcc053eda0888650c59cc558be2686f08a58324d675ccd023586"}, -] -dbt-athena-community = [ - {file = "dbt-athena-community-1.5.2.tar.gz", hash = "sha256:9acd333ddf33514769189a7a0b6219e13966d370098211cb1d022fa32e64671a"}, - {file = "dbt_athena_community-1.5.2-py3-none-any.whl", hash = "sha256:c9f0f8425500211a1c1deddce5aff5ed24fe08530f0ffad38e63de9c9b9f3ee6"}, -] -dbt-bigquery = [ - {file = "dbt-bigquery-1.5.6.tar.gz", hash = "sha256:4655cf2ee0acda986b80e6c5d55cae57871bef22d868dfe29d8d4a5bca98a1ba"}, - {file = "dbt_bigquery-1.5.6-py3-none-any.whl", hash = "sha256:3f37544716880cbd17b32bc0c9728a0407b5615b2cd08e1bb904a7a83c46eb6c"}, -] -dbt-core = [ - {file = "dbt-core-1.5.6.tar.gz", hash = "sha256:af3c03cd4a1fc92481362888014ca1ffed2ffef0b0e0d98463ad0f26c49ef458"}, - {file = "dbt_core-1.5.6-py3-none-any.whl", hash = "sha256:030d2179f9efbf8ccea079296d0c79278d963bb2475c0bcce9ca4bbb0d8c393c"}, -] -dbt-duckdb = [ - {file = 
"dbt-duckdb-1.5.2.tar.gz", hash = "sha256:3407216c21bf78fd128dccfcff3ec4bf260fb145e633432015bc7d0f123e8e4b"}, - {file = "dbt_duckdb-1.5.2-py3-none-any.whl", hash = "sha256:5d18254807bbc3e61daf4f360208ad886adf44b8525e1998168290fbe73a5cbb"}, -] -dbt-extractor = [ - {file = "dbt_extractor-0.4.1-cp36-abi3-macosx_10_7_x86_64.whl", hash = "sha256:4dc715bd740e418d8dc1dd418fea508e79208a24cf5ab110b0092a3cbe96bf71"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:bc9e0050e3a2f4ea9fe58e8794bc808e6709a0c688ed710fc7c5b6ef3e5623ec"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76872cdee659075d6ce2df92dc62e59a74ba571be62acab2e297ca478b49d766"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:81435841610be1b07806d72cd89b1956c6e2a84c360b9ceb3f949c62a546d569"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:7c291f9f483eae4f60dd5859097d7ba51d5cb6c4725f08973ebd18cdea89d758"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:822b1e911db230e1b9701c99896578e711232001027b518c44c32f79a46fa3f9"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:554d27741a54599c39e5c0b7dbcab77400d83f908caba284a3e960db812e5814"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a805d51a25317f53cbff951c79b9cf75421cf48e4b3e1dfb3e9e8de6d824b76c"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cad90ddc708cb4182dc16fe2c87b1f088a1679877b93e641af068eb68a25d582"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:34783d788b133f223844e280e37b3f5244f2fb60acc457aa75c2667e418d5442"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:9da211869a1220ea55c5552c1567a3ea5233a6c52fa89ca87a22465481c37bc9"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_i686.whl", hash = "sha256:7d7c47774dc051b8c18690281a55e2e3d3320e823b17e04b06bc3ff81b1874ba"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:037907a7c7ae0391045d81338ca77ddaef899a91d80f09958f09fe374594e19b"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-win32.whl", hash = "sha256:3fe8d8e28a7bd3e0884896147269ca0202ca432d8733113386bdc84c824561bf"}, - {file = "dbt_extractor-0.4.1-cp36-abi3-win_amd64.whl", hash = "sha256:35265a0ae0a250623b0c2e3308b2738dc8212e40e0aa88407849e9ea090bb312"}, - {file = "dbt_extractor-0.4.1.tar.gz", hash = "sha256:75b1c665699ec0f1ffce1ba3d776f7dfce802156f22e70a7b9c8f0b4d7e80f42"}, -] -dbt-postgres = [ - {file = "dbt-postgres-1.5.6.tar.gz", hash = "sha256:b74e471dc661819a3d4bda2d11497935661ac2e25786c8a5b7314d8241b18582"}, - {file = "dbt_postgres-1.5.6-py3-none-any.whl", hash = "sha256:bc5711c9ab0ec4b57ab814b2c4e4c973554c8374b7da94b06814ac81c91f67ef"}, -] -dbt-redshift = [ - {file = "dbt-redshift-1.5.10.tar.gz", hash = "sha256:2b9ae1a7d05349e208b0937cd7cc920ea427341ef96096021b18e4070e927f5c"}, - {file = "dbt_redshift-1.5.10-py3-none-any.whl", hash = "sha256:b7689b043535b6b0d217c2abfe924db2336beaae71f3f36ab9aa1e920d2bb2e0"}, -] -dbt-snowflake = [ - {file = "dbt-snowflake-1.5.3.tar.gz", hash = "sha256:cf42772d2c2f1e29a2a64b039c66d80a8593f52a2dd711a144d43b4175802f9a"}, - {file = "dbt_snowflake-1.5.3-py3-none-any.whl", hash 
= "sha256:8aaa939d834798e5bb10a3ba4f52fc32a53e6e5568d6c0e8b3ac644f099972ff"}, -] -decopatch = [ - {file = "decopatch-1.4.10-py2.py3-none-any.whl", hash = "sha256:e151f7f93de2b1b3fd3f3272dcc7cefd1a69f68ec1c2d8e288ecd9deb36dc5f7"}, - {file = "decopatch-1.4.10.tar.gz", hash = "sha256:957f49c93f4150182c23f8fb51d13bb3213e0f17a79e09c8cca7057598b55720"}, -] -decorator = [ - {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, - {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, -] -deprecated = [ - {file = "Deprecated-1.2.14-py2.py3-none-any.whl", hash = "sha256:6fac8b097794a90302bdbb17b9b815e732d3c4720583ff1b198499d78470466c"}, - {file = "Deprecated-1.2.14.tar.gz", hash = "sha256:e5323eb936458dccc2582dc6f9c322c852a775a27065ff2b0c4970b9d53d01b3"}, -] -diff-cover = [ - {file = "diff_cover-7.7.0-py3-none-any.whl", hash = "sha256:bf86f32ec999f9a9e79bf24969f7127ea7b4e55c3ef3cd9300feb13188c89736"}, - {file = "diff_cover-7.7.0.tar.gz", hash = "sha256:60614cf7e722cf7fb1bde497afac0b514294e1e26534449622dac4da296123fb"}, -] -dill = [ - {file = "dill-0.3.7-py3-none-any.whl", hash = "sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e"}, - {file = "dill-0.3.7.tar.gz", hash = "sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03"}, -] -dnspython = [ - {file = "dnspython-2.4.2-py3-none-any.whl", hash = "sha256:57c6fbaaeaaf39c891292012060beb141791735dbb4004798328fc2c467402d8"}, - {file = "dnspython-2.4.2.tar.gz", hash = "sha256:8dcfae8c7460a2f84b4072e26f1c9f4101ca20c071649cb7c34e8b6a93d58984"}, -] -docspec = [ - {file = "docspec-2.2.1-py3-none-any.whl", hash = "sha256:7538f750095a9688c6980ff9a4e029a823a500f64bd00b6b4bdb27951feb31cb"}, - {file = "docspec-2.2.1.tar.gz", hash = "sha256:4854e77edc0e2de40e785e57e95880f7095a05fe978f8b54cef7a269586e15ff"}, -] -docspec-python = [ - {file = "docspec_python-2.2.1-py3-none-any.whl", hash = "sha256:76ac41d35a8face35b2d766c2e8a416fb8832359785d396f0d53bcb00f178e54"}, - {file = "docspec_python-2.2.1.tar.gz", hash = "sha256:c41b850b4d6f4de30999ea6f82c9cdb9183d9bcba45559ee9173d3dab7281559"}, -] -docstring-parser = [ - {file = "docstring_parser-0.11.tar.gz", hash = "sha256:93b3f8f481c7d24e37c5d9f30293c89e2933fa209421c8abd731dd3ef0715ecb"}, -] -docutils = [ - {file = "docutils-0.20.1-py3-none-any.whl", hash = "sha256:96f387a2c5562db4476f09f13bbab2192e764cac08ebbf3a34a95d9b1e4a59d6"}, - {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, -] -domdf-python-tools = [ - {file = "domdf_python_tools-3.6.1-py3-none-any.whl", hash = "sha256:e18158460850957f18e740eb94ede56f580ddb0cb162ab9d9834ed8bbb1b6431"}, - {file = "domdf_python_tools-3.6.1.tar.gz", hash = "sha256:acc04563d23bce4d437dd08af6b9bea788328c412772a044d8ca428a7ad861be"}, -] -duckdb = [ - {file = "duckdb-0.9.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6c724e105ecd78c8d86b3c03639b24e1df982392fc836705eb007e4b1b488864"}, - {file = "duckdb-0.9.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:75f12c5a3086079fb6440122565f1762ef1a610a954f2d8081014c1dd0646e1a"}, - {file = "duckdb-0.9.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:151f5410c32f8f8fe03bf23462b9604349bc0b4bd3a51049bbf5e6a482a435e8"}, - {file = "duckdb-0.9.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9c1d066fdae22b9b711b1603541651a378017645f9fbc4adc9764b2f3c9e9e4a"}, - {file 
= "duckdb-0.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1de56d8b7bd7a7653428c1bd4b8948316df488626d27e9c388194f2e0d1428d4"}, - {file = "duckdb-0.9.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1fb6cd590b1bb4e31fde8efd25fedfbfa19a86fa72789fa5b31a71da0d95bce4"}, - {file = "duckdb-0.9.1-cp310-cp310-win32.whl", hash = "sha256:1039e073714d668cef9069bb02c2a6756c7969cedda0bff1332520c4462951c8"}, - {file = "duckdb-0.9.1-cp310-cp310-win_amd64.whl", hash = "sha256:7e6ac4c28918e1d278a89ff26fd528882aa823868ed530df69d6c8a193ae4e41"}, - {file = "duckdb-0.9.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5eb750f2ee44397a61343f32ee9d9e8c8b5d053fa27ba4185d0e31507157f130"}, - {file = "duckdb-0.9.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:aea2a46881d75dc069a242cb164642d7a4f792889010fb98210953ab7ff48849"}, - {file = "duckdb-0.9.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed3dcedfc7a9449b6d73f9a2715c730180056e0ba837123e7967be1cd3935081"}, - {file = "duckdb-0.9.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c55397bed0087ec4445b96f8d55f924680f6d40fbaa7f2e35468c54367214a5"}, - {file = "duckdb-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3261696130f1cfb955735647c93297b4a6241753fb0de26c05d96d50986c6347"}, - {file = "duckdb-0.9.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:64c04b1728e3e37cf93748829b5d1e028227deea75115bb5ead01c608ece44b1"}, - {file = "duckdb-0.9.1-cp311-cp311-win32.whl", hash = "sha256:12cf9fb441a32702e31534330a7b4d569083d46a91bf185e0c9415000a978789"}, - {file = "duckdb-0.9.1-cp311-cp311-win_amd64.whl", hash = "sha256:fdfd85575ce9540e593d5d25c9d32050bd636c27786afd7b776aae0f6432b55e"}, - {file = "duckdb-0.9.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:704700a4b469e3bb1a7e85ac12e58037daaf2b555ef64a3fe2913ffef7bd585b"}, - {file = "duckdb-0.9.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf55b303b7b1a8c2165a96e609eb30484bc47481d94a5fb1e23123e728df0a74"}, - {file = "duckdb-0.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b70e23c14746904ca5de316436e43a685eb769c67fe3dbfaacbd3cce996c5045"}, - {file = "duckdb-0.9.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:77379f7f1f8b4dc98e01f8f6f8f15a0858cf456e2385e22507f3cb93348a88f9"}, - {file = "duckdb-0.9.1-cp37-cp37m-win32.whl", hash = "sha256:92c8f738489838666cae9ef41703f8b16f660bb146970d1eba8b2c06cb3afa39"}, - {file = "duckdb-0.9.1-cp37-cp37m-win_amd64.whl", hash = "sha256:08c5484ac06ab714f745526d791141f547e2f5ac92f97a0a1b37dfbb3ea1bd13"}, - {file = "duckdb-0.9.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f66d3c07c7f6938d3277294677eb7dad75165e7c57c8dd505503fc5ef10f67ad"}, - {file = "duckdb-0.9.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c38044e5f78c0c7b58e9f937dcc6c34de17e9ca6be42f9f8f1a5a239f7a847a5"}, - {file = "duckdb-0.9.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:73bc0d715b79566b3ede00c367235cfcce67be0eddda06e17665c7a233d6854a"}, - {file = "duckdb-0.9.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d26622c3b4ea6a8328d95882059e3cc646cdc62d267d48d09e55988a3bba0165"}, - {file = "duckdb-0.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3367d10096ff2b7919cedddcf60d308d22d6e53e72ee2702f6e6ca03d361004a"}, - {file = "duckdb-0.9.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d88a119f1cb41911a22f08a6f084d061a8c864e28b9433435beb50a56b0d06bb"}, - 
{file = "duckdb-0.9.1-cp38-cp38-win32.whl", hash = "sha256:99567496e45b55c67427133dc916013e8eb20a811fc7079213f5f03b2a4f5fc0"}, - {file = "duckdb-0.9.1-cp38-cp38-win_amd64.whl", hash = "sha256:5b3da4da73422a3235c3500b3fb541ac546adb3e35642ef1119dbcd9cc7f68b8"}, - {file = "duckdb-0.9.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:eca00c0c2062c0265c6c0e78ca2f6a30611b28f3afef062036610e9fc9d4a67d"}, - {file = "duckdb-0.9.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:eb5af8e89d40fc4baab1515787ea1520a6c6cf6aa40ab9f107df6c3a75686ce1"}, - {file = "duckdb-0.9.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9fae3d4f83ebcb47995f6acad7c6d57d003a9b6f0e1b31f79a3edd6feb377443"}, - {file = "duckdb-0.9.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16b9a7efc745bc3c5d1018c3a2f58d9e6ce49c0446819a9600fdba5f78e54c47"}, - {file = "duckdb-0.9.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b0b60167f5537772e9f5af940e69dcf50e66f5247732b8bb84a493a9af6055"}, - {file = "duckdb-0.9.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4f27f5e94c47df6c4ccddf18e3277b7464eea3db07356d2c4bf033b5c88359b8"}, - {file = "duckdb-0.9.1-cp39-cp39-win32.whl", hash = "sha256:d43cd7e6f783006b59dcc5e40fcf157d21ee3d0c8dfced35278091209e9974d7"}, - {file = "duckdb-0.9.1-cp39-cp39-win_amd64.whl", hash = "sha256:e666795887d9cf1d6b6f6cbb9d487270680e5ff6205ebc54b2308151f13b8cff"}, - {file = "duckdb-0.9.1.tar.gz", hash = "sha256:603a878746015a3f2363a65eb48bcbec816261b6ee8d71eee53061117f6eef9d"}, -] -email-validator = [ - {file = "email_validator-1.3.1-py2.py3-none-any.whl", hash = "sha256:49a72f5fa6ed26be1c964f0567d931d10bf3fdeeacdf97bc26ef1cd2a44e0bda"}, - {file = "email_validator-1.3.1.tar.gz", hash = "sha256:d178c5c6fa6c6824e9b04f199cf23e79ac15756786573c190d2ad13089411ad2"}, -] -enlighten = [ - {file = "enlighten-1.11.2-py2.py3-none-any.whl", hash = "sha256:98c9eb20e022b6a57f1c8d4f17e16760780b6881e6d658c40f52d21255ea45f3"}, - {file = "enlighten-1.11.2.tar.gz", hash = "sha256:9284861dee5a272e0e1a3758cd3f3b7180b1bd1754875da76876f2a7f46ccb61"}, -] -exceptiongroup = [ - {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, - {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, -] -fastembed = [ - {file = "fastembed-0.1.1-py3-none-any.whl", hash = "sha256:131413ae52cd72f4c8cced7a675f8269dbfd1a852abade3c815e265114bcc05a"}, - {file = "fastembed-0.1.1.tar.gz", hash = "sha256:f7e524ee4f74bb8aad16be5b687d1f77f608d40e96e292c87881dc36baf8f4c7"}, -] -filelock = [ - {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, - {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"}, -] -flake8 = [ - {file = "flake8-5.0.4-py2.py3-none-any.whl", hash = "sha256:7a1cf6b73744f5806ab95e526f6f0d8c01c66d7bbe349562d22dfca20610b248"}, - {file = "flake8-5.0.4.tar.gz", hash = "sha256:6fbe320aad8d6b95cec8b8e47bc933004678dc63095be98528b7bdd2a9f510db"}, -] -flake8-bugbear = [ - {file = "flake8-bugbear-22.12.6.tar.gz", hash = "sha256:4cdb2c06e229971104443ae293e75e64c6107798229202fbe4f4091427a30ac0"}, - {file = "flake8_bugbear-22.12.6-py3-none-any.whl", hash = "sha256:b69a510634f8a9c298dfda2b18a8036455e6b19ecac4fe582e4d7a0abfa50a30"}, -] -flake8-builtins = [ - {file = "flake8-builtins-1.5.3.tar.gz", hash = 
"sha256:09998853b2405e98e61d2ff3027c47033adbdc17f9fe44ca58443d876eb00f3b"}, - {file = "flake8_builtins-1.5.3-py2.py3-none-any.whl", hash = "sha256:7706babee43879320376861897e5d1468e396a40b8918ed7bccf70e5f90b8687"}, -] -flake8-encodings = [ - {file = "flake8_encodings-0.5.0.post1-py3-none-any.whl", hash = "sha256:d2fecca0e89ba09c86e5d61cf6bdb1b337f0d74746aac67bbcf0c517b4cb6cba"}, - {file = "flake8_encodings-0.5.0.post1.tar.gz", hash = "sha256:082c0163325c85b438a8106e876283b5ed3cbfc53e68d89130d70be8be4c9977"}, -] -flake8-helper = [ - {file = "flake8_helper-0.2.1-py3-none-any.whl", hash = "sha256:9123cdf351ad32ee8a51b85036052302c478122d62fb512c0773e111b3d05241"}, - {file = "flake8_helper-0.2.1.tar.gz", hash = "sha256:479f86d1c52df8e49ff876ecd3873242699f93eeece7e6675cdca9c37c9b0a16"}, -] -flake8-tidy-imports = [ - {file = "flake8_tidy_imports-4.10.0-py3-none-any.whl", hash = "sha256:b0387fb2ea200441bd142309e716fb7b8f4b0937bdf5f8b7c0c118a5f5e2b8ed"}, - {file = "flake8_tidy_imports-4.10.0.tar.gz", hash = "sha256:bd6cf86465402d2b86903009b748d85a628e599e17b76e810c9857e3a2815173"}, -] -flask = [ - {file = "Flask-2.2.5-py3-none-any.whl", hash = "sha256:58107ed83443e86067e41eff4631b058178191a355886f8e479e347fa1285fdf"}, - {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, -] -flask-appbuilder = [ - {file = "Flask-AppBuilder-4.3.6.tar.gz", hash = "sha256:8ca9710fa7d2704747d195e11b487d45a571f40559d8399d9d5dfa42ea1f3c78"}, - {file = "Flask_AppBuilder-4.3.6-py3-none-any.whl", hash = "sha256:840480dfd43134bebf78f3c7dc909e324c2689d2d9f27aeb1880a8a25466bc8d"}, -] -flask-babel = [ - {file = "Flask-Babel-2.0.0.tar.gz", hash = "sha256:f9faf45cdb2e1a32ea2ec14403587d4295108f35017a7821a2b1acb8cfd9257d"}, - {file = "Flask_Babel-2.0.0-py3-none-any.whl", hash = "sha256:e6820a052a8d344e178cdd36dd4bb8aea09b4bda3d5f9fa9f008df2c7f2f5468"}, -] -flask-caching = [ - {file = "Flask-Caching-2.0.2.tar.gz", hash = "sha256:24b60c552d59a9605cc1b6a42c56cdb39a82a28dab4532bbedb9222ae54ecb4e"}, - {file = "Flask_Caching-2.0.2-py3-none-any.whl", hash = "sha256:19571f2570e9b8dd9dd9d2f49d7cbee69c14ebe8cc001100b1eb98c379dd80ad"}, -] -flask-jwt-extended = [ - {file = "Flask-JWT-Extended-4.5.2.tar.gz", hash = "sha256:ba56245ba43b71c8ae936784b867625dce8b9956faeedec2953222e57942fb0b"}, - {file = "Flask_JWT_Extended-4.5.2-py2.py3-none-any.whl", hash = "sha256:e0ef23d8c863746bd141046167073699e1a7b03c97169cbba70f05b8d9cd6b9e"}, -] -flask-limiter = [ - {file = "Flask-Limiter-3.5.0.tar.gz", hash = "sha256:13a3491b994c49f7cb4706587a38ca47e8162b576530472df38be68104f299c0"}, - {file = "Flask_Limiter-3.5.0-py3-none-any.whl", hash = "sha256:dbda4174f44e6cb858c6eb75e7488186f2977dd5d33d7028ba1aabf179de1bee"}, -] -flask-login = [ - {file = "Flask-Login-0.6.2.tar.gz", hash = "sha256:c0a7baa9fdc448cdd3dd6f0939df72eec5177b2f7abe6cb82fc934d29caac9c3"}, - {file = "Flask_Login-0.6.2-py3-none-any.whl", hash = "sha256:1ef79843f5eddd0f143c2cd994c1b05ac83c0401dc6234c143495af9a939613f"}, -] -flask-session = [ - {file = "Flask-Session-0.5.0.tar.gz", hash = "sha256:190875e6aebf2953c6803d42379ef3b934bc209ef8ef006f97aecb08f5aaeb86"}, - {file = "flask_session-0.5.0-py3-none-any.whl", hash = "sha256:1619bcbc16f04f64e90f8e0b17145ba5c9700090bb1294e889956c1282d58631"}, -] -flask-sqlalchemy = [ - {file = "Flask-SQLAlchemy-2.5.1.tar.gz", hash = "sha256:2bda44b43e7cacb15d4e05ff3cc1f8bc97936cc464623424102bfc2c35e95912"}, - {file = "Flask_SQLAlchemy-2.5.1-py2.py3-none-any.whl", hash = 
"sha256:f12c3d4cc5cc7fdcc148b9527ea05671718c3ea45d50c7e732cceb33f574b390"}, -] -flask-wtf = [ - {file = "Flask-WTF-1.1.1.tar.gz", hash = "sha256:41c4244e9ae626d63bed42ae4785b90667b885b1535d5a4095e1f63060d12aa9"}, - {file = "Flask_WTF-1.1.1-py3-none-any.whl", hash = "sha256:7887d6f1ebb3e17bf648647422f0944c9a469d0fcf63e3b66fb9a83037e38b2c"}, -] -flatbuffers = [ - {file = "flatbuffers-23.5.26-py2.py3-none-any.whl", hash = "sha256:c0ff356da363087b915fde4b8b45bdda73432fc17cddb3c8157472eab1422ad1"}, - {file = "flatbuffers-23.5.26.tar.gz", hash = "sha256:9ea1144cac05ce5d86e2859f431c6cd5e66cd9c78c558317c7955fb8d4c78d89"}, -] -frozenlist = [ - {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:764226ceef3125e53ea2cb275000e309c0aa5464d43bd72abd661e27fffc26ab"}, - {file = "frozenlist-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d6484756b12f40003c6128bfcc3fa9f0d49a687e171186c2d85ec82e3758c559"}, - {file = "frozenlist-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9ac08e601308e41eb533f232dbf6b7e4cea762f9f84f6357136eed926c15d12c"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d081f13b095d74b67d550de04df1c756831f3b83dc9881c38985834387487f1b"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71932b597f9895f011f47f17d6428252fc728ba2ae6024e13c3398a087c2cdea"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:981b9ab5a0a3178ff413bca62526bb784249421c24ad7381e39d67981be2c326"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e41f3de4df3e80de75845d3e743b3f1c4c8613c3997a912dbf0229fc61a8b963"}, - {file = "frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6918d49b1f90821e93069682c06ffde41829c346c66b721e65a5c62b4bab0300"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e5c8764c7829343d919cc2dfc587a8db01c4f70a4ebbc49abde5d4b158b007b"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8d0edd6b1c7fb94922bf569c9b092ee187a83f03fb1a63076e7774b60f9481a8"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:e29cda763f752553fa14c68fb2195150bfab22b352572cb36c43c47bedba70eb"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0c7c1b47859ee2cac3846fde1c1dc0f15da6cec5a0e5c72d101e0f83dcb67ff9"}, - {file = "frozenlist-1.4.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:901289d524fdd571be1c7be054f48b1f88ce8dddcbdf1ec698b27d4b8b9e5d62"}, - {file = "frozenlist-1.4.0-cp310-cp310-win32.whl", hash = "sha256:1a0848b52815006ea6596c395f87449f693dc419061cc21e970f139d466dc0a0"}, - {file = "frozenlist-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:b206646d176a007466358aa21d85cd8600a415c67c9bd15403336c331a10d956"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:de343e75f40e972bae1ef6090267f8260c1446a1695e77096db6cfa25e759a95"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ad2a9eb6d9839ae241701d0918f54c51365a51407fd80f6b8289e2dfca977cc3"}, - {file = "frozenlist-1.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bd7bd3b3830247580de99c99ea2a01416dfc3c34471ca1298bccabf86d0ff4dc"}, - {file = 
"frozenlist-1.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bdf1847068c362f16b353163391210269e4f0569a3c166bc6a9f74ccbfc7e839"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:38461d02d66de17455072c9ba981d35f1d2a73024bee7790ac2f9e361ef1cd0c"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5a32087d720c608f42caed0ef36d2b3ea61a9d09ee59a5142d6070da9041b8f"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dd65632acaf0d47608190a71bfe46b209719bf2beb59507db08ccdbe712f969b"}, - {file = "frozenlist-1.4.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261b9f5d17cac914531331ff1b1d452125bf5daa05faf73b71d935485b0c510b"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b89ac9768b82205936771f8d2eb3ce88503b1556324c9f903e7156669f521472"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:008eb8b31b3ea6896da16c38c1b136cb9fec9e249e77f6211d479db79a4eaf01"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e74b0506fa5aa5598ac6a975a12aa8928cbb58e1f5ac8360792ef15de1aa848f"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:490132667476f6781b4c9458298b0c1cddf237488abd228b0b3650e5ecba7467"}, - {file = "frozenlist-1.4.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:76d4711f6f6d08551a7e9ef28c722f4a50dd0fc204c56b4bcd95c6cc05ce6fbb"}, - {file = "frozenlist-1.4.0-cp311-cp311-win32.whl", hash = "sha256:a02eb8ab2b8f200179b5f62b59757685ae9987996ae549ccf30f983f40602431"}, - {file = "frozenlist-1.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:515e1abc578dd3b275d6a5114030b1330ba044ffba03f94091842852f806f1c1"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f0ed05f5079c708fe74bf9027e95125334b6978bf07fd5ab923e9e55e5fbb9d3"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ca265542ca427bf97aed183c1676e2a9c66942e822b14dc6e5f42e038f92a503"}, - {file = "frozenlist-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:491e014f5c43656da08958808588cc6c016847b4360e327a62cb308c791bd2d9"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17ae5cd0f333f94f2e03aaf140bb762c64783935cc764ff9c82dff626089bebf"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e78fb68cf9c1a6aa4a9a12e960a5c9dfbdb89b3695197aa7064705662515de2"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d5655a942f5f5d2c9ed93d72148226d75369b4f6952680211972a33e59b1dfdc"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c11b0746f5d946fecf750428a95f3e9ebe792c1ee3b1e96eeba145dc631a9672"}, - {file = "frozenlist-1.4.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e66d2a64d44d50d2543405fb183a21f76b3b5fd16f130f5c99187c3fb4e64919"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:88f7bc0fcca81f985f78dd0fa68d2c75abf8272b1f5c323ea4a01a4d7a614efc"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_i686.whl", hash = 
"sha256:5833593c25ac59ede40ed4de6d67eb42928cca97f26feea219f21d0ed0959b79"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:fec520865f42e5c7f050c2a79038897b1c7d1595e907a9e08e3353293ffc948e"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:b826d97e4276750beca7c8f0f1a4938892697a6bcd8ec8217b3312dad6982781"}, - {file = "frozenlist-1.4.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ceb6ec0a10c65540421e20ebd29083c50e6d1143278746a4ef6bcf6153171eb8"}, - {file = "frozenlist-1.4.0-cp38-cp38-win32.whl", hash = "sha256:2b8bcf994563466db019fab287ff390fffbfdb4f905fc77bc1c1d604b1c689cc"}, - {file = "frozenlist-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:a6c8097e01886188e5be3e6b14e94ab365f384736aa1fca6a0b9e35bd4a30bc7"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:6c38721585f285203e4b4132a352eb3daa19121a035f3182e08e437cface44bf"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a0c6da9aee33ff0b1a451e867da0c1f47408112b3391dd43133838339e410963"}, - {file = "frozenlist-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:93ea75c050c5bb3d98016b4ba2497851eadf0ac154d88a67d7a6816206f6fa7f"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f61e2dc5ad442c52b4887f1fdc112f97caeff4d9e6ebe78879364ac59f1663e1"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa384489fefeb62321b238e64c07ef48398fe80f9e1e6afeff22e140e0850eef"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:10ff5faaa22786315ef57097a279b833ecab1a0bfb07d604c9cbb1c4cdc2ed87"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:007df07a6e3eb3e33e9a1fe6a9db7af152bbd8a185f9aaa6ece10a3529e3e1c6"}, - {file = "frozenlist-1.4.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4f399d28478d1f604c2ff9119907af9726aed73680e5ed1ca634d377abb087"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:c5374b80521d3d3f2ec5572e05adc94601985cc526fb276d0c8574a6d749f1b3"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ce31ae3e19f3c902de379cf1323d90c649425b86de7bbdf82871b8a2a0615f3d"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7211ef110a9194b6042449431e08c4d80c0481e5891e58d429df5899690511c2"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:556de4430ce324c836789fa4560ca62d1591d2538b8ceb0b4f68fb7b2384a27a"}, - {file = "frozenlist-1.4.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7645a8e814a3ee34a89c4a372011dcd817964ce8cb273c8ed6119d706e9613e3"}, - {file = "frozenlist-1.4.0-cp39-cp39-win32.whl", hash = "sha256:19488c57c12d4e8095a922f328df3f179c820c212940a498623ed39160bc3c2f"}, - {file = "frozenlist-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:6221d84d463fb110bdd7619b69cb43878a11d51cbb9394ae3105d082d5199167"}, - {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, -] -fsspec = [ - {file = "fsspec-2023.6.0-py3-none-any.whl", hash = "sha256:1cbad1faef3e391fba6dc005ae9b5bdcbf43005c9167ce78c915549c352c869a"}, - {file = "fsspec-2023.6.0.tar.gz", hash = "sha256:d0b2f935446169753e7a5c5c55681c54ea91996cc67be93c39a154fb3a2742af"}, -] 
-future = [ - {file = "future-0.18.3.tar.gz", hash = "sha256:34a17436ed1e96697a86f9de3d15a3b0be01d8bc8de9c1dffd59fb8234ed5307"}, -] -gcsfs = [ - {file = "gcsfs-2023.6.0-py2.py3-none-any.whl", hash = "sha256:3b3c7d8eddd4ec1380f3b49fbb861ee1e974adb223564401f10884b6260d406f"}, - {file = "gcsfs-2023.6.0.tar.gz", hash = "sha256:30b14fccadb3b7f0d99b2cd03bd8507c40f3a9a7d05847edca571f642bedbdff"}, -] -gitdb = [ - {file = "gitdb-4.0.10-py3-none-any.whl", hash = "sha256:c286cf298426064079ed96a9e4a9d39e7f3e9bf15ba60701e95f5492f28415c7"}, - {file = "gitdb-4.0.10.tar.gz", hash = "sha256:6eb990b69df4e15bad899ea868dc46572c3f75339735663b81de79b06f17eb9a"}, -] -gitpython = [ - {file = "GitPython-3.1.34-py3-none-any.whl", hash = "sha256:5d3802b98a3bae1c2b8ae0e1ff2e4aa16bcdf02c145da34d092324f599f01395"}, - {file = "GitPython-3.1.34.tar.gz", hash = "sha256:85f7d365d1f6bf677ae51039c1ef67ca59091c7ebd5a3509aa399d4eda02d6dd"}, -] -giturlparse = [ - {file = "giturlparse-0.11.1-py2.py3-none-any.whl", hash = "sha256:6422f25c8ca563e1a3cb6b85862e48614be804cd1334e6d84be5630eb26b343f"}, - {file = "giturlparse-0.11.1.tar.gz", hash = "sha256:cdbe0c062096c69e00f08397826dddebc1f73bc15b793994579c13aafc70c990"}, -] -google-api-core = [ - {file = "google-api-core-2.11.1.tar.gz", hash = "sha256:25d29e05a0058ed5f19c61c0a78b1b53adea4d9364b464d014fbda941f6d1c9a"}, - {file = "google_api_core-2.11.1-py3-none-any.whl", hash = "sha256:d92a5a92dc36dd4f4b9ee4e55528a90e432b059f93aee6ad857f9de8cc7ae94a"}, -] -google-api-python-client = [ - {file = "google-api-python-client-2.97.0.tar.gz", hash = "sha256:48277291894876a1ca7ed4127e055e81f81e6343ced1b544a7200ae2c119dcd7"}, - {file = "google_api_python_client-2.97.0-py2.py3-none-any.whl", hash = "sha256:5215f4cd577753fc4192ccfbe0bb8b55d4bb5fd68fa6268ac5cf271b6305de31"}, -] -google-auth = [ - {file = "google-auth-2.22.0.tar.gz", hash = "sha256:164cba9af4e6e4e40c3a4f90a1a6c12ee56f14c0b4868d1ca91b32826ab334ce"}, - {file = "google_auth-2.22.0-py2.py3-none-any.whl", hash = "sha256:d61d1b40897407b574da67da1a833bdc10d5a11642566e506565d1b1a46ba873"}, -] -google-auth-httplib2 = [ - {file = "google-auth-httplib2-0.1.0.tar.gz", hash = "sha256:a07c39fd632becacd3f07718dfd6021bf396978f03ad3ce4321d060015cc30ac"}, - {file = "google_auth_httplib2-0.1.0-py2.py3-none-any.whl", hash = "sha256:31e49c36c6b5643b57e82617cb3e021e3e1d2df9da63af67252c02fa9c1f4a10"}, -] -google-auth-oauthlib = [ - {file = "google-auth-oauthlib-1.0.0.tar.gz", hash = "sha256:e375064964820b47221a7e1b7ee1fd77051b6323c3f9e3e19785f78ab67ecfc5"}, - {file = "google_auth_oauthlib-1.0.0-py2.py3-none-any.whl", hash = "sha256:95880ca704928c300f48194d1770cf5b1462835b6e49db61445a520f793fd5fb"}, -] -google-cloud-bigquery = [ - {file = "google-cloud-bigquery-3.11.4.tar.gz", hash = "sha256:697df117241a2283bcbb93b21e10badc14e51c9a90800d2a7e1a3e1c7d842974"}, - {file = "google_cloud_bigquery-3.11.4-py2.py3-none-any.whl", hash = "sha256:5fa7897743a0ed949ade25a0942fc9e7557d8fce307c6f8a76d1b604cf27f1b1"}, -] -google-cloud-core = [ - {file = "google-cloud-core-2.3.3.tar.gz", hash = "sha256:37b80273c8d7eee1ae816b3a20ae43585ea50506cb0e60f3cf5be5f87f1373cb"}, - {file = "google_cloud_core-2.3.3-py2.py3-none-any.whl", hash = "sha256:fbd11cad3e98a7e5b0343dc07cb1039a5ffd7a5bb96e1f1e27cee4bda4a90863"}, -] -google-cloud-dataproc = [ - {file = "google-cloud-dataproc-5.4.3.tar.gz", hash = "sha256:d9c77c52aa5ddf52ae657736dbfb5312402933f72bab8480fc2d2afe98697402"}, - {file = "google_cloud_dataproc-5.4.3-py2.py3-none-any.whl", hash = 
"sha256:9cfff56cb53621cdffd0a3d6b10701e886e0a8ad54891e6c223eb67c0ff753ad"}, -] -google-cloud-storage = [ - {file = "google-cloud-storage-2.10.0.tar.gz", hash = "sha256:934b31ead5f3994e5360f9ff5750982c5b6b11604dc072bc452c25965e076dc7"}, - {file = "google_cloud_storage-2.10.0-py2.py3-none-any.whl", hash = "sha256:9433cf28801671de1c80434238fb1e7e4a1ba3087470e90f70c928ea77c2b9d7"}, -] -google-crc32c = [ +files = [ {file = "google-crc32c-1.5.0.tar.gz", hash = "sha256:89284716bc6a5a415d4eaa11b1726d2d60a0cd12aadf5439828353662ede9dd7"}, {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:596d1f98fc70232fcb6590c439f43b350cb762fb5d61ce7b0e9db4539654cc13"}, {file = "google_crc32c-1.5.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:be82c3c8cfb15b30f36768797a640e800513793d6ae1724aaaafe5bf86f8f346"}, @@ -6041,7 +3116,17 @@ google-crc32c = [ {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:635f5d4dd18758a1fbd1049a8e8d2fee4ffed124462d837d1a02a0e009c3ab31"}, {file = "google_crc32c-1.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c672d99a345849301784604bfeaeba4db0c7aae50b95be04dd651fd2a7310b93"}, ] -google-re2 = [ + +[package.extras] +testing = ["pytest"] + +[[package]] +name = "google-re2" +version = "1.1" +description = "RE2 Python bindings" +optional = false +python-versions = "~=3.8" +files = [ {file = "google-re2-1.1.tar.gz", hash = "sha256:d3a9467ee52b46ac77ca928f6d0cbeaccfd92f03ca0f0f65b9df6a95184f3a1c"}, {file = "google_re2-1.1-1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:874d2e36dfa506b03d4f9c4aef1701a65304f4004c96c7edac7d8aea08fe193e"}, {file = "google_re2-1.1-1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b66eb84850afdce09aabca40bcd6f2a0e96178a1b4990d555678edb1f59bf255"}, @@ -6092,22 +3177,79 @@ google-re2 = [ {file = "google_re2-1.1-1-cp39-cp39-win32.whl", hash = "sha256:d1b751b9ab9f8e2ab2a36d72b909281ce65f328c9115a1685acae1a2d1afd7a4"}, {file = "google_re2-1.1-1-cp39-cp39-win_amd64.whl", hash = "sha256:ac775c75cec7069351d201da4e0fb0cae4c1c5ebecd08fa34e1be89740c1d80b"}, ] -google-resumable-media = [ + +[[package]] +name = "google-resumable-media" +version = "2.5.0" +description = "Utilities for Google Media Downloads and Resumable Uploads" +optional = true +python-versions = ">= 3.7" +files = [ {file = "google-resumable-media-2.5.0.tar.gz", hash = "sha256:218931e8e2b2a73a58eb354a288e03a0fd5fb1c4583261ac6e4c078666468c93"}, {file = "google_resumable_media-2.5.0-py2.py3-none-any.whl", hash = "sha256:da1bd943e2e114a56d85d6848497ebf9be6a14d3db23e9fc57581e7c3e8170ec"}, ] -googleapis-common-protos = [ + +[package.dependencies] +google-crc32c = ">=1.0,<2.0dev" + +[package.extras] +aiohttp = ["aiohttp (>=3.6.2,<4.0.0dev)"] +requests = ["requests (>=2.18.0,<3.0.0dev)"] + +[[package]] +name = "googleapis-common-protos" +version = "1.60.0" +description = "Common protobufs used in Google APIs" +optional = false +python-versions = ">=3.7" +files = [ {file = "googleapis-common-protos-1.60.0.tar.gz", hash = "sha256:e73ebb404098db405ba95d1e1ae0aa91c3e15a71da031a2eeb6b2e23e7bc3708"}, {file = "googleapis_common_protos-1.60.0-py2.py3-none-any.whl", hash = "sha256:69f9bbcc6acde92cab2db95ce30a70bd2b81d20b12eff3f1aabaffcbe8a93918"}, ] -grapheme = [ + +[package.dependencies] +grpcio = {version = ">=1.44.0,<2.0.0.dev0", optional = true, markers = "extra == \"grpc\""} +protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || 
>4.21.4,<4.21.5 || >4.21.5,<5.0.0.dev0" + +[package.extras] +grpc = ["grpcio (>=1.44.0,<2.0.0.dev0)"] + +[[package]] +name = "grapheme" +version = "0.6.0" +description = "Unicode grapheme helpers" +optional = false +python-versions = "*" +files = [ {file = "grapheme-0.6.0.tar.gz", hash = "sha256:44c2b9f21bbe77cfb05835fec230bd435954275267fea1858013b102f8603cca"}, ] -graphviz = [ + +[package.extras] +test = ["pytest", "sphinx", "sphinx-autobuild", "twine", "wheel"] + +[[package]] +name = "graphviz" +version = "0.20.1" +description = "Simple Python interface for Graphviz" +optional = false +python-versions = ">=3.7" +files = [ {file = "graphviz-0.20.1-py3-none-any.whl", hash = "sha256:587c58a223b51611c0cf461132da386edd896a029524ca61a1462b880bf97977"}, {file = "graphviz-0.20.1.zip", hash = "sha256:8c58f14adaa3b947daf26c19bc1e98c4e0702cdc31cf99153e6f06904d492bf8"}, ] -greenlet = [ + +[package.extras] +dev = ["flake8", "pep8-naming", "tox (>=3)", "twine", "wheel"] +docs = ["sphinx (>=5)", "sphinx-autodoc-typehints", "sphinx-rtd-theme"] +test = ["coverage", "mock (>=4)", "pytest (>=7)", "pytest-cov", "pytest-mock (>=3)"] + +[[package]] +name = "greenlet" +version = "2.0.2" +description = "Lightweight in-process concurrent programming" +optional = false +python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*" +files = [ {file = "greenlet-2.0.2-cp27-cp27m-macosx_10_14_x86_64.whl", hash = "sha256:bdfea8c661e80d3c1c99ad7c3ff74e6e87184895bbaca6ee8cc61209f8b9b85d"}, {file = "greenlet-2.0.2-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:9d14b83fab60d5e8abe587d51c75b252bcc21683f24699ada8fb275d7712f5a9"}, {file = "greenlet-2.0.2-cp27-cp27m-win32.whl", hash = "sha256:6c3acb79b0bfd4fe733dff8bc62695283b57949ebcca05ae5c129eb606ff2d74"}, @@ -6169,11 +3311,34 @@ greenlet = [ {file = "greenlet-2.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:db1a39669102a1d8d12b57de2bb7e2ec9066a6f2b3da35ae511ff93b01b5d564"}, {file = "greenlet-2.0.2.tar.gz", hash = "sha256:e7c8dc13af7db097bed64a051d2dd49e9f0af495c26995c00a9ee842690d34c0"}, ] -grpc-google-iam-v1 = [ + +[package.extras] +docs = ["Sphinx", "docutils (<0.18)"] +test = ["objgraph", "psutil"] + +[[package]] +name = "grpc-google-iam-v1" +version = "0.12.6" +description = "IAM API client library" +optional = true +python-versions = ">=3.7" +files = [ {file = "grpc-google-iam-v1-0.12.6.tar.gz", hash = "sha256:2bc4b8fdf22115a65d751c9317329322602c39b7c86a289c9b72d228d960ef5f"}, {file = "grpc_google_iam_v1-0.12.6-py2.py3-none-any.whl", hash = "sha256:5c10f3d8dc2d88678ab1a9b0cb5482735c5efee71e6c0cd59f872eef22913f5c"}, ] -grpcio = [ + +[package.dependencies] +googleapis-common-protos = {version = ">=1.56.0,<2.0.0dev", extras = ["grpc"]} +grpcio = ">=1.44.0,<2.0.0dev" +protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" + +[[package]] +name = "grpcio" +version = "1.57.0" +description = "HTTP/2-based RPC framework" +optional = false +python-versions = ">=3.7" +files = [ {file = "grpcio-1.57.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:092fa155b945015754bdf988be47793c377b52b88d546e45c6a9f9579ac7f7b6"}, {file = "grpcio-1.57.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:2f7349786da979a94690cc5c2b804cab4e8774a3cf59be40d037c4342c906649"}, {file = "grpcio-1.57.0-cp310-cp310-manylinux_2_17_aarch64.whl", hash = "sha256:82640e57fb86ea1d71ea9ab54f7e942502cf98a429a200b2e743d8672171734f"}, @@ -6220,11 +3385,33 @@ grpcio = [ {file = 
"grpcio-1.57.0-cp39-cp39-win_amd64.whl", hash = "sha256:20ec6fc4ad47d1b6e12deec5045ec3cd5402d9a1597f738263e98f490fe07056"}, {file = "grpcio-1.57.0.tar.gz", hash = "sha256:4b089f7ad1eb00a104078bab8015b0ed0ebcb3b589e527ab009c53893fd4e613"}, ] -grpcio-status = [ + +[package.extras] +protobuf = ["grpcio-tools (>=1.57.0)"] + +[[package]] +name = "grpcio-status" +version = "1.57.0" +description = "Status proto mapping for gRPC" +optional = true +python-versions = ">=3.6" +files = [ {file = "grpcio-status-1.57.0.tar.gz", hash = "sha256:b098da99df1eebe58337f8f78e50df990273ccacc1226fddeb47c590e3df9e02"}, {file = "grpcio_status-1.57.0-py3-none-any.whl", hash = "sha256:15d6af055914ebbc4ed17e55ebfb8e6bb17a45a57fea32e6af19978fb7844690"}, ] -grpcio-tools = [ + +[package.dependencies] +googleapis-common-protos = ">=1.5.5" +grpcio = ">=1.57.0" +protobuf = ">=4.21.6" + +[[package]] +name = "grpcio-tools" +version = "1.57.0" +description = "Protobuf code generator for gRPC" +optional = true +python-versions = ">=3.7" +files = [ {file = "grpcio-tools-1.57.0.tar.gz", hash = "sha256:2f16130d869ce27ecd623194547b649dd657333ec7e8644cc571c645781a9b85"}, {file = "grpcio_tools-1.57.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:4fb8a8468031f858381a576078924af364a08833d8f8f3237018252c4573a802"}, {file = "grpcio_tools-1.57.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:35bf0dad8a3562043345236c26d0053a856fb06c04d7da652f2ded914e508ae7"}, @@ -6271,120 +3458,472 @@ grpcio-tools = [ {file = "grpcio_tools-1.57.0-cp39-cp39-win32.whl", hash = "sha256:f717cce5093e6b6049d9ea6d12fdf3658efdb1a80772f7737db1f8510b876df6"}, {file = "grpcio_tools-1.57.0-cp39-cp39-win_amd64.whl", hash = "sha256:1c0e8a1a32973a5d59fbcc19232f925e5c48116e9411f788033a31c5ca5130b4"}, ] -gunicorn = [ + +[package.dependencies] +grpcio = ">=1.57.0" +protobuf = ">=4.21.6,<5.0dev" +setuptools = "*" + +[[package]] +name = "gunicorn" +version = "21.2.0" +description = "WSGI HTTP Server for UNIX" +optional = false +python-versions = ">=3.5" +files = [ {file = "gunicorn-21.2.0-py3-none-any.whl", hash = "sha256:3213aa5e8c24949e792bcacfc176fef362e7aac80b76c56f6b5122bf350722f0"}, {file = "gunicorn-21.2.0.tar.gz", hash = "sha256:88ec8bff1d634f98e61b9f65bc4bf3cd918a90806c6f5c48bc5603849ec81033"}, ] -h11 = [ + +[package.dependencies] +packaging = "*" + +[package.extras] +eventlet = ["eventlet (>=0.24.1)"] +gevent = ["gevent (>=1.4.0)"] +setproctitle = ["setproctitle"] +tornado = ["tornado (>=0.2)"] + +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] -h2 = [ + +[[package]] +name = "h2" +version = "4.1.0" +description = "HTTP/2 State-Machine based protocol implementation" +optional = true +python-versions = ">=3.6.1" +files = [ {file = "h2-4.1.0-py3-none-any.whl", hash = "sha256:03a46bcf682256c95b5fd9e9a99c1323584c3eec6440d379b9903d709476bc6d"}, {file = "h2-4.1.0.tar.gz", hash = "sha256:a83aca08fbe7aacb79fec788c9c0bac936343560ed9ec18b82a13a12c28d2abb"}, ] -hexbytes = [ + +[package.dependencies] +hpack = ">=4.0,<5" +hyperframe = ">=6.0,<7" + +[[package]] +name = "hexbytes" +version = "0.3.1" +description = "hexbytes: Python `bytes` subclass that decodes hex, with a readable console output" +optional = 
false +python-versions = ">=3.7, <4" +files = [ {file = "hexbytes-0.3.1-py3-none-any.whl", hash = "sha256:383595ad75026cf00abd570f44b368c6cdac0c6becfae5c39ff88829877f8a59"}, {file = "hexbytes-0.3.1.tar.gz", hash = "sha256:a3fe35c6831ee8fafd048c4c086b986075fc14fd46258fa24ecb8d65745f9a9d"}, ] -hologram = [ + +[package.extras] +dev = ["black (>=22)", "bumpversion (>=0.5.3)", "eth-utils (>=1.0.1,<3)", "flake8 (==6.0.0)", "flake8-bugbear (==23.3.23)", "hypothesis (>=3.44.24,<=6.31.6)", "ipython", "isort (>=5.10.1)", "mypy (==0.971)", "pydocstyle (>=5.0.0)", "pytest (>=7.0.0)", "pytest-watch (>=4.1.0)", "pytest-xdist (>=2.4.0)", "sphinx (>=5.0.0)", "sphinx-rtd-theme (>=1.0.0)", "towncrier (>=21,<22)", "tox (>=4.0.0)", "twine", "wheel"] +doc = ["sphinx (>=5.0.0)", "sphinx-rtd-theme (>=1.0.0)", "towncrier (>=21,<22)"] +lint = ["black (>=22)", "flake8 (==6.0.0)", "flake8-bugbear (==23.3.23)", "isort (>=5.10.1)", "mypy (==0.971)", "pydocstyle (>=5.0.0)"] +test = ["eth-utils (>=1.0.1,<3)", "hypothesis (>=3.44.24,<=6.31.6)", "pytest (>=7.0.0)", "pytest-xdist (>=2.4.0)"] + +[[package]] +name = "hologram" +version = "0.0.16" +description = "JSON schema generation from dataclasses" +optional = false +python-versions = "*" +files = [ {file = "hologram-0.0.16-py3-none-any.whl", hash = "sha256:4e56bd525336bb64a18916f871977a4125b64be8aaa750233583003333cda361"}, {file = "hologram-0.0.16.tar.gz", hash = "sha256:1c2c921b4e575361623ea0e0d0aa5aee377b1a333cc6c6a879e213ed34583e55"}, ] -hpack = [ + +[package.dependencies] +jsonschema = ">=3.0" +python-dateutil = ">=2.8,<2.9" + +[[package]] +name = "hpack" +version = "4.0.0" +description = "Pure-Python HPACK header compression" +optional = true +python-versions = ">=3.6.1" +files = [ {file = "hpack-4.0.0-py3-none-any.whl", hash = "sha256:84a076fad3dc9a9f8063ccb8041ef100867b1878b25ef0ee63847a5d53818a6c"}, {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"}, ] -httpcore = [ + +[[package]] +name = "httpcore" +version = "0.17.3" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.7" +files = [ {file = "httpcore-0.17.3-py3-none-any.whl", hash = "sha256:c2789b767ddddfa2a5782e3199b2b7f6894540b17b16ec26b2c4d8e103510b87"}, {file = "httpcore-0.17.3.tar.gz", hash = "sha256:a6f30213335e34c1ade7be6ec7c47f19f50c56db36abef1a9dfa3815b1cb3888"}, ] -httplib2 = [ + +[package.dependencies] +anyio = ">=3.0,<5.0" +certifi = "*" +h11 = ">=0.13,<0.15" +sniffio = "==1.*" + +[package.extras] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "httplib2" +version = "0.22.0" +description = "A comprehensive HTTP client library." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ {file = "httplib2-0.22.0-py3-none-any.whl", hash = "sha256:14ae0a53c1ba8f3d37e9e27cf37eabb0fb9980f435ba405d546948b009dd64dc"}, {file = "httplib2-0.22.0.tar.gz", hash = "sha256:d7a10bc5ef5ab08322488bde8c726eeee5c8618723fdb399597ec58f3d82df81"}, ] -httpx = [ + +[package.dependencies] +pyparsing = {version = ">=2.4.2,<3.0.0 || >3.0.0,<3.0.1 || >3.0.1,<3.0.2 || >3.0.2,<3.0.3 || >3.0.3,<4", markers = "python_version > \"3.0\""} + +[[package]] +name = "httpx" +version = "0.24.1" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"}, {file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"}, ] -humanfriendly = [ + +[package.dependencies] +certifi = "*" +h2 = {version = ">=3,<5", optional = true, markers = "extra == \"http2\""} +httpcore = ">=0.15.0,<0.18.0" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "humanfriendly" +version = "10.0" +description = "Human friendly output for text interfaces using Python" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, ] -humanize = [ + +[package.dependencies] +pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} + +[[package]] +name = "humanize" +version = "4.8.0" +description = "Python humanize utilities" +optional = false +python-versions = ">=3.8" +files = [ {file = "humanize-4.8.0-py3-none-any.whl", hash = "sha256:8bc9e2bb9315e61ec06bf690151ae35aeb65651ab091266941edf97c90836404"}, {file = "humanize-4.8.0.tar.gz", hash = "sha256:9783373bf1eec713a770ecaa7c2d7a7902c98398009dfa3d8a2df91eec9311e8"}, ] -hyperframe = [ + +[package.extras] +tests = ["freezegun", "pytest", "pytest-cov"] + +[[package]] +name = "hyperframe" +version = "6.0.1" +description = "HTTP/2 framing layer for Python" +optional = true +python-versions = ">=3.6.1" +files = [ {file = "hyperframe-6.0.1-py3-none-any.whl", hash = "sha256:0ec6bafd80d8ad2195c4f03aacba3a8265e57bc4cff261e802bf39970ed02a15"}, {file = "hyperframe-6.0.1.tar.gz", hash = "sha256:ae510046231dc8e9ecb1a6586f63d2347bf4c8905914aa84ba585ae85f28a914"}, ] -idna = [ + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] -importlib-metadata = [ + +[[package]] +name = "importlib-metadata" +version = "4.13.0" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.7" +files = [ {file = "importlib_metadata-4.13.0-py3-none-any.whl", hash = "sha256:8a8a81bcf996e74fee46f0d16bd3eaa382a7eb20fd82445c3ad11f4090334116"}, {file = "importlib_metadata-4.13.0.tar.gz", hash = "sha256:dd0173e8f150d6815e098fd354f6414b0f079af4644ddfe90c71e2fc6174346d"}, ] -importlib-resources = [ + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +perf = ["ipython"] +testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] + 
+[[package]] +name = "importlib-resources" +version = "6.0.1" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.8" +files = [ {file = "importlib_resources-6.0.1-py3-none-any.whl", hash = "sha256:134832a506243891221b88b4ae1213327eea96ceb4e407a00d790bb0626f45cf"}, {file = "importlib_resources-6.0.1.tar.gz", hash = "sha256:4359457e42708462b9626a04657c6208ad799ceb41e5c58c57ffa0e6a098a5d4"}, ] -inflection = [ + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + +[[package]] +name = "inflection" +version = "0.5.1" +description = "A port of Ruby on Rails inflector to Python" +optional = false +python-versions = ">=3.5" +files = [ {file = "inflection-0.5.1-py2.py3-none-any.whl", hash = "sha256:f38b2b640938a4f35ade69ac3d053042959b62a0f1076a5bbaa1b9526605a8a2"}, {file = "inflection-0.5.1.tar.gz", hash = "sha256:1a29730d366e996aaacffb2f1f1cb9593dc38e2ddd30c91250c6dde09ea9b417"}, ] -iniconfig = [ + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] -isodate = [ + +[[package]] +name = "isodate" +version = "0.6.1" +description = "An ISO 8601 date/time/duration parser and formatter" +optional = false +python-versions = "*" +files = [ {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, ] -itsdangerous = [ + +[package.dependencies] +six = "*" + +[[package]] +name = "isort" +version = "5.12.0" +description = "A Python utility / library to sort Python imports." +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "isort-5.12.0-py3-none-any.whl", hash = "sha256:f84c2818376e66cf843d497486ea8fed8700b340f308f076c6fb1229dff318b6"}, + {file = "isort-5.12.0.tar.gz", hash = "sha256:8bef7dde241278824a6d83f44a544709b065191b95b6e50894bdc722fcba0504"}, +] + +[package.extras] +colors = ["colorama (>=0.4.3)"] +pipfile-deprecated-finder = ["pip-shims (>=0.5.2)", "pipreqs", "requirementslib"] +plugins = ["setuptools"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] + +[[package]] +name = "itsdangerous" +version = "2.1.2" +description = "Safely pass data to untrusted environments and back." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, ] -jaraco-classes = [ + +[[package]] +name = "jaraco-classes" +version = "3.3.0" +description = "Utility functions for Python class constructs" +optional = true +python-versions = ">=3.8" +files = [ {file = "jaraco.classes-3.3.0-py3-none-any.whl", hash = "sha256:10afa92b6743f25c0cf5f37c6bb6e18e2c5bb84a16527ccfc0040ea377e7aaeb"}, {file = "jaraco.classes-3.3.0.tar.gz", hash = "sha256:c063dd08e89217cee02c8d5e5ec560f2c8ce6cdc2fcdc2e68f7b2e5547ed3621"}, ] -jeepney = [ + +[package.dependencies] +more-itertools = "*" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + +[[package]] +name = "jeepney" +version = "0.8.0" +description = "Low-level, pure Python DBus protocol wrapper." +optional = true +python-versions = ">=3.7" +files = [ {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"}, {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"}, ] -jinja2 = [ + +[package.extras] +test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] +trio = ["async_generator", "trio"] + +[[package]] +name = "jinja2" +version = "3.1.2" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +files = [ {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, ] -jinxed = [ + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "jinxed" +version = "1.2.0" +description = "Jinxed Terminal Library" +optional = false +python-versions = "*" +files = [ {file = "jinxed-1.2.0-py2.py3-none-any.whl", hash = "sha256:cfc2b2e4e3b4326954d546ba6d6b9a7a796ddcb0aef8d03161d005177eb0d48b"}, {file = "jinxed-1.2.0.tar.gz", hash = "sha256:032acda92d5c57cd216033cbbd53de731e6ed50deb63eb4781336ca55f72cda5"}, ] -jmespath = [ + +[package.dependencies] +ansicon = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +optional = true +python-versions = ">=3.7" +files = [ {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, ] -jsonpath-ng = [ + +[[package]] +name = "jsonpath-ng" +version = "1.5.3" +description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming." 
+optional = false +python-versions = "*" +files = [ {file = "jsonpath-ng-1.5.3.tar.gz", hash = "sha256:a273b182a82c1256daab86a313b937059261b5c5f8c4fa3fc38b882b344dd567"}, {file = "jsonpath_ng-1.5.3-py2-none-any.whl", hash = "sha256:f75b95dbecb8a0f3b86fd2ead21c2b022c3f5770957492b9b6196ecccfeb10aa"}, {file = "jsonpath_ng-1.5.3-py3-none-any.whl", hash = "sha256:292a93569d74029ba75ac2dc3d3630fc0e17b2df26119a165fa1d498ca47bf65"}, ] -jsonschema = [ + +[package.dependencies] +decorator = "*" +ply = "*" +six = "*" + +[[package]] +name = "jsonschema" +version = "4.19.0" +description = "An implementation of JSON Schema validation for Python" +optional = false +python-versions = ">=3.8" +files = [ {file = "jsonschema-4.19.0-py3-none-any.whl", hash = "sha256:043dc26a3845ff09d20e4420d6012a9c91c9aa8999fa184e7efcfeccb41e32cb"}, {file = "jsonschema-4.19.0.tar.gz", hash = "sha256:6e1e7569ac13be8139b2dd2c21a55d350066ee3f80df06c608b398cdc6f30e8f"}, ] -jsonschema-specifications = [ + +[package.dependencies] +attrs = ">=22.2.0" +importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} +jsonschema-specifications = ">=2023.03.6" +pkgutil-resolve-name = {version = ">=1.3.10", markers = "python_version < \"3.9\""} +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=1.11)"] + +[[package]] +name = "jsonschema-specifications" +version = "2023.7.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +optional = false +python-versions = ">=3.8" +files = [ {file = "jsonschema_specifications-2023.7.1-py3-none-any.whl", hash = "sha256:05adf340b659828a004220a9613be00fa3f223f2b82002e273dee62fd50524b1"}, {file = "jsonschema_specifications-2023.7.1.tar.gz", hash = "sha256:c91a50404e88a1f6ba40636778e2ee08f6e24c5613fe4c53ac24578a5a7f72bb"}, ] -keyring = [ + +[package.dependencies] +importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} +referencing = ">=0.28.0" + +[[package]] +name = "keyring" +version = "24.2.0" +description = "Store and access your passwords safely." 
+optional = true +python-versions = ">=3.8" +files = [ {file = "keyring-24.2.0-py3-none-any.whl", hash = "sha256:4901caaf597bfd3bbd78c9a0c7c4c29fcd8310dab2cffefe749e916b6527acd6"}, {file = "keyring-24.2.0.tar.gz", hash = "sha256:ca0746a19ec421219f4d713f848fa297a661a8a8c1504867e55bfb5e09091509"}, ] -lazy-object-proxy = [ + +[package.dependencies] +importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""} +importlib-resources = {version = "*", markers = "python_version < \"3.9\""} +"jaraco.classes" = "*" +jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} +pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""} +SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} + +[package.extras] +completion = ["shtab"] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + +[[package]] +name = "lazy-object-proxy" +version = "1.9.0" +description = "A fast and thorough lazy object proxy." +optional = false +python-versions = ">=3.7" +files = [ {file = "lazy-object-proxy-1.9.0.tar.gz", hash = "sha256:659fb5809fa4629b8a1ac5106f669cfc7bef26fbb389dda53b3e010d1ac4ebae"}, {file = "lazy_object_proxy-1.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b40387277b0ed2d0602b8293b94d7257e17d1479e257b4de114ea11a8cb7f2d7"}, {file = "lazy_object_proxy-1.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8c6cfb338b133fbdbc5cfaa10fe3c6aeea827db80c978dbd13bc9dd8526b7d4"}, @@ -6422,23 +3961,88 @@ lazy-object-proxy = [ {file = "lazy_object_proxy-1.9.0-cp39-cp39-win32.whl", hash = "sha256:9090d8e53235aa280fc9239a86ae3ea8ac58eff66a705fa6aa2ec4968b95c821"}, {file = "lazy_object_proxy-1.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:db1c1722726f47e10e0b5fdbf15ac3b8adb58c091d12b3ab713965795036985f"}, ] -leather = [ + +[[package]] +name = "leather" +version = "0.3.4" +description = "Python charting for 80% of humans." 
+optional = false +python-versions = "*" +files = [ {file = "leather-0.3.4-py2.py3-none-any.whl", hash = "sha256:5e741daee96e9f1e9e06081b8c8a10c4ac199301a0564cdd99b09df15b4603d2"}, {file = "leather-0.3.4.tar.gz", hash = "sha256:b43e21c8fa46b2679de8449f4d953c06418666dc058ce41055ee8a8d3bb40918"}, ] -limits = [ + +[package.dependencies] +six = ">=1.6.1" + +[[package]] +name = "limits" +version = "3.6.0" +description = "Rate limiting utilities" +optional = false +python-versions = ">=3.7" +files = [ {file = "limits-3.6.0-py3-none-any.whl", hash = "sha256:32fe29a398352c71bc43d53773117d47e22c5ea4200aef28d3f5fdee10334cd7"}, {file = "limits-3.6.0.tar.gz", hash = "sha256:57a9c69fd37ad1e4fa3886dff8d035227e1f6af87f47e9118627e72cf1ced3bf"}, ] -linkify-it-py = [ + +[package.dependencies] +deprecated = ">=1.2" +importlib-resources = ">=1.3" +packaging = ">=21,<24" +typing-extensions = "*" + +[package.extras] +all = ["aetcd", "coredis (>=3.4.0,<5)", "emcache (>=0.6.1)", "emcache (>=1)", "etcd3", "motor (>=3,<4)", "pymemcache (>3,<5.0.0)", "pymongo (>4.1,<5)", "redis (>3,!=4.5.2,!=4.5.3,<6.0.0)", "redis (>=4.2.0,!=4.5.2,!=4.5.3)"] +async-etcd = ["aetcd"] +async-memcached = ["emcache (>=0.6.1)", "emcache (>=1)"] +async-mongodb = ["motor (>=3,<4)"] +async-redis = ["coredis (>=3.4.0,<5)"] +etcd = ["etcd3"] +memcached = ["pymemcache (>3,<5.0.0)"] +mongodb = ["pymongo (>4.1,<5)"] +redis = ["redis (>3,!=4.5.2,!=4.5.3,<6.0.0)"] +rediscluster = ["redis (>=4.2.0,!=4.5.2,!=4.5.3)"] + +[[package]] +name = "linkify-it-py" +version = "2.0.2" +description = "Links recognition library with FULL unicode support." +optional = false +python-versions = ">=3.7" +files = [ {file = "linkify-it-py-2.0.2.tar.gz", hash = "sha256:19f3060727842c254c808e99d465c80c49d2c7306788140987a1a7a29b0d6ad2"}, {file = "linkify_it_py-2.0.2-py3-none-any.whl", hash = "sha256:a3a24428f6c96f27370d7fe61d2ac0be09017be5190d68d8658233171f1b6541"}, ] -lockfile = [ + +[package.dependencies] +uc-micro-py = "*" + +[package.extras] +benchmark = ["pytest", "pytest-benchmark"] +dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] +doc = ["myst-parser", "sphinx", "sphinx-book-theme"] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "lockfile" +version = "0.12.2" +description = "Platform-independent file locking module" +optional = false +python-versions = "*" +files = [ {file = "lockfile-0.12.2-py2.py3-none-any.whl", hash = "sha256:6c3cb24f344923d30b2785d5ad75182c8ea7ac1b6171b08657258ec7429d50fa"}, {file = "lockfile-0.12.2.tar.gz", hash = "sha256:6aed02de03cba24efabcd600b30540140634fc06cfa603822d508d5361e9f799"}, ] -logbook = [ + +[[package]] +name = "logbook" +version = "1.5.3" +description = "A logging replacement for Python" +optional = false +python-versions = "*" +files = [ {file = "Logbook-1.5.3-cp27-cp27m-win32.whl", hash = "sha256:56ee54c11df3377314cedcd6507638f015b4b88c0238c2e01b5eb44fd3a6ad1b"}, {file = "Logbook-1.5.3-cp27-cp27m-win_amd64.whl", hash = "sha256:2dc85f1510533fddb481e97677bb7bca913560862734c0b3b289bfed04f78c92"}, {file = "Logbook-1.5.3-cp35-cp35m-win32.whl", hash = "sha256:94e2e11ff3c2304b0d09a36c6208e5ae756eb948b210e5cbd63cd8d27f911542"}, @@ -6449,17 +4053,32 @@ logbook = [ {file = "Logbook-1.5.3-cp37-cp37m-win_amd64.whl", hash = "sha256:0cf2cdbfb65a03b5987d19109dacad13417809dcf697f66e1a7084fb21744ea9"}, {file = "Logbook-1.5.3.tar.gz", hash = "sha256:66f454ada0f56eae43066f604a222b09893f98c1adc18df169710761b8f32fe8"}, ] -lxml = [ + +[package.extras] +all = ["Jinja2", "brotli", "cython", "execnet 
(>=1.0.9)", "mock", "pytest", "pytest-cov (<2.6)", "pyzmq", "redis", "sqlalchemy"] +compression = ["brotli"] +dev = ["cython", "mock", "pytest", "pytest-cov (<2.6)"] +execnet = ["execnet (>=1.0.9)"] +jinja = ["Jinja2"] +redis = ["redis"] +sqlalchemy = ["sqlalchemy"] +test = ["mock", "pytest", "pytest-cov (<2.6)"] +zmq = ["pyzmq"] + +[[package]] +name = "lxml" +version = "4.9.3" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" +files = [ {file = "lxml-4.9.3-cp27-cp27m-macosx_11_0_x86_64.whl", hash = "sha256:b0a545b46b526d418eb91754565ba5b63b1c0b12f9bd2f808c852d9b4b2f9b5c"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:075b731ddd9e7f68ad24c635374211376aa05a281673ede86cbe1d1b3455279d"}, {file = "lxml-4.9.3-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1e224d5755dba2f4a9498e150c43792392ac9b5380aa1b845f98a1618c94eeef"}, - {file = "lxml-4.9.3-cp27-cp27m-win32.whl", hash = "sha256:2c74524e179f2ad6d2a4f7caf70e2d96639c0954c943ad601a9e146c76408ed7"}, - {file = "lxml-4.9.3-cp27-cp27m-win_amd64.whl", hash = "sha256:4f1026bc732b6a7f96369f7bfe1a4f2290fb34dce00d8644bc3036fb351a4ca1"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c0781a98ff5e6586926293e59480b64ddd46282953203c76ae15dbbbf302e8bb"}, {file = "lxml-4.9.3-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cef2502e7e8a96fe5ad686d60b49e1ab03e438bd9123987994528febd569868e"}, {file = "lxml-4.9.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:b86164d2cff4d3aaa1f04a14685cbc072efd0b4f99ca5708b2ad1b9b5988a991"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:42871176e7896d5d45138f6d28751053c711ed4d48d8e30b498da155af39aebd"}, - {file = "lxml-4.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ae8b9c6deb1e634ba4f1930eb67ef6e6bf6a44b6eb5ad605642b2d6d5ed9ce3c"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:411007c0d88188d9f621b11d252cce90c4a2d1a49db6c068e3c16422f306eab8"}, {file = "lxml-4.9.3-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:cd47b4a0d41d2afa3e58e5bf1f62069255aa2fd6ff5ee41604418ca925911d76"}, {file = "lxml-4.9.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0e2cb47860da1f7e9a5256254b74ae331687b9672dfa780eed355c4c9c3dbd23"}, @@ -6468,7 +4087,6 @@ lxml = [ {file = "lxml-4.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:97047f0d25cd4bcae81f9ec9dc290ca3e15927c192df17331b53bebe0e3ff96d"}, {file = "lxml-4.9.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:1f447ea5429b54f9582d4b955f5f1985f278ce5cf169f72eea8afd9502973dd5"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:57d6ba0ca2b0c462f339640d22882acc711de224d769edf29962b09f77129cbf"}, - {file = "lxml-4.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:9767e79108424fb6c3edf8f81e6730666a50feb01a328f4a016464a5893f835a"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:71c52db65e4b56b8ddc5bb89fb2e66c558ed9d1a74a45ceb7dcb20c191c3df2f"}, {file = "lxml-4.9.3-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:d73d8ecf8ecf10a3bd007f2192725a34bd62898e8da27eb9d32a58084f93962b"}, {file = 
"lxml-4.9.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0a3d3487f07c1d7f150894c238299934a2a074ef590b583103a45002035be120"}, @@ -6488,7 +4106,6 @@ lxml = [ {file = "lxml-4.9.3-cp36-cp36m-macosx_11_0_x86_64.whl", hash = "sha256:64f479d719dc9f4c813ad9bb6b28f8390360660b73b2e4beb4cb0ae7104f1c12"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:dd708cf4ee4408cf46a48b108fb9427bfa00b9b85812a9262b5c668af2533ea5"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c31c7462abdf8f2ac0577d9f05279727e698f97ecbb02f17939ea99ae8daa98"}, - {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e3cd95e10c2610c360154afdc2f1480aea394f4a4f1ea0a5eacce49640c9b190"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:4930be26af26ac545c3dffb662521d4e6268352866956672231887d18f0eaab2"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4aec80cde9197340bc353d2768e2a75f5f60bacda2bab72ab1dc499589b3878c"}, {file = "lxml-4.9.3-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:14e019fd83b831b2e61baed40cab76222139926b1fb5ed0e79225bc0cae14584"}, @@ -6498,7 +4115,6 @@ lxml = [ {file = "lxml-4.9.3-cp36-cp36m-win_amd64.whl", hash = "sha256:bef4e656f7d98aaa3486d2627e7d2df1157d7e88e7efd43a65aa5dd4714916cf"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:46f409a2d60f634fe550f7133ed30ad5321ae2e6630f13657fb9479506b00601"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4c28a9144688aef80d6ea666c809b4b0e50010a2aca784c97f5e6bf143d9f129"}, - {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:141f1d1a9b663c679dc524af3ea1773e618907e96075262726c7612c02b149a4"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:53ace1c1fd5a74ef662f844a0413446c0629d151055340e9893da958a374f70d"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:17a753023436a18e27dd7769e798ce302963c236bc4114ceee5b25c18c52c693"}, {file = "lxml-4.9.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:7d298a1bd60c067ea75d9f684f5f3992c9d6766fadbc0bcedd39750bf344c2f4"}, @@ -6508,7 +4124,6 @@ lxml = [ {file = "lxml-4.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:120fa9349a24c7043854c53cae8cec227e1f79195a7493e09e0c12e29f918e52"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4d2d1edbca80b510443f51afd8496be95529db04a509bc8faee49c7b0fb6d2cc"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:8d7e43bd40f65f7d97ad8ef5c9b1778943d02f04febef12def25f7583d19baac"}, - {file = "lxml-4.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:71d66ee82e7417828af6ecd7db817913cb0cf9d4e61aa0ac1fde0583d84358db"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:6fc3c450eaa0b56f815c7b62f2b7fba7266c4779adcf1cece9e6deb1de7305ce"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:65299ea57d82fb91c7f019300d24050c4ddeb7c5a190e076b5f48a2b43d19c42"}, {file = "lxml-4.9.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:eadfbbbfb41b44034a4c757fd5d70baccd43296fb894dba0295606a7cf3124aa"}, @@ -6518,7 +4133,6 @@ lxml = [ {file = "lxml-4.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:92af161ecbdb2883c4593d5ed4815ea71b31fafd7fd05789b23100d081ecac96"}, {file = "lxml-4.9.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:9bb6ad405121241e99a86efff22d3ef469024ce22875a7ae045896ad23ba2340"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8ed74706b26ad100433da4b9d807eae371efaa266ffc3e9191ea436087a9d6a7"}, - {file = "lxml-4.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:fbf521479bcac1e25a663df882c46a641a9bff6b56dc8b0fafaebd2f66fb231b"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:303bf1edce6ced16bf67a18a1cf8339d0db79577eec5d9a6d4a80f0fb10aa2da"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:5515edd2a6d1a5a70bfcdee23b42ec33425e405c5b351478ab7dc9347228f96e"}, {file = "lxml-4.9.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:690dafd0b187ed38583a648076865d8c229661ed20e48f2335d68e2cf7dc829d"}, @@ -6529,37 +4143,103 @@ lxml = [ {file = "lxml-4.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:4dd9a263e845a72eacb60d12401e37c616438ea2e5442885f65082c276dfb2b2"}, {file = "lxml-4.9.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6689a3d7fd13dc687e9102a27e98ef33730ac4fe37795d5036d18b4d527abd35"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:f6bdac493b949141b733c5345b6ba8f87a226029cbabc7e9e121a413e49441e0"}, - {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:05186a0f1346ae12553d66df1cfce6f251589fea3ad3da4f3ef4e34b2d58c6a3"}, {file = "lxml-4.9.3-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c2006f5c8d28dee289f7020f721354362fa304acbaaf9745751ac4006650254b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-macosx_11_0_x86_64.whl", hash = "sha256:5c245b783db29c4e4fbbbfc9c5a78be496c9fea25517f90606aa1f6b2b3d5f7b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:4fb960a632a49f2f089d522f70496640fdf1218f1243889da3822e0a9f5f3ba7"}, - {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:50670615eaf97227d5dc60de2dc99fb134a7130d310d783314e7724bf163f75d"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9719fe17307a9e814580af1f5c6e05ca593b12fb7e44fe62450a5384dbf61b4b"}, {file = "lxml-4.9.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3331bece23c9ee066e0fb3f96c61322b9e0f54d775fccefff4c38ca488de283a"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-macosx_11_0_x86_64.whl", hash = "sha256:ed667f49b11360951e201453fc3967344d0d0263aa415e1619e85ae7fd17b4e0"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_24_i686.whl", hash = "sha256:8b77946fd508cbf0fccd8e400a7f71d4ac0e1595812e66025bac475a8e811694"}, - {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e4da8ca0c0c0aea88fd46be8e44bd49716772358d648cce45fe387f7b92374a7"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fe4bda6bd4340caa6e5cf95e73f8fea5c4bfc55763dd42f1b50a94c1b4a2fbd4"}, {file = "lxml-4.9.3-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:f3df3db1d336b9356dd3112eae5f5c2b8b377f3bc826848567f10bfddfee77e9"}, {file = "lxml-4.9.3.tar.gz", hash = "sha256:48628bd53a426c9eb9bc066a923acaa0878d1e86129fd5359aee99285f4eed9c"}, ] -makefun = [ + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=0.29.35)"] + +[[package]] +name = "makefun" +version = "1.15.1" +description = "Small library to dynamically create python functions." +optional = false +python-versions = "*" +files = [ {file = "makefun-1.15.1-py2.py3-none-any.whl", hash = "sha256:a63cfc7b47a539c76d97bd4fdb833c7d0461e759fd1225f580cb4be6200294d4"}, {file = "makefun-1.15.1.tar.gz", hash = "sha256:40b0f118b6ded0d8d78c78f1eb679b8b6b2462e3c1b3e05fb1b2da8cd46b48a5"}, ] -mako = [ + +[[package]] +name = "mako" +version = "1.2.4" +description = "A super-fast templating language that borrows the best ideas from the existing templating languages." +optional = false +python-versions = ">=3.7" +files = [ {file = "Mako-1.2.4-py3-none-any.whl", hash = "sha256:c97c79c018b9165ac9922ae4f32da095ffd3c4e6872b45eded42926deea46818"}, {file = "Mako-1.2.4.tar.gz", hash = "sha256:d60a3903dc3bb01a18ad6a89cdbe2e4eadc69c0bc8ef1e3773ba53d44c3f7a34"}, ] -markdown = [ + +[package.dependencies] +MarkupSafe = ">=0.9.2" + +[package.extras] +babel = ["Babel"] +lingua = ["lingua"] +testing = ["pytest"] + +[[package]] +name = "markdown" +version = "3.4.4" +description = "Python implementation of John Gruber's Markdown." +optional = false +python-versions = ">=3.7" +files = [ {file = "Markdown-3.4.4-py3-none-any.whl", hash = "sha256:a4c1b65c0957b4bd9e7d86ddc7b3c9868fb9670660f6f99f6d1bca8954d5a941"}, {file = "Markdown-3.4.4.tar.gz", hash = "sha256:225c6123522495d4119a90b3a3ba31a1e87a70369e03f14799ea9c0d7183a3d6"}, ] -markdown-it-py = [ + +[package.dependencies] +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.0)", "mkdocs-nature (>=0.4)"] +testing = ["coverage", "pyyaml"] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +files = [ {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, ] -markupsafe = [ + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "markupsafe" +version = "2.1.3" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa"}, {file = "MarkupSafe-2.1.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57"}, {file = "MarkupSafe-2.1.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f"}, @@ -6611,54 +4291,215 @@ markupsafe = [ {file = "MarkupSafe-2.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba"}, {file = "MarkupSafe-2.1.3.tar.gz", hash = "sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad"}, ] -marshmallow = [ + +[[package]] +name = "marshmallow" +version = "3.20.1" +description = "A lightweight library for converting complex datatypes to and from native Python datatypes." +optional = false +python-versions = ">=3.8" +files = [ {file = "marshmallow-3.20.1-py3-none-any.whl", hash = "sha256:684939db93e80ad3561392f47be0230743131560a41c5110684c16e21ade0a5c"}, {file = "marshmallow-3.20.1.tar.gz", hash = "sha256:5d2371bbe42000f2b3fb5eaa065224df7d8f8597bc19a1bbfa5bfe7fba8da889"}, ] -marshmallow-oneofschema = [ + +[package.dependencies] +packaging = ">=17.0" + +[package.extras] +dev = ["flake8 (==6.0.0)", "flake8-bugbear (==23.7.10)", "mypy (==1.4.1)", "pre-commit (>=2.4,<4.0)", "pytest", "pytz", "simplejson", "tox"] +docs = ["alabaster (==0.7.13)", "autodocsumm (==0.2.11)", "sphinx (==7.0.1)", "sphinx-issues (==3.0.1)", "sphinx-version-warning (==1.1.2)"] +lint = ["flake8 (==6.0.0)", "flake8-bugbear (==23.7.10)", "mypy (==1.4.1)", "pre-commit (>=2.4,<4.0)"] +tests = ["pytest", "pytz", "simplejson"] + +[[package]] +name = "marshmallow-oneofschema" +version = "3.0.1" +description = "marshmallow multiplexing schema" +optional = false +python-versions = ">=3.6" +files = [ {file = "marshmallow-oneofschema-3.0.1.tar.gz", hash = "sha256:62cd2099b29188c92493c2940ee79d1bf2f2619a71721664e5a98ec2faa58237"}, {file = "marshmallow_oneofschema-3.0.1-py2.py3-none-any.whl", hash = "sha256:bd29410a9f2f7457a2b428286e2a80ef76b8ddc3701527dc1f935a88914b02f2"}, ] -marshmallow-sqlalchemy = [ + +[package.dependencies] +marshmallow = ">=3.0.0,<4.0.0" + +[package.extras] +dev = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "mock", "pre-commit (>=2.7,<3.0)", "pytest", "tox"] +lint = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "pre-commit (>=2.7,<3.0)"] +tests = ["mock", "pytest"] + +[[package]] +name = "marshmallow-sqlalchemy" +version = "0.26.1" +description = "SQLAlchemy integration with the marshmallow (de)serialization library" +optional = false +python-versions = ">=3.6" +files = [ {file = "marshmallow-sqlalchemy-0.26.1.tar.gz", hash = "sha256:d8525f74de51554b5c8491effe036f60629a426229befa33ff614c8569a16a73"}, {file = "marshmallow_sqlalchemy-0.26.1-py2.py3-none-any.whl", hash = "sha256:ba7493eeb8669a3bf00d8f906b657feaa87a740ae9e4ecf829cfd6ddf763d276"}, ] -mashumaro = [ + +[package.dependencies] +marshmallow = ">=3.0.0" +SQLAlchemy = ">=1.2.0" + +[package.extras] +dev = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "pre-commit (>=2.0,<3.0)", "pytest", "pytest-lazy-fixture", "tox"] +docs = ["alabaster (==0.7.12)", "sphinx (==4.0.2)", "sphinx-issues (==1.2.0)"] +lint = ["flake8 (==3.9.2)", "flake8-bugbear (==21.4.3)", "pre-commit (>=2.0,<3.0)"] +tests = ["pytest", "pytest-lazy-fixture"] + 
+[[package]] +name = "mashumaro" +version = "3.6" +description = "Fast serialization library on top of dataclasses" +optional = false +python-versions = ">=3.7" +files = [ {file = "mashumaro-3.6-py3-none-any.whl", hash = "sha256:77403e3e2ecd0a7d0e22d472c08e33282460e48726eabe356c5163efbdf9c7ee"}, {file = "mashumaro-3.6.tar.gz", hash = "sha256:ceb3de53029219bbbb0385ca600b59348dcd14e0c68523986c6d51889ad338f5"}, ] -mccabe = [ + +[package.dependencies] +msgpack = {version = ">=0.5.6", optional = true, markers = "extra == \"msgpack\""} +typing-extensions = ">=4.1.0" + +[package.extras] +msgpack = ["msgpack (>=0.5.6)"] +orjson = ["orjson"] +toml = ["tomli (>=1.1.0)", "tomli-w (>=1.0)"] +yaml = ["pyyaml (>=3.13)"] + +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +files = [ {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] -mdit-py-plugins = [ + +[[package]] +name = "mdit-py-plugins" +version = "0.4.0" +description = "Collection of plugins for markdown-it-py" +optional = false +python-versions = ">=3.8" +files = [ {file = "mdit_py_plugins-0.4.0-py3-none-any.whl", hash = "sha256:b51b3bb70691f57f974e257e367107857a93b36f322a9e6d44ca5bf28ec2def9"}, {file = "mdit_py_plugins-0.4.0.tar.gz", hash = "sha256:d8ab27e9aed6c38aa716819fedfde15ca275715955f8a185a8e1cf90fb1d2c1b"}, ] -mdurl = [ + +[package.dependencies] +markdown-it-py = ">=1.0.0,<4.0.0" + +[package.extras] +code-style = ["pre-commit"] +rtd = ["myst-parser", "sphinx-book-theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, ] -minimal-snowplow-tracker = [ + +[[package]] +name = "minimal-snowplow-tracker" +version = "0.0.2" +description = "A minimal snowplow event tracker for Python. 
Add analytics to your Python and Django apps, webapps and games" +optional = false +python-versions = "*" +files = [ {file = "minimal-snowplow-tracker-0.0.2.tar.gz", hash = "sha256:acabf7572db0e7f5cbf6983d495eef54081f71be392330eb3aadb9ccb39daaa4"}, ] -more-itertools = [ + +[package.dependencies] +requests = ">=2.2.1,<3.0" +six = ">=1.9.0,<2.0" + +[[package]] +name = "more-itertools" +version = "10.1.0" +description = "More routines for operating on iterables, beyond itertools" +optional = true +python-versions = ">=3.8" +files = [ {file = "more-itertools-10.1.0.tar.gz", hash = "sha256:626c369fa0eb37bac0291bce8259b332fd59ac792fa5497b59837309cd5b114a"}, {file = "more_itertools-10.1.0-py3-none-any.whl", hash = "sha256:64e0735fcfdc6f3464ea133afe8ea4483b1c5fe3a3d69852e6503b43a0b222e6"}, ] -mpmath = [ + +[[package]] +name = "mpmath" +version = "1.3.0" +description = "Python library for arbitrary-precision floating-point arithmetic" +optional = true +python-versions = "*" +files = [ {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, ] -msal = [ + +[package.extras] +develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] +docs = ["sphinx"] +gmpy = ["gmpy2 (>=2.1.0a4)"] +tests = ["pytest (>=4.6)"] + +[[package]] +name = "msal" +version = "1.23.0" +description = "The Microsoft Authentication Library (MSAL) for Python library enables your app to access the Microsoft Cloud by supporting authentication of users with Microsoft Azure Active Directory accounts (AAD) and Microsoft Accounts (MSA) using industry standard OAuth2 and OpenID Connect." +optional = true +python-versions = "*" +files = [ {file = "msal-1.23.0-py2.py3-none-any.whl", hash = "sha256:3342e0837a047007f9d479e814b559c3219767453d57920dc40a31986862048b"}, {file = "msal-1.23.0.tar.gz", hash = "sha256:25c9a33acf84301f93d1fdbe9f1a9c60cd38af0d5fffdbfa378138fc7bc1e86b"}, ] -msal-extensions = [ + +[package.dependencies] +cryptography = ">=0.6,<44" +PyJWT = {version = ">=1.0.0,<3", extras = ["crypto"]} +requests = ">=2.0.0,<3" + +[package.extras] +broker = ["pymsalruntime (>=0.13.2,<0.14)"] + +[[package]] +name = "msal-extensions" +version = "1.0.0" +description = "Microsoft Authentication Library extensions (MSAL EX) provides a persistence API that can save your data on disk, encrypted on Windows, macOS and Linux. Concurrent data access will be coordinated by a file lock mechanism." 
+optional = true +python-versions = "*" +files = [ {file = "msal-extensions-1.0.0.tar.gz", hash = "sha256:c676aba56b0cce3783de1b5c5ecfe828db998167875126ca4b47dc6436451354"}, {file = "msal_extensions-1.0.0-py2.py3-none-any.whl", hash = "sha256:91e3db9620b822d0ed2b4d1850056a0f133cba04455e62f11612e40f5502f2ee"}, ] -msgpack = [ + +[package.dependencies] +msal = ">=0.4.1,<2.0.0" +portalocker = [ + {version = ">=1.0,<3", markers = "python_version >= \"3.5\" and platform_system != \"Windows\""}, + {version = ">=1.6,<3", markers = "python_version >= \"3.5\" and platform_system == \"Windows\""}, +] + +[[package]] +name = "msgpack" +version = "1.0.5" +description = "MessagePack serializer" +optional = false +python-versions = "*" +files = [ {file = "msgpack-1.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:525228efd79bb831cf6830a732e2e80bc1b05436b086d4264814b4b2955b2fa9"}, {file = "msgpack-1.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4f8d8b3bf1ff2672567d6b5c725a1b347fe838b912772aa8ae2bf70338d5a198"}, {file = "msgpack-1.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdc793c50be3f01106245a61b739328f7dccc2c648b501e237f0699fe1395b81"}, @@ -6723,7 +4564,14 @@ msgpack = [ {file = "msgpack-1.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:06f5174b5f8ed0ed919da0e62cbd4ffde676a374aba4020034da05fab67b9164"}, {file = "msgpack-1.0.5.tar.gz", hash = "sha256:c075544284eadc5cddc70f4757331d99dcbc16b2bbd4849d15f8aae4cf36d31c"}, ] -multidict = [ + +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +optional = false +python-versions = ">=3.7" +files = [ {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, @@ -6799,7 +4647,14 @@ multidict = [ {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, ] -mypy = [ + +[[package]] +name = "mypy" +version = "1.6.1" +description = "Optional static typing for Python" +optional = false +python-versions = ">=3.8" +files = [ {file = "mypy-1.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e5012e5cc2ac628177eaac0e83d622b2dd499e28253d4107a08ecc59ede3fc2c"}, {file = "mypy-1.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d8fbb68711905f8912e5af474ca8b78d077447d8f3918997fecbf26943ff3cbb"}, {file = "mypy-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21a1ad938fee7d2d96ca666c77b7c494c3c5bd88dff792220e1afbebb2925b5e"}, @@ -6828,47 +4683,161 @@ mypy = [ {file = "mypy-1.6.1-py3-none-any.whl", hash = "sha256:4cbe68ef919c28ea561165206a2dcb68591c50f3bcf777932323bc208d949cf1"}, {file = "mypy-1.6.1.tar.gz", hash = "sha256:4d01c00d09a0be62a4ca3f933e315455bde83f37f892ba4b08ce92f3cf44bcc1"}, ] -mypy-boto3-athena = [ + +[package.dependencies] +mypy-extensions = ">=1.0.0" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = ">=4.1.0" + +[package.extras] +dmypy = ["psutil (>=4.0)"] +install-types = ["pip"] +reports = ["lxml"] + +[[package]] +name = "mypy-boto3-athena" +version 
= "1.28.36" +description = "Type annotations for boto3.Athena 1.28.36 service generated with mypy-boto3-builder 7.18.0" +optional = true +python-versions = ">=3.7" +files = [ {file = "mypy-boto3-athena-1.28.36.tar.gz", hash = "sha256:a76df6aace3dc1d91b3f74640d617cd1b4802e5f348a22db2f16dfce0b01ee26"}, {file = "mypy_boto3_athena-1.28.36-py3-none-any.whl", hash = "sha256:b79b77df6ba30c55ff2f1f8b36de410f537c8c978d892e958b4c5e165797915a"}, ] -mypy-boto3-glue = [ + +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} + +[[package]] +name = "mypy-boto3-glue" +version = "1.28.36" +description = "Type annotations for boto3.Glue 1.28.36 service generated with mypy-boto3-builder 7.18.0" +optional = true +python-versions = ">=3.7" +files = [ {file = "mypy-boto3-glue-1.28.36.tar.gz", hash = "sha256:161771252bb6a220a0bfd8e6ad71da8548599c611f95fe8a94846f4a3386d2ae"}, {file = "mypy_boto3_glue-1.28.36-py3-none-any.whl", hash = "sha256:73bc14616ac65a5c02adea5efba7bbbcf8207cd0c0e3237c13d351ebc916338d"}, ] -mypy-boto3-lakeformation = [ + +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} + +[[package]] +name = "mypy-boto3-lakeformation" +version = "1.28.36" +description = "Type annotations for boto3.LakeFormation 1.28.36 service generated with mypy-boto3-builder 7.18.0" +optional = true +python-versions = ">=3.7" +files = [ {file = "mypy-boto3-lakeformation-1.28.36.tar.gz", hash = "sha256:9327cf0d28a09abf5bd90ae946ce7420b32a3b979a1a3554ac93716c3dceacb0"}, {file = "mypy_boto3_lakeformation-1.28.36-py3-none-any.whl", hash = "sha256:9525a8ab3d69632d4ec83eb565ff7fdfa1181fbdf032bcff4a20d4f8a0350688"}, ] -mypy-boto3-sts = [ + +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} + +[[package]] +name = "mypy-boto3-sts" +version = "1.28.37" +description = "Type annotations for boto3.STS 1.28.37 service generated with mypy-boto3-builder 7.18.2" +optional = true +python-versions = ">=3.7" +files = [ {file = "mypy-boto3-sts-1.28.37.tar.gz", hash = "sha256:54d64ca695ab90a51c68ac1e67ff9eae7ec69f926649e320a3b90ed1ec841a95"}, {file = "mypy_boto3_sts-1.28.37-py3-none-any.whl", hash = "sha256:24106ff30ecfe7ad0538657bbd00b6009418a5382b323cac46e0e26c1f5d50fb"}, ] -mypy-extensions = [ + +[package.dependencies] +typing-extensions = {version = ">=4.1.0", markers = "python_version < \"3.12\""} + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] -natsort = [ + +[[package]] +name = "natsort" +version = "8.4.0" +description = "Simple yet flexible natural sorting in Python." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c"}, {file = "natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581"}, ] -networkx = [ + +[package.extras] +fast = ["fastnumbers (>=2.0.0)"] +icu = ["PyICU (>=1.0.0)"] + +[[package]] +name = "networkx" +version = "2.8.8" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.8" +files = [ {file = "networkx-2.8.8-py3-none-any.whl", hash = "sha256:e435dfa75b1d7195c7b8378c3859f0445cd88c6b0375c181ed66823a9ceb7524"}, {file = "networkx-2.8.8.tar.gz", hash = "sha256:230d388117af870fce5647a3c52401fcf753e94720e6ea6b4197a5355648885e"}, ] -nr-date = [ + +[package.extras] +default = ["matplotlib (>=3.4)", "numpy (>=1.19)", "pandas (>=1.3)", "scipy (>=1.8)"] +developer = ["mypy (>=0.982)", "pre-commit (>=2.20)"] +doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.2)", "pydata-sphinx-theme (>=0.11)", "sphinx (>=5.2)", "sphinx-gallery (>=0.11)", "texext (>=0.6.6)"] +extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"] +test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] + +[[package]] +name = "nr-date" +version = "2.1.0" +description = "" +optional = false +python-versions = ">=3.6,<4.0" +files = [ {file = "nr_date-2.1.0-py3-none-any.whl", hash = "sha256:bd672a9dfbdcf7c4b9289fea6750c42490eaee08036a72059dcc78cb236ed568"}, {file = "nr_date-2.1.0.tar.gz", hash = "sha256:0643aea13bcdc2a8bc56af9d5e6a89ef244c9744a1ef00cdc735902ba7f7d2e6"}, ] -nr-stream = [ + +[[package]] +name = "nr-stream" +version = "1.1.5" +description = "" +optional = false +python-versions = ">=3.6,<4.0" +files = [ {file = "nr_stream-1.1.5-py3-none-any.whl", hash = "sha256:47e12150b331ad2cb729cfd9d2abd281c9949809729ba461c6aa87dd9927b2d4"}, {file = "nr_stream-1.1.5.tar.gz", hash = "sha256:eb0216c6bfc61a46d4568dba3b588502c610ec8ddef4ac98f3932a2bd7264f65"}, ] -nr-util = [ + +[[package]] +name = "nr-util" +version = "0.8.12" +description = "General purpose Python utility library." 
+optional = false +python-versions = ">=3.7,<4.0" +files = [ {file = "nr.util-0.8.12-py3-none-any.whl", hash = "sha256:91da02ac9795eb8e015372275c1efe54bac9051231ee9b0e7e6f96b0b4e7d2bb"}, {file = "nr.util-0.8.12.tar.gz", hash = "sha256:a4549c2033d99d2f0379b3f3d233fd2a8ade286bbf0b3ad0cc7cea16022214f4"}, ] -numpy = [ + +[package.dependencies] +deprecated = ">=1.2.0,<2.0.0" +typing-extensions = ">=3.0.0" + +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.8" +files = [ {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, @@ -6897,6 +4866,15 @@ numpy = [ {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + +[[package]] +name = "numpy" +version = "1.26.1" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = "<3.13,>=3.9" +files = [ {file = "numpy-1.26.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82e871307a6331b5f09efda3c22e03c095d957f04bf6bc1804f30048d0e5e7af"}, {file = "numpy-1.26.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdd9ec98f0063d93baeb01aad472a1a0840dee302842a2746a7a8e92968f9575"}, {file = "numpy-1.26.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d78f269e0c4fd365fc2992c00353e4530d274ba68f15e968d8bc3c69ce5f5244"}, @@ -6930,11 +4908,30 @@ numpy = [ {file = "numpy-1.26.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:6965888d65d2848e8768824ca8288db0a81263c1efccec881cb35a0d805fcd2f"}, {file = "numpy-1.26.1.tar.gz", hash = "sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe"}, ] -oauthlib = [ + +[[package]] +name = "oauthlib" +version = "3.2.2" +description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +optional = false +python-versions = ">=3.6" +files = [ {file = "oauthlib-3.2.2-py3-none-any.whl", hash = "sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca"}, {file = "oauthlib-3.2.2.tar.gz", hash = "sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918"}, ] -onnx = [ + +[package.extras] +rsa = ["cryptography (>=3.0.0)"] +signals = ["blinker (>=1.4.0)"] +signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] + +[[package]] +name = "onnx" +version = "1.15.0" +description = "Open Neural Network Exchange" +optional = true +python-versions = ">=3.8" +files = [ {file = "onnx-1.15.0-cp310-cp310-macosx_10_12_universal2.whl", hash = "sha256:51cacb6aafba308aaf462252ced562111f6991cdc7bc57a6c554c3519453a8ff"}, {file = "onnx-1.15.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:0aee26b6f7f7da7e840de75ad9195a77a147d0662c94eaa6483be13ba468ffc1"}, {file = 
"onnx-1.15.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baf6ef6c93b3b843edb97a8d5b3d229a1301984f3f8dee859c29634d2083e6f9"}, @@ -6961,7 +4958,21 @@ onnx = [ {file = "onnx-1.15.0-cp39-cp39-win_amd64.whl", hash = "sha256:95d7a3e2d79d371e272e39ae3f7547e0b116d0c7f774a4004e97febe6c93507f"}, {file = "onnx-1.15.0.tar.gz", hash = "sha256:b18461a7d38f286618ca2a6e78062a2a9c634ce498e631e708a8041b00094825"}, ] -onnxruntime = [ + +[package.dependencies] +numpy = "*" +protobuf = ">=3.20.2" + +[package.extras] +reference = ["Pillow", "google-re2"] + +[[package]] +name = "onnxruntime" +version = "1.16.1" +description = "ONNX Runtime is a runtime accelerator for Machine Learning models" +optional = true +python-versions = "*" +files = [ {file = "onnxruntime-1.16.1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:28b2c7f444b4119950b69370801cd66067f403d19cbaf2a444735d7c269cce4a"}, {file = "onnxruntime-1.16.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c24e04f33e7899f6aebb03ed51e51d346c1f906b05c5569d58ac9a12d38a2f58"}, {file = "onnxruntime-1.16.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fa93b166f2d97063dc9f33c5118c5729a4a5dd5617296b6dbef42f9047b3e81"}, @@ -6987,39 +4998,152 @@ onnxruntime = [ {file = "onnxruntime-1.16.1-cp39-cp39-win32.whl", hash = "sha256:85771adb75190db9364b25ddec353ebf07635b83eb94b64ed014f1f6d57a3857"}, {file = "onnxruntime-1.16.1-cp39-cp39-win_amd64.whl", hash = "sha256:d32d2b30799c1f950123c60ae8390818381fd5f88bdf3627eeca10071c155dc5"}, ] -opentelemetry-api = [ + +[package.dependencies] +coloredlogs = "*" +flatbuffers = "*" +numpy = ">=1.21.6" +packaging = "*" +protobuf = "*" +sympy = "*" + +[[package]] +name = "opentelemetry-api" +version = "1.15.0" +description = "OpenTelemetry Python API" +optional = false +python-versions = ">=3.7" +files = [ {file = "opentelemetry_api-1.15.0-py3-none-any.whl", hash = "sha256:e6c2d2e42140fd396e96edf75a7ceb11073f4efb4db87565a431cc9d0f93f2e0"}, {file = "opentelemetry_api-1.15.0.tar.gz", hash = "sha256:79ab791b4aaad27acc3dc3ba01596db5b5aac2ef75c70622c6038051d6c2cded"}, ] -opentelemetry-exporter-otlp = [ + +[package.dependencies] +deprecated = ">=1.2.6" +setuptools = ">=16.0" + +[[package]] +name = "opentelemetry-exporter-otlp" +version = "1.15.0" +description = "OpenTelemetry Collector Exporters" +optional = false +python-versions = ">=3.7" +files = [ {file = "opentelemetry_exporter_otlp-1.15.0-py3-none-any.whl", hash = "sha256:79f22748b6a54808a0448093dfa189c8490e729f67c134d4c992533d9393b33e"}, {file = "opentelemetry_exporter_otlp-1.15.0.tar.gz", hash = "sha256:4f7c49751d9720e2e726e13b0bb958ccade4e29122c305d92c033da432c8d2c5"}, ] -opentelemetry-exporter-otlp-proto-grpc = [ + +[package.dependencies] +opentelemetry-exporter-otlp-proto-grpc = "1.15.0" +opentelemetry-exporter-otlp-proto-http = "1.15.0" + +[[package]] +name = "opentelemetry-exporter-otlp-proto-grpc" +version = "1.15.0" +description = "OpenTelemetry Collector Protobuf over gRPC Exporter" +optional = false +python-versions = ">=3.7" +files = [ {file = "opentelemetry_exporter_otlp_proto_grpc-1.15.0-py3-none-any.whl", hash = "sha256:c2a5492ba7d140109968135d641d06ce3c5bd73c50665f787526065d57d7fd1d"}, {file = "opentelemetry_exporter_otlp_proto_grpc-1.15.0.tar.gz", hash = "sha256:844f2a4bb9bcda34e4eb6fe36765e5031aacb36dc60ed88c90fc246942ea26e7"}, ] -opentelemetry-exporter-otlp-proto-http = [ + +[package.dependencies] +backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""} 
+googleapis-common-protos = ">=1.52,<2.0" +grpcio = ">=1.0.0,<2.0.0" +opentelemetry-api = ">=1.12,<2.0" +opentelemetry-proto = "1.15.0" +opentelemetry-sdk = ">=1.12,<2.0" + +[package.extras] +test = ["pytest-grpc"] + +[[package]] +name = "opentelemetry-exporter-otlp-proto-http" +version = "1.15.0" +description = "OpenTelemetry Collector Protobuf over HTTP Exporter" +optional = false +python-versions = ">=3.7" +files = [ {file = "opentelemetry_exporter_otlp_proto_http-1.15.0-py3-none-any.whl", hash = "sha256:3ec2a02196c8a54bf5cbf7fe623a5238625638e83b6047a983bdf96e2bbb74c0"}, {file = "opentelemetry_exporter_otlp_proto_http-1.15.0.tar.gz", hash = "sha256:11b2c814249a49b22f6cca7a06b05701f561d577b747f3660dfd67b6eb9daf9c"}, ] -opentelemetry-proto = [ + +[package.dependencies] +backoff = {version = ">=1.10.0,<3.0.0", markers = "python_version >= \"3.7\""} +googleapis-common-protos = ">=1.52,<2.0" +opentelemetry-api = ">=1.12,<2.0" +opentelemetry-proto = "1.15.0" +opentelemetry-sdk = ">=1.12,<2.0" +requests = ">=2.7,<3.0" + +[package.extras] +test = ["responses (==0.22.0)"] + +[[package]] +name = "opentelemetry-proto" +version = "1.15.0" +description = "OpenTelemetry Python Proto" +optional = false +python-versions = ">=3.7" +files = [ {file = "opentelemetry_proto-1.15.0-py3-none-any.whl", hash = "sha256:044b6d044b4d10530f250856f933442b8753a17f94ae37c207607f733fb9a844"}, {file = "opentelemetry_proto-1.15.0.tar.gz", hash = "sha256:9c4008e40ac8cab359daac283fbe7002c5c29c77ea2674ad5626a249e64e0101"}, ] -opentelemetry-sdk = [ + +[package.dependencies] +protobuf = ">=3.19,<5.0" + +[[package]] +name = "opentelemetry-sdk" +version = "1.15.0" +description = "OpenTelemetry Python SDK" +optional = false +python-versions = ">=3.7" +files = [ {file = "opentelemetry_sdk-1.15.0-py3-none-any.whl", hash = "sha256:555c533e9837766119bbccc7a80458c9971d853a6f1da683a2246cd5e53b4645"}, {file = "opentelemetry_sdk-1.15.0.tar.gz", hash = "sha256:98dbffcfeebcbff12c0c974292d6ea603180a145904cf838b1fe4d5c99078425"}, ] -opentelemetry-semantic-conventions = [ + +[package.dependencies] +opentelemetry-api = "1.15.0" +opentelemetry-semantic-conventions = "0.36b0" +setuptools = ">=16.0" +typing-extensions = ">=3.7.4" + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.36b0" +description = "OpenTelemetry Semantic Conventions" +optional = false +python-versions = ">=3.7" +files = [ {file = "opentelemetry_semantic_conventions-0.36b0-py3-none-any.whl", hash = "sha256:adc05635e87b9d3e007c9f530eed487fc3ef2177d02f82f674f28ebf9aff8243"}, {file = "opentelemetry_semantic_conventions-0.36b0.tar.gz", hash = "sha256:829dc221795467d98b773c04096e29be038d77526dc8d6ac76f546fb6279bf01"}, ] -ordered-set = [ + +[[package]] +name = "ordered-set" +version = "4.1.0" +description = "An OrderedSet is a custom MutableSet that remembers its order, so that every" +optional = false +python-versions = ">=3.7" +files = [ {file = "ordered-set-4.1.0.tar.gz", hash = "sha256:694a8e44c87657c59292ede72891eb91d34131f6531463aab3009191c77364a8"}, {file = "ordered_set-4.1.0-py3-none-any.whl", hash = "sha256:046e1132c71fcf3330438a539928932caf51ddbc582496833e23de611de14562"}, ] -orjson = [ + +[package.extras] +dev = ["black", "mypy", "pytest"] + +[[package]] +name = "orjson" +version = "3.9.5" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +optional = false +python-versions = ">=3.7" +files = [ {file = "orjson-3.9.5-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = 
"sha256:ad6845912a71adcc65df7c8a7f2155eba2096cf03ad2c061c93857de70d699ad"}, {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e298e0aacfcc14ef4476c3f409e85475031de24e5b23605a465e9bf4b2156273"}, {file = "orjson-3.9.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:83c9939073281ef7dd7c5ca7f54cceccb840b440cec4b8a326bda507ff88a0a6"}, @@ -7081,11 +5205,25 @@ orjson = [ {file = "orjson-3.9.5-cp39-none-win_amd64.whl", hash = "sha256:91dda66755795ac6100e303e206b636568d42ac83c156547634256a2e68de694"}, {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, ] -packaging = [ + +[[package]] +name = "packaging" +version = "23.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] -pandas = [ + +[[package]] +name = "pandas" +version = "2.0.3" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.8" +files = [ {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, @@ -7112,23 +5250,98 @@ pandas = [ {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, ] -parsedatetime = [ + +[package.dependencies] +numpy = [ + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", 
"xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] + +[[package]] +name = "parsedatetime" +version = "2.4" +description = "Parse human-readable date/time text." +optional = false +python-versions = "*" +files = [ {file = "parsedatetime-2.4-py2-none-any.whl", hash = "sha256:9ee3529454bf35c40a77115f5a596771e59e1aee8c53306f346c461b8e913094"}, {file = "parsedatetime-2.4.tar.gz", hash = "sha256:3d817c58fb9570d1eec1dd46fa9448cd644eeed4fb612684b02dfda3a79cb84b"}, ] -pathspec = [ + +[package.dependencies] +future = "*" + +[[package]] +name = "pathspec" +version = "0.11.2" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.7" +files = [ {file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"}, {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, ] -pathvalidate = [ + +[[package]] +name = "pathvalidate" +version = "3.1.0" +description = "pathvalidate is a Python library to sanitize/validate a string such as filenames/file-paths/etc." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "pathvalidate-3.1.0-py3-none-any.whl", hash = "sha256:912fd1d2e1a2a6a6f98da36a91f21ed86746473810ff625b9c34f3d06c0caa1d"}, {file = "pathvalidate-3.1.0.tar.gz", hash = "sha256:426970226e24199fd90d93995d223c1e28bda967cdf4370755a14cdf72a2a8ee"}, ] -pbr = [ + +[package.extras] +docs = ["Sphinx (>=2.4)", "sphinx-rtd-theme (>=1.2.2)", "urllib3 (<2)"] +test = ["Faker (>=1.0.8)", "allpairspy (>=2)", "click (>=6.2)", "pytest (>=6.0.1)", "pytest-discord (>=0.1.2)", "pytest-md-report (>=0.3)"] + +[[package]] +name = "pbr" +version = "5.11.1" +description = "Python Build Reasonableness" +optional = false +python-versions = ">=2.6" +files = [ {file = "pbr-5.11.1-py2.py3-none-any.whl", hash = "sha256:567f09558bae2b3ab53cb3c1e2e33e726ff3338e7bae3db5dc954b3a44eef12b"}, {file = "pbr-5.11.1.tar.gz", hash = "sha256:aefc51675b0b533d56bb5fd1c8c6c0522fe31896679882e1c4c63d5e4a0fccb3"}, ] -pendulum = [ + +[[package]] +name = "pendulum" +version = "2.1.2" +description = "Python datetimes made easy" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ {file = "pendulum-2.1.2-cp27-cp27m-macosx_10_15_x86_64.whl", hash = "sha256:b6c352f4bd32dff1ea7066bd31ad0f71f8d8100b9ff709fb343f3b86cee43efe"}, {file = "pendulum-2.1.2-cp27-cp27m-win_amd64.whl", hash = "sha256:318f72f62e8e23cd6660dbafe1e346950281a9aed144b5c596b2ddabc1d19739"}, {file = "pendulum-2.1.2-cp35-cp35m-macosx_10_15_x86_64.whl", hash = "sha256:0731f0c661a3cb779d398803655494893c9f581f6488048b3fb629c2342b5394"}, @@ -7151,43 +5364,149 @@ pendulum = [ {file = "pendulum-2.1.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:94b1fc947bfe38579b28e1cccb36f7e28a15e841f30384b5ad6c5e31055c85d7"}, {file = "pendulum-2.1.2.tar.gz", hash = "sha256:b06a0ca1bfe41c990bbf0c029f0b6501a7f2ec4e38bfec730712015e8860f207"}, ] -pipdeptree = [ + +[package.dependencies] +python-dateutil = ">=2.6,<3.0" +pytzdata = ">=2020.1" + +[[package]] +name = "pipdeptree" +version = "2.9.6" +description = "Command line utility to show dependency tree of packages." +optional = true +python-versions = ">=3.7" +files = [ {file = "pipdeptree-2.9.6-py3-none-any.whl", hash = "sha256:de93f990d21224297c9f03e057da5a3dc65ff732a0147945dd9421671f13626b"}, {file = "pipdeptree-2.9.6.tar.gz", hash = "sha256:f815caf165e89c576ce659b866c7a82ae4590420c2d020a92d32e45097f8bc73"}, ] -pkgutil-resolve-name = [ + +[package.extras] +graphviz = ["graphviz (>=0.20.1)"] +test = ["covdefaults (>=2.3)", "diff-cover (>=7.6)", "pip (>=23.1.2)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "virtualenv (>=20.23.1,<21)"] + +[[package]] +name = "pkgutil-resolve-name" +version = "1.3.10" +description = "Resolve a name to an object." +optional = false +python-versions = ">=3.6" +files = [ {file = "pkgutil_resolve_name-1.3.10-py3-none-any.whl", hash = "sha256:ca27cc078d25c5ad71a9de0a7a330146c4e014c2462d9af19c6b828280649c5e"}, {file = "pkgutil_resolve_name-1.3.10.tar.gz", hash = "sha256:357d6c9e6a755653cfd78893817c0853af365dd51ec97f3d358a819373bbd174"}, ] -platformdirs = [ + +[[package]] +name = "platformdirs" +version = "3.8.1" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "platformdirs-3.8.1-py3-none-any.whl", hash = "sha256:cec7b889196b9144d088e4c57d9ceef7374f6c39694ad1577a0aab50d27ea28c"}, {file = "platformdirs-3.8.1.tar.gz", hash = "sha256:f87ca4fcff7d2b0f81c6a748a77973d7af0f4d526f98f308477c3c436c74d528"}, ] -pluggy = [ + +[package.extras] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)"] + +[[package]] +name = "pluggy" +version = "1.3.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, ] -ply = [ + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, ] -portalocker = [ + +[[package]] +name = "portalocker" +version = "2.7.0" +description = "Wraps the portalocker recipe for easy usage" +optional = true +python-versions = ">=3.5" +files = [ {file = "portalocker-2.7.0-py2.py3-none-any.whl", hash = "sha256:a07c5b4f3985c3cf4798369631fb7011adb498e2a46d8440efc75a8f29a0f983"}, {file = "portalocker-2.7.0.tar.gz", hash = "sha256:032e81d534a88ec1736d03f780ba073f047a06c478b06e2937486f334e955c51"}, ] -prefixed = [ + +[package.dependencies] +pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""} + +[package.extras] +docs = ["sphinx (>=1.7.1)"] +redis = ["redis"] +tests = ["pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "pytest-timeout (>=2.1.0)", "redis", "sphinx (>=6.0.0)"] + +[[package]] +name = "prefixed" +version = "0.7.0" +description = "Prefixed alternative numeric library" +optional = false +python-versions = "*" +files = [ {file = "prefixed-0.7.0-py2.py3-none-any.whl", hash = "sha256:537b0e4ff4516c4578f277a41d7104f769d6935ae9cdb0f88fed82ec7b3c0ca5"}, {file = "prefixed-0.7.0.tar.gz", hash = "sha256:0b54d15e602eb8af4ac31b1db21a37ea95ce5890e0741bb0dd9ded493cefbbe9"}, ] -prison = [ + +[[package]] +name = "prison" +version = "0.2.1" +description = "Rison encoder/decoder" +optional = false +python-versions = "*" +files = [ {file = "prison-0.2.1-py2.py3-none-any.whl", hash = "sha256:f90bab63fca497aa0819a852f64fb21a4e181ed9f6114deaa5dc04001a7555c5"}, {file = "prison-0.2.1.tar.gz", hash = "sha256:e6cd724044afcb1a8a69340cad2f1e3151a5839fd3a8027fd1357571e797c599"}, ] -proto-plus = [ + +[package.dependencies] +six = "*" + +[package.extras] +dev = ["nose", "pipreqs", "twine"] + +[[package]] +name = "proto-plus" +version = "1.22.3" +description = "Beautiful, Pythonic protocol buffers." 
+optional = true +python-versions = ">=3.6" +files = [ {file = "proto-plus-1.22.3.tar.gz", hash = "sha256:fdcd09713cbd42480740d2fe29c990f7fbd885a67efc328aa8be6ee3e9f76a6b"}, {file = "proto_plus-1.22.3-py3-none-any.whl", hash = "sha256:a49cd903bc0b6ab41f76bf65510439d56ca76f868adf0274e738bfdd096894df"}, ] -protobuf = [ + +[package.dependencies] +protobuf = ">=3.19.0,<5.0.0dev" + +[package.extras] +testing = ["google-api-core[grpc] (>=1.31.5)"] + +[[package]] +name = "protobuf" +version = "4.24.2" +description = "" +optional = false +python-versions = ">=3.7" +files = [ {file = "protobuf-4.24.2-cp310-abi3-win32.whl", hash = "sha256:58e12d2c1aa428ece2281cef09bbaa6938b083bcda606db3da4e02e991a0d924"}, {file = "protobuf-4.24.2-cp310-abi3-win_amd64.whl", hash = "sha256:77700b55ba41144fc64828e02afb41901b42497b8217b558e4a001f18a85f2e3"}, {file = "protobuf-4.24.2-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:237b9a50bd3b7307d0d834c1b0eb1a6cd47d3f4c2da840802cd03ea288ae8880"}, @@ -7202,7 +5521,14 @@ protobuf = [ {file = "protobuf-4.24.2-py3-none-any.whl", hash = "sha256:3b7b170d3491ceed33f723bbf2d5a260f8a4e23843799a3906f16ef736ef251e"}, {file = "protobuf-4.24.2.tar.gz", hash = "sha256:7fda70797ddec31ddfa3576cbdcc3ddbb6b3078b737a1a87ab9136af0570cd6e"}, ] -psutil = [ + +[[package]] +name = "psutil" +version = "5.9.5" +description = "Cross-platform lib for process and system monitoring in Python." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ {file = "psutil-5.9.5-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:be8929ce4313f9f8146caad4272f6abb8bf99fc6cf59344a3167ecd74f4f203f"}, {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:ab8ed1a1d77c95453db1ae00a3f9c50227ebd955437bcf2a574ba8adbf6a74d5"}, {file = "psutil-5.9.5-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:4aef137f3345082a3d3232187aeb4ac4ef959ba3d7c10c33dd73763fbc063da4"}, @@ -7218,7 +5544,17 @@ psutil = [ {file = "psutil-5.9.5-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:c607bb3b57dc779d55e1554846352b4e358c10fff3abf3514a7a6601beebdb30"}, {file = "psutil-5.9.5.tar.gz", hash = "sha256:5410638e4df39c54d957fc51ce03048acd8e6d60abc0f5107af51e5fb566eb3c"}, ] -psycopg2-binary = [ + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + +[[package]] +name = "psycopg2-binary" +version = "2.9.7" +description = "psycopg2 - Python-PostgreSQL Database Adapter" +optional = true +python-versions = ">=3.6" +files = [ {file = "psycopg2-binary-2.9.7.tar.gz", hash = "sha256:1b918f64a51ffe19cd2e230b3240ba481330ce1d4b7875ae67305bd1d37b041c"}, {file = "psycopg2_binary-2.9.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ea5f8ee87f1eddc818fc04649d952c526db4426d26bab16efbe5a0c52b27d6ab"}, {file = "psycopg2_binary-2.9.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2993ccb2b7e80844d534e55e0f12534c2871952f78e0da33c35e648bf002bbff"}, @@ -7280,14 +5616,39 @@ psycopg2-binary = [ {file = "psycopg2_binary-2.9.7-cp39-cp39-win32.whl", hash = "sha256:18f12632ab516c47c1ac4841a78fddea6508a8284c7cf0f292cb1a523f2e2379"}, {file = "psycopg2_binary-2.9.7-cp39-cp39-win_amd64.whl", hash = "sha256:eb3b8d55924a6058a26db69fb1d3e7e32695ff8b491835ba9f479537e14dcf9f"}, ] -psycopg2cffi = [ + +[[package]] +name = "psycopg2cffi" +version = "2.9.0" +description = ".. 
image:: https://travis-ci.org/chtd/psycopg2cffi.svg?branch=master" +optional = true +python-versions = "*" +files = [ {file = "psycopg2cffi-2.9.0.tar.gz", hash = "sha256:7e272edcd837de3a1d12b62185eb85c45a19feda9e62fa1b120c54f9e8d35c52"}, ] -py = [ + +[package.dependencies] +cffi = ">=1.0" +six = "*" + +[[package]] +name = "py" +version = "1.11.0" +description = "library with cross-python path, ini-parsing, io, code, log facilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] -pyarrow = [ + +[[package]] +name = "pyarrow" +version = "14.0.1" +description = "Python library for Apache Arrow" +optional = true +python-versions = ">=3.8" +files = [ {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, @@ -7325,31 +5686,106 @@ pyarrow = [ {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, ] -pyasn1 = [ + +[package.dependencies] +numpy = ">=1.16.6" + +[[package]] +name = "pyasn1" +version = "0.5.0" +description = "Pure-Python implementation of ASN.1 types and DER/BER/CER codecs (X.208)" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ {file = "pyasn1-0.5.0-py2.py3-none-any.whl", hash = "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57"}, {file = "pyasn1-0.5.0.tar.gz", hash = "sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde"}, ] -pyasn1-modules = [ + +[[package]] +name = "pyasn1-modules" +version = "0.3.0" +description = "A collection of ASN.1-based protocols modules" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ {file = "pyasn1_modules-0.3.0-py2.py3-none-any.whl", hash = "sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d"}, {file = "pyasn1_modules-0.3.0.tar.gz", hash = "sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c"}, ] -pyathena = [ + +[package.dependencies] +pyasn1 = ">=0.4.6,<0.6.0" + +[[package]] +name = "pyathena" +version = "3.0.6" +description = "Python DB API 2.0 (PEP 249) client for Amazon Athena" +optional = true +python-versions = ">=3.8.1" +files = [ {file = "pyathena-3.0.6-py3-none-any.whl", hash = "sha256:27fb606a73644e62be8ef9b86cdf583ab3cb9f8cac9c2ad8f05b7ad6d4eaaa87"}, {file = "pyathena-3.0.6.tar.gz", hash = "sha256:ee6ea175134894209af2c6be1859b7be4371f7741faa7a58f9f97905ff6a73a4"}, ] -pycodestyle = [ + +[package.dependencies] +boto3 = ">=1.26.4" +botocore = ">=1.29.4" +fsspec = "*" +tenacity = ">=4.1.0" + +[package.extras] +arrow = ["pyarrow (>=7.0.0)"] +fastparquet = ["fastparquet (>=0.4.0)"] +pandas = ["pandas (>=1.3.0)"] +sqlalchemy = ["sqlalchemy (>=1.0.0)"] + +[[package]] +name = 
"pycodestyle" +version = "2.9.1" +description = "Python style guide checker" +optional = false +python-versions = ">=3.6" +files = [ {file = "pycodestyle-2.9.1-py2.py3-none-any.whl", hash = "sha256:d1735fc58b418fd7c5f658d28d943854f8a849b01a5d0a1e6f3f3fdd0166804b"}, {file = "pycodestyle-2.9.1.tar.gz", hash = "sha256:2c9607871d58c76354b697b42f5d57e1ada7d261c261efac224b664affdc5785"}, ] -pycparser = [ + +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] -pydantic = [ + +[[package]] +name = "pydantic" +version = "2.5.0" +description = "Data validation using Python type hints" +optional = false +python-versions = ">=3.7" +files = [ {file = "pydantic-2.5.0-py3-none-any.whl", hash = "sha256:7ce6e766c456ad026fe5712f7bcf036efc34bd5d107b3e669ef7ea01b3a9050c"}, {file = "pydantic-2.5.0.tar.gz", hash = "sha256:69bd6fb62d2d04b7055f59a396993486a2ee586c43a0b89231ce0000de07627c"}, ] -pydantic-core = [ + +[package.dependencies] +annotated-types = ">=0.4.0" +pydantic-core = "2.14.1" +typing-extensions = ">=4.6.1" + +[package.extras] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.14.1" +description = "" +optional = false +python-versions = ">=3.7" +files = [ {file = "pydantic_core-2.14.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:812beca1dcb2b722cccc7e9c620bd972cbc323321194ec2725eab3222e6ac573"}, {file = "pydantic_core-2.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2ccdc53cb88e51c7d47d74c59630d7be844428f6b8d463055ffad6f0392d8da"}, {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd937733bf2fe7d6a8bf208c12741f1f730b7bf5636033877767a75093c29b8a"}, @@ -7452,23 +5888,89 @@ pydantic-core = [ {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d983222223f63e323a5f497f5b85e211557a5d8fb670dc88f343784502b466ba"}, {file = "pydantic_core-2.14.1.tar.gz", hash = "sha256:0d82a6ee815388a362885186e431fac84c7a06623bc136f508e9f88261d8cadb"}, ] -pydoc-markdown = [ + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" + +[[package]] +name = "pydoc-markdown" +version = "4.8.2" +description = "Create Python API documentation in Markdown format." 
+optional = false +python-versions = ">=3.7,<4.0" +files = [ {file = "pydoc_markdown-4.8.2-py3-none-any.whl", hash = "sha256:203f74119e6bb2f9deba43d452422de7c8ec31955b61e0620fa4dd8c2611715f"}, {file = "pydoc_markdown-4.8.2.tar.gz", hash = "sha256:fb6c927e31386de17472d42f9bd3d3be2905977d026f6216881c65145aa67f0b"}, ] -pyflakes = [ + +[package.dependencies] +click = ">=7.1,<9.0" +"databind.core" = ">=4.4.0,<5.0.0" +"databind.json" = ">=4.4.0,<5.0.0" +docspec = ">=2.2.1,<3.0.0" +docspec-python = ">=2.2.1,<3.0.0" +docstring-parser = ">=0.11,<0.12" +jinja2 = ">=3.0.0,<4.0.0" +"nr.util" = ">=0.7.5,<1.0.0" +PyYAML = ">=5.0,<7.0" +requests = ">=2.23.0,<3.0.0" +tomli = ">=2.0.0,<3.0.0" +tomli_w = ">=1.0.0,<2.0.0" +watchdog = "*" +yapf = ">=0.30.0" + +[[package]] +name = "pyflakes" +version = "2.5.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.6" +files = [ {file = "pyflakes-2.5.0-py2.py3-none-any.whl", hash = "sha256:4579f67d887f804e67edb544428f264b7b24f435b263c4614f384135cea553d2"}, {file = "pyflakes-2.5.0.tar.gz", hash = "sha256:491feb020dca48ccc562a8c0cbe8df07ee13078df59813b83959cbdada312ea3"}, ] -pygments = [ + +[[package]] +name = "pygments" +version = "2.16.1" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.7" +files = [ {file = "Pygments-2.16.1-py3-none-any.whl", hash = "sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692"}, {file = "Pygments-2.16.1.tar.gz", hash = "sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29"}, ] -pyjwt = [ + +[package.extras] +plugins = ["importlib-metadata"] + +[[package]] +name = "pyjwt" +version = "2.8.0" +description = "JSON Web Token implementation in Python" +optional = false +python-versions = ">=3.7" +files = [ {file = "PyJWT-2.8.0-py3-none-any.whl", hash = "sha256:59127c392cc44c2da5bb3192169a91f429924e17aff6534d70fdc02ab3e04320"}, {file = "PyJWT-2.8.0.tar.gz", hash = "sha256:57e28d156e3d5c10088e0c68abb90bfac3df82b40a71bd0daa20c65ccd5c23de"}, ] -pymongo = [ + +[package.dependencies] +cryptography = {version = ">=3.4.0", optional = true, markers = "extra == \"crypto\""} + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + +[[package]] +name = "pymongo" +version = "4.6.0" +description = "Python driver for MongoDB " +optional = false +python-versions = ">=3.7" +files = [ {file = "pymongo-4.6.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c011bd5ad03cc096f99ffcfdd18a1817354132c1331bed7a837a25226659845f"}, {file = "pymongo-4.6.0-cp310-cp310-manylinux1_i686.whl", hash = "sha256:5e63146dbdb1eac207464f6e0cfcdb640c9c5ff0f57b754fa96fe252314a1dc6"}, {file = "pymongo-4.6.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:2972dd1f1285866aba027eff2f4a2bbf8aa98563c2ced14cb34ee5602b36afdf"}, @@ -7551,11 +6053,41 @@ pymongo = [ {file = "pymongo-4.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:107a234dc55affc5802acb3b6d83cbb8c87355b38a9457fcd8806bdeb8bce161"}, {file = "pymongo-4.6.0.tar.gz", hash = "sha256:fb1c56d891f9e34303c451998ef62ba52659648bb0d75b03c5e4ac223a3342c2"}, ] -pymysql = [ + +[package.dependencies] +dnspython = ">=1.16.0,<3.0.0" + +[package.extras] +aws = ["pymongo-auth-aws 
(<2.0.0)"] +encryption = ["certifi", "pymongo[aws]", "pymongocrypt (>=1.6.0,<2.0.0)"] +gssapi = ["pykerberos", "winkerberos (>=0.5.0)"] +ocsp = ["certifi", "cryptography (>=2.5)", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] +snappy = ["python-snappy"] +test = ["pytest (>=7)"] +zstd = ["zstandard"] + +[[package]] +name = "pymysql" +version = "1.1.0" +description = "Pure Python MySQL Driver" +optional = false +python-versions = ">=3.7" +files = [ {file = "PyMySQL-1.1.0-py3-none-any.whl", hash = "sha256:8969ec6d763c856f7073c4c64662882675702efcb114b4bcbb955aea3a069fa7"}, {file = "PyMySQL-1.1.0.tar.gz", hash = "sha256:4f13a7df8bf36a51e81dd9f3605fede45a4878fe02f9236349fd82a3f0612f96"}, ] -pyodbc = [ + +[package.extras] +ed25519 = ["PyNaCl (>=1.4.0)"] +rsa = ["cryptography"] + +[[package]] +name = "pyodbc" +version = "4.0.39" +description = "DB API Module for ODBC" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ {file = "pyodbc-4.0.39-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:74af348dbaee4885998858daf50c8964e767629ecf6c195868b016367b0bb861"}, {file = "pyodbc-4.0.39-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0f5901b57eaef0761f4cf02bca8e7c63f589fd0fd723a79f6ccf1ea1275372e5"}, {file = "pyodbc-4.0.39-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0db69478d00fcd8d0b9bdde8aca0b0eada341fd6ed8c2da84b594b928c84106"}, @@ -7592,74 +6124,276 @@ pyodbc = [ {file = "pyodbc-4.0.39-cp39-cp39-win_amd64.whl", hash = "sha256:305c7d6337e2d4c8350677cc641b343fc0197b7b9bc167815c66b64545c67a53"}, {file = "pyodbc-4.0.39.tar.gz", hash = "sha256:e528bb70dd6d6299ee429868925df0866e3e919c772b9eff79c8e17920d8f116"}, ] -pyopenssl = [ + +[[package]] +name = "pyopenssl" +version = "23.2.0" +description = "Python wrapper module around the OpenSSL library" +optional = true +python-versions = ">=3.6" +files = [ {file = "pyOpenSSL-23.2.0-py3-none-any.whl", hash = "sha256:24f0dc5227396b3e831f4c7f602b950a5e9833d292c8e4a2e06b709292806ae2"}, {file = "pyOpenSSL-23.2.0.tar.gz", hash = "sha256:276f931f55a452e7dea69c7173e984eb2a4407ce413c918aa34b55f82f9b8bac"}, ] -pyparsing = [ + +[package.dependencies] +cryptography = ">=38.0.0,<40.0.0 || >40.0.0,<40.0.1 || >40.0.1,<42" + +[package.extras] +docs = ["sphinx (!=5.2.0,!=5.2.0.post0)", "sphinx-rtd-theme"] +test = ["flaky", "pretend", "pytest (>=3.0.1)"] + +[[package]] +name = "pyparsing" +version = "3.1.1" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.6.8" +files = [ {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, ] -pypdf2 = [ + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "pypdf2" +version = "3.0.1" +description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" +optional = false +python-versions = ">=3.6" +files = [ {file = "PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440"}, {file = "pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928"}, ] -pyreadline3 = [ + +[package.dependencies] +typing_extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + 
+[package.extras] +crypto = ["PyCryptodome"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] +docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] +full = ["Pillow", "PyCryptodome"] +image = ["Pillow"] + +[[package]] +name = "pyreadline3" +version = "3.4.1" +description = "A python implementation of GNU readline." +optional = true +python-versions = "*" +files = [ {file = "pyreadline3-3.4.1-py3-none-any.whl", hash = "sha256:b0efb6516fd4fb07b45949053826a62fa4cb353db5be2bbb4a7aa1fdd1e345fb"}, {file = "pyreadline3-3.4.1.tar.gz", hash = "sha256:6f3d1f7b8a31ba32b73917cefc1f28cc660562f39aea8646d30bd6eff21f7bae"}, ] -pytest = [ + +[[package]] +name = "pytest" +version = "6.2.5" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.6" +files = [ {file = "pytest-6.2.5-py3-none-any.whl", hash = "sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134"}, {file = "pytest-6.2.5.tar.gz", hash = "sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89"}, ] -pytest-cases = [ + +[package.dependencies] +atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} +attrs = ">=19.2.0" +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +py = ">=1.8.2" +toml = "*" + +[package.extras] +testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xmlschema"] + +[[package]] +name = "pytest-cases" +version = "3.6.14" +description = "Separate test code from test cases in pytest." +optional = false +python-versions = "*" +files = [ {file = "pytest-cases-3.6.14.tar.gz", hash = "sha256:7455e6ca57a544c1bfdd8b56ace08c1c1ce4c6572a8aab8f1bd351dc25a10b6b"}, {file = "pytest_cases-3.6.14-py2.py3-none-any.whl", hash = "sha256:a087f3d019efd8942d0f0dc3fb526bedf9f83d742c40289e9623f6788aff7257"}, ] -pytest-console-scripts = [ + +[package.dependencies] +decopatch = "*" +makefun = ">=1.9.5" + +[[package]] +name = "pytest-console-scripts" +version = "1.4.1" +description = "Pytest plugin for testing console scripts" +optional = false +python-versions = ">=3.8" +files = [ {file = "pytest-console-scripts-1.4.1.tar.gz", hash = "sha256:5a826ed84cc0afa202eb9e44381d7d762f7bdda8e0c23f9f79a7f1f44cf4a895"}, {file = "pytest_console_scripts-1.4.1-py3-none-any.whl", hash = "sha256:ad860a951a90eca4bd3bd1159b8f5428633ba4ea01abd5c9526b67a95f65437a"}, ] -pytest-forked = [ + +[package.dependencies] +importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.10\""} +pytest = ">=4.0.0" + +[[package]] +name = "pytest-forked" +version = "1.6.0" +description = "run tests in isolated forked subprocesses" +optional = false +python-versions = ">=3.7" +files = [ {file = "pytest-forked-1.6.0.tar.gz", hash = "sha256:4dafd46a9a600f65d822b8f605133ecf5b3e1941ebb3588e943b4e3eb71a5a3f"}, {file = "pytest_forked-1.6.0-py3-none-any.whl", hash = "sha256:810958f66a91afb1a1e2ae83089d8dc1cd2437ac96b12963042fbb9fb4d16af0"}, ] -pytest-order = [ + +[package.dependencies] +py = "*" +pytest = ">=3.10" + +[[package]] +name = "pytest-order" +version = "1.1.0" +description = "pytest plugin to run your tests in a specific order" +optional = false +python-versions = ">=3.6" +files = [ {file = "pytest-order-1.1.0.tar.gz", hash = "sha256:139d25b30826b78eebb42722f747eab14c44b88059d7a71d4f79d14a057269a5"}, {file = "pytest_order-1.1.0-py3-none-any.whl", hash = "sha256:3b3730969c97900fa5cd31ecff80847680ed56b2490954565c14949ba60d9371"}, ] -pytest-pythonpath = 
[ + +[package.dependencies] +pytest = [ + {version = ">=5.0", markers = "python_version < \"3.10\""}, + {version = ">=6.2.4", markers = "python_version >= \"3.10\""}, +] + +[[package]] +name = "pytest-pythonpath" +version = "0.7.4" +description = "pytest plugin for adding to the PYTHONPATH from command line or configs." +optional = false +python-versions = ">=2.6, <4" +files = [ {file = "pytest-pythonpath-0.7.4.tar.gz", hash = "sha256:64e195b23a8f8c0c631fb16882d9ad6fa4137ed1f2961ddd15d52065cd435db6"}, {file = "pytest_pythonpath-0.7.4-py3-none-any.whl", hash = "sha256:e73e11dab2f0b83e73229e261242b251f0a369d7f527dbfec068822fd26a6ce5"}, ] -python-daemon = [ + +[package.dependencies] +pytest = ">=2.5.2,<7" + +[[package]] +name = "python-daemon" +version = "3.0.1" +description = "Library to implement a well-behaved Unix daemon process." +optional = false +python-versions = ">=3" +files = [ {file = "python-daemon-3.0.1.tar.gz", hash = "sha256:6c57452372f7eaff40934a1c03ad1826bf5e793558e87fef49131e6464b4dae5"}, {file = "python_daemon-3.0.1-py3-none-any.whl", hash = "sha256:42bb848a3260a027fa71ad47ecd959e471327cb34da5965962edd5926229f341"}, ] -python-dateutil = [ + +[package.dependencies] +docutils = "*" +lockfile = ">=0.10" +setuptools = ">=62.4.0" + +[package.extras] +devel = ["coverage", "docutils", "isort", "testscenarios (>=0.4)", "testtools", "twine"] +test = ["coverage", "docutils", "testscenarios (>=0.4)", "testtools"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] -python-nvd3 = [ + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "python-nvd3" +version = "0.15.0" +description = "Python NVD3 - Chart Library for d3.js" +optional = false +python-versions = "*" +files = [ {file = "python-nvd3-0.15.0.tar.gz", hash = "sha256:fbd75ff47e0ef255b4aa4f3a8b10dc8b4024aa5a9a7abed5b2406bd3cb817715"}, ] -python-slugify = [ + +[package.dependencies] +Jinja2 = ">=2.8" +python-slugify = ">=1.2.5" + +[[package]] +name = "python-slugify" +version = "8.0.1" +description = "A Python slugify application that also handles Unicode" +optional = false +python-versions = ">=3.7" +files = [ {file = "python-slugify-8.0.1.tar.gz", hash = "sha256:ce0d46ddb668b3be82f4ed5e503dbc33dd815d83e2eb6824211310d3fb172a27"}, {file = "python_slugify-8.0.1-py2.py3-none-any.whl", hash = "sha256:70ca6ea68fe63ecc8fa4fcf00ae651fc8a5d02d93dcd12ae6d4fc7ca46c4d395"}, ] -pytimeparse = [ + +[package.dependencies] +text-unidecode = ">=1.3" + +[package.extras] +unidecode = ["Unidecode (>=1.1.1)"] + +[[package]] +name = "pytimeparse" +version = "1.1.8" +description = "Time expression parser" +optional = false +python-versions = "*" +files = [ {file = "pytimeparse-1.1.8-py2.py3-none-any.whl", hash = "sha256:04b7be6cc8bd9f5647a6325444926c3ac34ee6bc7e69da4367ba282f076036bd"}, {file = "pytimeparse-1.1.8.tar.gz", hash = "sha256:e86136477be924d7e670646a98561957e8ca7308d44841e21f5ddea757556a0a"}, ] -pytz = [ + +[[package]] +name = "pytz" +version = "2023.3" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ {file = "pytz-2023.3-py2.py3-none-any.whl", hash = 
"sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, ] -pytzdata = [ + +[[package]] +name = "pytzdata" +version = "2020.1" +description = "The Olson timezone database for Python." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ {file = "pytzdata-2020.1-py2.py3-none-any.whl", hash = "sha256:e1e14750bcf95016381e4d472bad004eef710f2d6417240904070b3d6654485f"}, {file = "pytzdata-2020.1.tar.gz", hash = "sha256:3efa13b335a00a8de1d345ae41ec78dd11c9f8807f522d39850f2dd828681540"}, ] -pywin32 = [ + +[[package]] +name = "pywin32" +version = "306" +description = "Python for Window Extensions" +optional = true +python-versions = "*" +files = [ {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, @@ -7675,11 +6409,25 @@ pywin32 = [ {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, ] -pywin32-ctypes = [ + +[[package]] +name = "pywin32-ctypes" +version = "0.2.2" +description = "A (partial) reimplementation of pywin32 using ctypes/cffi" +optional = true +python-versions = ">=3.6" +files = [ {file = "pywin32-ctypes-0.2.2.tar.gz", hash = "sha256:3426e063bdd5fd4df74a14fa3cf80a0b42845a87e1d1e81f6549f9daec593a60"}, {file = "pywin32_ctypes-0.2.2-py3-none-any.whl", hash = "sha256:bf490a1a709baf35d688fe0ecf980ed4de11d2b3e37b51e5442587a75d9957e7"}, ] -pyyaml = [ + +[[package]] +name = "pyyaml" +version = "6.0.1" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.6" +files = [ {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, @@ -7721,18 +6469,80 @@ pyyaml = [ {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] -qdrant-client = [ + +[[package]] +name = "qdrant-client" +version = "1.6.4" +description = "Client library for the Qdrant vector search engine" +optional = true +python-versions = ">=3.8,<3.13" +files = [ {file = "qdrant_client-1.6.4-py3-none-any.whl", hash = "sha256:db4696978d6a62d78ff60f70b912383f1e467bda3053f732b01ddb5f93281b10"}, {file = "qdrant_client-1.6.4.tar.gz", hash = "sha256:bbd65f383b6a55a9ccf4e301250fa925179340dd90cfde9b93ce4230fd68867b"}, ] -redshift-connector = [ + +[package.dependencies] +fastembed = {version = "0.1.1", optional = true, markers = "python_version < \"3.12\" and extra == \"fastembed\""} +grpcio = ">=1.41.0" +grpcio-tools = ">=1.41.0" +httpx = 
{version = ">=0.14.0", extras = ["http2"]} +numpy = [ + {version = ">=1.21", markers = "python_version >= \"3.8\" and python_version < \"3.12\""}, + {version = ">=1.26", markers = "python_version >= \"3.12\""}, +] +portalocker = ">=2.7.0,<3.0.0" +pydantic = ">=1.10.8" +urllib3 = ">=1.26.14,<2.0.0" + +[package.extras] +fastembed = ["fastembed (==0.1.1)"] + +[[package]] +name = "redshift-connector" +version = "2.0.913" +description = "Redshift interface library" +optional = true +python-versions = ">=3.6" +files = [ {file = "redshift_connector-2.0.913-py3-none-any.whl", hash = "sha256:bd70395c5b7ec9fcae9565daff6bcb88c7d3ea6182dafba2bac6138f68d00582"}, ] -referencing = [ + +[package.dependencies] +beautifulsoup4 = ">=4.7.0,<5.0.0" +boto3 = ">=1.9.201,<2.0.0" +botocore = ">=1.12.201,<2.0.0" +lxml = ">=4.6.5" +packaging = "*" +pytz = ">=2020.1" +requests = ">=2.23.0,<3.0.0" +scramp = ">=1.2.0,<1.5.0" +setuptools = "*" + +[package.extras] +full = ["numpy", "pandas"] + +[[package]] +name = "referencing" +version = "0.30.2" +description = "JSON Referencing + Python" +optional = false +python-versions = ">=3.8" +files = [ {file = "referencing-0.30.2-py3-none-any.whl", hash = "sha256:449b6669b6121a9e96a7f9e410b245d471e8d48964c67113ce9afe50c8dd7bdf"}, {file = "referencing-0.30.2.tar.gz", hash = "sha256:794ad8003c65938edcdbc027f1933215e0d0ccc0291e3ce20a4d87432b59efc0"}, ] -regex = [ + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" + +[[package]] +name = "regex" +version = "2023.8.8" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.6" +files = [ {file = "regex-2023.8.8-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:88900f521c645f784260a8d346e12a1590f79e96403971241e64c3a265c8ecdb"}, {file = "regex-2023.8.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3611576aff55918af2697410ff0293d6071b7e00f4b09e005d614686ac4cd57c"}, {file = "regex-2023.8.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b8a0ccc8f2698f120e9e5742f4b38dc944c38744d4bdfc427616f3a163dd9de5"}, @@ -7822,39 +6632,147 @@ regex = [ {file = "regex-2023.8.8-cp39-cp39-win_amd64.whl", hash = "sha256:5543c055d8ec7801901e1193a51570643d6a6ab8751b1f7dd9af71af467538bb"}, {file = "regex-2023.8.8.tar.gz", hash = "sha256:fcbdc5f2b0f1cd0f6a56cdb46fe41d2cce1e644e3b68832f3eeebc5fb0f7712e"}, ] -requests = [ + +[[package]] +name = "requests" +version = "2.31.0" +description = "Python HTTP for Humans." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, ] -requests-mock = [ + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "requests-mock" +version = "1.11.0" +description = "Mock out responses from the requests package" +optional = false +python-versions = "*" +files = [ {file = "requests-mock-1.11.0.tar.gz", hash = "sha256:ef10b572b489a5f28e09b708697208c4a3b2b89ef80a9f01584340ea357ec3c4"}, {file = "requests_mock-1.11.0-py2.py3-none-any.whl", hash = "sha256:f7fae383f228633f6bececebdab236c478ace2284d6292c6e7e2867b9ab74d15"}, ] -requests-oauthlib = [ + +[package.dependencies] +requests = ">=2.3,<3" +six = "*" + +[package.extras] +fixture = ["fixtures"] +test = ["fixtures", "mock", "purl", "pytest", "requests-futures", "sphinx", "testtools"] + +[[package]] +name = "requests-oauthlib" +version = "1.3.1" +description = "OAuthlib authentication support for Requests." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ {file = "requests-oauthlib-1.3.1.tar.gz", hash = "sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a"}, {file = "requests_oauthlib-1.3.1-py2.py3-none-any.whl", hash = "sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5"}, ] -requests-toolbelt = [ + +[package.dependencies] +oauthlib = ">=3.0.0" +requests = ">=2.0.0" + +[package.extras] +rsa = ["oauthlib[signedtoken] (>=3.0.0)"] + +[[package]] +name = "requests-toolbelt" +version = "1.0.0" +description = "A utility belt for advanced users of python-requests" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, ] -requirements-parser = [ + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + +[[package]] +name = "requirements-parser" +version = "0.5.0" +description = "This is a small Python module for parsing Pip requirement files." 
+optional = false +python-versions = ">=3.6,<4.0" +files = [ {file = "requirements-parser-0.5.0.tar.gz", hash = "sha256:3336f3a3ae23e06d3f0f88595e4052396e3adf91688787f637e5d2ca1a904069"}, {file = "requirements_parser-0.5.0-py3-none-any.whl", hash = "sha256:e7fcdcd04f2049e73a9fb150d8a0f9d51ce4108f5f7cbeac74c484e17b12bcd9"}, ] -rfc3339-validator = [ + +[package.dependencies] +types-setuptools = ">=57.0.0" + +[[package]] +name = "rfc3339-validator" +version = "0.1.4" +description = "A pure python RFC3339 validator" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ {file = "rfc3339_validator-0.1.4-py2.py3-none-any.whl", hash = "sha256:24f6ec1eda14ef823da9e36ec7113124b39c04d50a4d3d3a3c2859577e7791fa"}, {file = "rfc3339_validator-0.1.4.tar.gz", hash = "sha256:138a2abdf93304ad60530167e51d2dfb9549521a836871b88d7f4695d0022f6b"}, ] -rich = [ + +[package.dependencies] +six = "*" + +[[package]] +name = "rich" +version = "13.5.2" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ {file = "rich-13.5.2-py3-none-any.whl", hash = "sha256:146a90b3b6b47cac4a73c12866a499e9817426423f57c5a66949c086191a8808"}, {file = "rich-13.5.2.tar.gz", hash = "sha256:fb9d6c0a0f643c99eed3875b5377a184132ba9be4d61516a55273d3554d75a39"}, ] -rich-argparse = [ + +[package.dependencies] +markdown-it-py = ">=2.2.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + +[[package]] +name = "rich-argparse" +version = "1.3.0" +description = "Rich help formatters for argparse and optparse" +optional = false +python-versions = ">=3.7" +files = [ {file = "rich_argparse-1.3.0-py3-none-any.whl", hash = "sha256:1a5eda1659c0a215862fe3630fcbe68d7792f18a8106baaf4e005b9896acc6f6"}, {file = "rich_argparse-1.3.0.tar.gz", hash = "sha256:974cc1ba0aaa0d6aabc09ab1b78f9ba928670e08590f9551121bcbc60c75b74a"}, ] -rpds-py = [ + +[package.dependencies] +rich = ">=11.0.0" + +[[package]] +name = "rpds-py" +version = "0.10.0" +description = "Python bindings to Rust's persistent data structures (rpds)" +optional = false +python-versions = ">=3.8" +files = [ {file = "rpds_py-0.10.0-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:c1e0e9916301e3b3d970814b1439ca59487f0616d30f36a44cead66ee1748c31"}, {file = "rpds_py-0.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8ce8caa29ebbdcde67e5fd652c811d34bc01f249dbc0d61e5cc4db05ae79a83b"}, {file = "rpds_py-0.10.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad277f74b1c164f7248afa968700e410651eb858d7c160d109fb451dc45a2f09"}, @@ -7953,35 +6871,148 @@ rpds-py = [ {file = "rpds_py-0.10.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:872f3dcaa8bf2245944861d7311179d2c0c9b2aaa7d3b464d99a7c2e401f01fa"}, {file = "rpds_py-0.10.0.tar.gz", hash = "sha256:e36d7369363d2707d5f68950a64c4e025991eb0177db01ccb6aa6facae48b69f"}, ] -rsa = [ + +[[package]] +name = "rsa" +version = "4.9" +description = "Pure-Python RSA implementation" +optional = false +python-versions = ">=3.6,<4" +files = [ {file = "rsa-4.9-py3-none-any.whl", hash = "sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7"}, {file = "rsa-4.9.tar.gz", hash = "sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21"}, ] -s3fs = [ + +[package.dependencies] +pyasn1 = ">=0.1.3" + +[[package]] +name = 
"s3fs" +version = "2023.6.0" +description = "Convenient Filesystem interface over S3" +optional = true +python-versions = ">= 3.8" +files = [ {file = "s3fs-2023.6.0-py3-none-any.whl", hash = "sha256:d1a0a423d0d2e17fb2a193d9531935dc3f45ba742693448a461b6b34f6a92a24"}, {file = "s3fs-2023.6.0.tar.gz", hash = "sha256:63fd8ddf05eb722de784b7b503196107f2a518061298cf005a8a4715b4d49117"}, ] -s3transfer = [ + +[package.dependencies] +aiobotocore = ">=2.5.0,<2.6.0" +aiohttp = "<4.0.0a0 || >4.0.0a0,<4.0.0a1 || >4.0.0a1" +fsspec = "2023.6.0" + +[package.extras] +awscli = ["aiobotocore[awscli] (>=2.5.0,<2.6.0)"] +boto3 = ["aiobotocore[boto3] (>=2.5.0,<2.6.0)"] + +[[package]] +name = "s3transfer" +version = "0.6.2" +description = "An Amazon S3 Transfer Manager" +optional = true +python-versions = ">= 3.7" +files = [ {file = "s3transfer-0.6.2-py3-none-any.whl", hash = "sha256:b014be3a8a2aab98cfe1abc7229cc5a9a0cf05eb9c1f2b86b230fd8df3f78084"}, {file = "s3transfer-0.6.2.tar.gz", hash = "sha256:cab66d3380cca3e70939ef2255d01cd8aece6a4907a9528740f668c4b0611861"}, ] -scramp = [ + +[package.dependencies] +botocore = ">=1.12.36,<2.0a.0" + +[package.extras] +crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] + +[[package]] +name = "scramp" +version = "1.4.4" +description = "An implementation of the SCRAM protocol." +optional = true +python-versions = ">=3.7" +files = [ {file = "scramp-1.4.4-py3-none-any.whl", hash = "sha256:b142312df7c2977241d951318b7ee923d6b7a4f75ba0f05b621ece1ed616faa3"}, {file = "scramp-1.4.4.tar.gz", hash = "sha256:b7022a140040f33cf863ab2657917ed05287a807b917950489b89b9f685d59bc"}, ] -secretstorage = [ + +[package.dependencies] +asn1crypto = ">=1.5.1" + +[[package]] +name = "secretstorage" +version = "3.3.3" +description = "Python bindings to FreeDesktop.org Secret Service API" +optional = true +python-versions = ">=3.6" +files = [ {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"}, {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"}, ] -semver = [ + +[package.dependencies] +cryptography = ">=2.0" +jeepney = ">=0.6" + +[[package]] +name = "semver" +version = "3.0.1" +description = "Python helper for Semantic Versioning (https://semver.org)" +optional = false +python-versions = ">=3.7" +files = [ {file = "semver-3.0.1-py3-none-any.whl", hash = "sha256:2a23844ba1647362c7490fe3995a86e097bb590d16f0f32dfc383008f19e4cdf"}, {file = "semver-3.0.1.tar.gz", hash = "sha256:9ec78c5447883c67b97f98c3b6212796708191d22e4ad30f4570f840171cbce1"}, ] -sentry-sdk = [ + +[[package]] +name = "sentry-sdk" +version = "1.30.0" +description = "Python client for Sentry (https://sentry.io)" +optional = false +python-versions = "*" +files = [ {file = "sentry-sdk-1.30.0.tar.gz", hash = "sha256:7dc873b87e1faf4d00614afd1058bfa1522942f33daef8a59f90de8ed75cd10c"}, {file = "sentry_sdk-1.30.0-py2.py3-none-any.whl", hash = "sha256:2e53ad63f96bb9da6570ba2e755c267e529edcf58580a2c0d2a11ef26e1e678b"}, ] -setproctitle = [ + +[package.dependencies] +certifi = "*" +urllib3 = {version = ">=1.26.11", markers = "python_version >= \"3.6\""} + +[package.extras] +aiohttp = ["aiohttp (>=3.5)"] +arq = ["arq (>=0.23)"] +beam = ["apache-beam (>=2.12)"] +bottle = ["bottle (>=0.12.13)"] +celery = ["celery (>=3)"] +chalice = ["chalice (>=1.16.0)"] +django = ["django (>=1.8)"] +falcon = ["falcon (>=1.4)"] +fastapi = ["fastapi (>=0.79.0)"] +flask = ["blinker (>=1.1)", "flask (>=0.11)", "markupsafe"] +grpcio = 
["grpcio (>=1.21.1)"] +httpx = ["httpx (>=0.16.0)"] +huey = ["huey (>=2)"] +loguru = ["loguru (>=0.5)"] +opentelemetry = ["opentelemetry-distro (>=0.35b0)"] +opentelemetry-experimental = ["opentelemetry-distro (>=0.40b0,<1.0)", "opentelemetry-instrumentation-aiohttp-client (>=0.40b0,<1.0)", "opentelemetry-instrumentation-django (>=0.40b0,<1.0)", "opentelemetry-instrumentation-fastapi (>=0.40b0,<1.0)", "opentelemetry-instrumentation-flask (>=0.40b0,<1.0)", "opentelemetry-instrumentation-requests (>=0.40b0,<1.0)", "opentelemetry-instrumentation-sqlite3 (>=0.40b0,<1.0)", "opentelemetry-instrumentation-urllib (>=0.40b0,<1.0)"] +pure-eval = ["asttokens", "executing", "pure-eval"] +pymongo = ["pymongo (>=3.1)"] +pyspark = ["pyspark (>=2.4.4)"] +quart = ["blinker (>=1.1)", "quart (>=0.16.1)"] +rq = ["rq (>=0.6)"] +sanic = ["sanic (>=0.8)"] +sqlalchemy = ["sqlalchemy (>=1.2)"] +starlette = ["starlette (>=0.19.1)"] +starlite = ["starlite (>=1.48)"] +tornado = ["tornado (>=5)"] + +[[package]] +name = "setproctitle" +version = "1.3.2" +description = "A Python module to customize the process title" +optional = false +python-versions = ">=3.7" +files = [ {file = "setproctitle-1.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:288943dec88e178bb2fd868adf491197cc0fc8b6810416b1c6775e686bab87fe"}, {file = "setproctitle-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:630f6fe5e24a619ccf970c78e084319ee8be5be253ecc9b5b216b0f474f5ef18"}, {file = "setproctitle-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c877691b90026670e5a70adfbcc735460a9f4c274d35ec5e8a43ce3f8443005"}, @@ -8055,11 +7086,33 @@ setproctitle = [ {file = "setproctitle-1.3.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7aa0aac1711fadffc1d51e9d00a3bea61f68443d6ac0241a224e4d622489d665"}, {file = "setproctitle-1.3.2.tar.gz", hash = "sha256:b9fb97907c830d260fa0658ed58afd48a86b2b88aac521135c352ff7fd3477fd"}, ] -setuptools = [ + +[package.extras] +test = ["pytest"] + +[[package]] +name = "setuptools" +version = "68.1.2" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.8" +files = [ {file = "setuptools-68.1.2-py3-none-any.whl", hash = "sha256:3d8083eed2d13afc9426f227b24fd1659489ec107c0e86cec2ffdde5c92e790b"}, {file = "setuptools-68.1.2.tar.gz", hash = "sha256:3d4dfa6d95f1b101d695a6160a7626e15583af71a5f52176efa5d39a054d475d"}, ] -simplejson = [ + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5,<=7.1.2)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "simplejson" +version = "3.19.1" +description = "Simple, fast, extensible JSON 
encoder/decoder for Python" +optional = false +python-versions = ">=2.5, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ {file = "simplejson-3.19.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:412e58997a30c5deb8cab5858b8e2e5b40ca007079f7010ee74565cc13d19665"}, {file = "simplejson-3.19.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e765b1f47293dedf77946f0427e03ee45def2862edacd8868c6cf9ab97c8afbd"}, {file = "simplejson-3.19.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:3231100edee292da78948fa0a77dee4e5a94a0a60bcba9ed7a9dc77f4d4bb11e"}, @@ -8146,19 +7199,47 @@ simplejson = [ {file = "simplejson-3.19.1-py3-none-any.whl", hash = "sha256:4710806eb75e87919b858af0cba4ffedc01b463edc3982ded7b55143f39e41e1"}, {file = "simplejson-3.19.1.tar.gz", hash = "sha256:6277f60848a7d8319d27d2be767a7546bc965535b28070e310b3a9af90604a4c"}, ] -six = [ + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, ] -smmap = [ + +[[package]] +name = "smmap" +version = "5.0.0" +description = "A pure Python implementation of a sliding window memory map manager" +optional = false +python-versions = ">=3.6" +files = [ {file = "smmap-5.0.0-py3-none-any.whl", hash = "sha256:2aba19d6a040e78d8b09de5c57e96207b09ed71d8e55ce0959eeee6c8e190d94"}, {file = "smmap-5.0.0.tar.gz", hash = "sha256:c840e62059cd3be204b0c9c9f74be2c09d5648eddd4580d9314c3ecde0b30936"}, ] -sniffio = [ + +[[package]] +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] -snowflake-connector-python = [ + +[[package]] +name = "snowflake-connector-python" +version = "3.5.0" +description = "Snowflake Connector for Python" +optional = true +python-versions = ">=3.8" +files = [ {file = "snowflake-connector-python-3.5.0.tar.gz", hash = "sha256:654e4a1f68a491544bd8f7c5ab02eb8531df67c5f4309d5253bd204044f8a1b3"}, {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a365fa4f23be27a4a46d04f73a48ccb1ddad5b9558f100ba592a49571c90a33c"}, {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5b648b8f32aa540e9adf14e84ea5d77a6c3c6cbc3cbcf172622a0b8db0e99384"}, @@ -8181,15 +7262,63 @@ snowflake-connector-python = [ {file = "snowflake_connector_python-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee97a8ac0aaf40a7b7420c8936a66d8d33376cd40498ac3d38efa7bb5712d14a"}, {file = "snowflake_connector_python-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:e8cd747e2719ba44dd2ce0e9b1e6f8b03485b2b335a352f3b45138b56fad5888"}, ] -sortedcontainers = [ + +[package.dependencies] +asn1crypto = ">0.24.0,<2.0.0" +certifi = ">=2017.4.17" +cffi = ">=1.9,<2.0.0" +charset-normalizer = ">=2,<4" +cryptography = ">=3.1.0,<42.0.0" +filelock = ">=3.5,<4" +idna = ">=2.5,<4" +keyring = {version = "<16.1.0 || >16.1.0,<25.0.0", optional = true, markers = "extra == \"secure-local-storage\""} 
+packaging = "*" +pandas = {version = ">=1.0.0,<2.1.0", optional = true, markers = "extra == \"pandas\""} +platformdirs = ">=2.6.0,<4.0.0" +pyarrow = {version = "*", optional = true, markers = "extra == \"pandas\""} +pyjwt = "<3.0.0" +pyOpenSSL = ">=16.2.0,<24.0.0" +pytz = "*" +requests = "<3.0.0" +sortedcontainers = ">=2.4.0" +tomlkit = "*" +typing-extensions = ">=4.3,<5" +urllib3 = ">=1.21.1,<2.0.0" + +[package.extras] +development = ["Cython", "coverage", "more-itertools", "numpy (<1.27.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] +pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow"] +secure-local-storage = ["keyring (!=16.1.0,<25.0.0)"] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +description = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +optional = true +python-versions = "*" +files = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, ] -soupsieve = [ + +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = true +python-versions = ">=3.8" +files = [ {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, ] -sqlalchemy = [ + +[[package]] +name = "sqlalchemy" +version = "1.4.49" +description = "Database Abstraction Library" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ {file = "SQLAlchemy-1.4.49-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:2e126cf98b7fd38f1e33c64484406b78e937b1a280e078ef558b95bf5b6895f6"}, {file = "SQLAlchemy-1.4.49-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:03db81b89fe7ef3857b4a00b63dedd632d6183d4ea5a31c5d8a92e000a41fc71"}, {file = "SQLAlchemy-1.4.49-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:95b9df9afd680b7a3b13b38adf6e3a38995da5e162cc7524ef08e3be4e5ed3e1"}, @@ -8229,51 +7358,216 @@ sqlalchemy = [ {file = "SQLAlchemy-1.4.49-cp39-cp39-win_amd64.whl", hash = "sha256:bbdf16372859b8ed3f4d05f925a984771cd2abd18bd187042f24be4886c2a15f"}, {file = "SQLAlchemy-1.4.49.tar.gz", hash = "sha256:06ff25cbae30c396c4b7737464f2a7fc37a67b7da409993b182b024cec80aed9"}, ] -sqlalchemy-jsonfield = [ + +[package.dependencies] +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} + +[package.extras] +aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] +aiosqlite = ["aiosqlite", "greenlet (!=0.4.17)", "typing-extensions (!=3.10.0.1)"] +asyncio = ["greenlet (!=0.4.17)"] +asyncmy = ["asyncmy (>=0.2.3,!=0.2.4)", "greenlet (!=0.4.17)"] +mariadb-connector = ["mariadb (>=1.0.1,!=1.1.2)"] +mssql = ["pyodbc"] +mssql-pymssql = ["pymssql"] +mssql-pyodbc = ["pyodbc"] +mypy = ["mypy (>=0.910)", "sqlalchemy2-stubs"] +mysql = ["mysqlclient (>=1.4.0)", "mysqlclient (>=1.4.0,<2)"] +mysql-connector = ["mysql-connector-python"] +oracle = ["cx-oracle 
(>=7)", "cx-oracle (>=7,<8)"] +postgresql = ["psycopg2 (>=2.7)"] +postgresql-asyncpg = ["asyncpg", "greenlet (!=0.4.17)"] +postgresql-pg8000 = ["pg8000 (>=1.16.6,!=1.29.0)"] +postgresql-psycopg2binary = ["psycopg2-binary"] +postgresql-psycopg2cffi = ["psycopg2cffi"] +pymysql = ["pymysql", "pymysql (<1)"] +sqlcipher = ["sqlcipher3-binary"] + +[[package]] +name = "sqlalchemy-jsonfield" +version = "1.0.1.post0" +description = "SQLALchemy JSONField implementation for storing dicts at SQL" +optional = false +python-versions = ">=3.7.0" +files = [ {file = "SQLAlchemy-JSONField-1.0.1.post0.tar.gz", hash = "sha256:72a5e714fe0493d2660abd7484a9fd9f492f493a0856288dd22a5decb29f5dc4"}, {file = "SQLAlchemy_JSONField-1.0.1.post0-py3-none-any.whl", hash = "sha256:d6f1e5ee329a3c0d9d164e40d81a2143ac8332e09988fbbaff84179dac5503d4"}, ] -sqlalchemy-utils = [ + +[package.dependencies] +sqlalchemy = "*" + +[[package]] +name = "sqlalchemy-utils" +version = "0.41.1" +description = "Various utility functions for SQLAlchemy." +optional = false +python-versions = ">=3.6" +files = [ {file = "SQLAlchemy-Utils-0.41.1.tar.gz", hash = "sha256:a2181bff01eeb84479e38571d2c0718eb52042f9afd8c194d0d02877e84b7d74"}, {file = "SQLAlchemy_Utils-0.41.1-py3-none-any.whl", hash = "sha256:6c96b0768ea3f15c0dc56b363d386138c562752b84f647fb8d31a2223aaab801"}, ] -sqlfluff = [ + +[package.dependencies] +SQLAlchemy = ">=1.3" + +[package.extras] +arrow = ["arrow (>=0.3.4)"] +babel = ["Babel (>=1.3)"] +color = ["colour (>=0.0.4)"] +encrypted = ["cryptography (>=0.6)"] +intervals = ["intervals (>=0.7.1)"] +password = ["passlib (>=1.6,<2.0)"] +pendulum = ["pendulum (>=2.0.5)"] +phone = ["phonenumbers (>=5.9.2)"] +test = ["Jinja2 (>=2.3)", "Pygments (>=1.2)", "backports.zoneinfo", "docutils (>=0.10)", "flake8 (>=2.4.0)", "flexmock (>=0.9.7)", "isort (>=4.2.2)", "pg8000 (>=1.12.4)", "psycopg (>=3.1.8)", "psycopg2 (>=2.5.1)", "psycopg2cffi (>=2.8.1)", "pymysql", "pyodbc", "pytest (>=2.7.1)", "python-dateutil (>=2.6)", "pytz (>=2014.2)"] +test-all = ["Babel (>=1.3)", "Jinja2 (>=2.3)", "Pygments (>=1.2)", "arrow (>=0.3.4)", "backports.zoneinfo", "colour (>=0.0.4)", "cryptography (>=0.6)", "docutils (>=0.10)", "flake8 (>=2.4.0)", "flexmock (>=0.9.7)", "furl (>=0.4.1)", "intervals (>=0.7.1)", "isort (>=4.2.2)", "passlib (>=1.6,<2.0)", "pendulum (>=2.0.5)", "pg8000 (>=1.12.4)", "phonenumbers (>=5.9.2)", "psycopg (>=3.1.8)", "psycopg2 (>=2.5.1)", "psycopg2cffi (>=2.8.1)", "pymysql", "pyodbc", "pytest (>=2.7.1)", "python-dateutil", "python-dateutil (>=2.6)", "pytz (>=2014.2)"] +timezone = ["python-dateutil"] +url = ["furl (>=0.4.1)"] + +[[package]] +name = "sqlfluff" +version = "2.3.2" +description = "The SQL Linter for Humans" +optional = false +python-versions = ">=3.7" +files = [ {file = "sqlfluff-2.3.2-py3-none-any.whl", hash = "sha256:85c8b683e283ff632fe28529ddb60585ea2d1d3c614fc7a1db171632b99dcce3"}, {file = "sqlfluff-2.3.2.tar.gz", hash = "sha256:3403ce7e9133766d7336b7e26638657ec6cc9e5610e35186b7f02cc427dd49b7"}, ] -sqlparse = [ + +[package.dependencies] +appdirs = "*" +chardet = "*" +click = "*" +colorama = ">=0.3" +diff-cover = ">=2.5.0" +importlib-resources = {version = "*", markers = "python_version < \"3.9\""} +Jinja2 = "*" +pathspec = "*" +pytest = "*" +pyyaml = ">=5.1" +regex = "*" +tblib = "*" +toml = {version = "*", markers = "python_version < \"3.11\""} +tqdm = "*" +typing-extensions = "*" + +[[package]] +name = "sqlparse" +version = "0.4.4" +description = "A non-validating SQL parser." 
+optional = false +python-versions = ">=3.5" +files = [ {file = "sqlparse-0.4.4-py3-none-any.whl", hash = "sha256:5430a4fe2ac7d0f93e66f1efc6e1338a41884b7ddf2a350cedd20ccc4d9d28f3"}, {file = "sqlparse-0.4.4.tar.gz", hash = "sha256:d446183e84b8349fa3061f0fe7f06ca94ba65b426946ffebe6e3e8295332420c"}, ] -stevedore = [ + +[package.extras] +dev = ["build", "flake8"] +doc = ["sphinx"] +test = ["pytest", "pytest-cov"] + +[[package]] +name = "stevedore" +version = "5.1.0" +description = "Manage dynamic plugins for Python applications" +optional = false +python-versions = ">=3.8" +files = [ {file = "stevedore-5.1.0-py3-none-any.whl", hash = "sha256:8cc040628f3cea5d7128f2e76cf486b2251a4e543c7b938f58d9a377f6694a2d"}, {file = "stevedore-5.1.0.tar.gz", hash = "sha256:a54534acf9b89bc7ed264807013b505bf07f74dbe4bcfa37d32bd063870b087c"}, ] -sympy = [ + +[package.dependencies] +pbr = ">=2.0.0,<2.1.0 || >2.1.0" + +[[package]] +name = "sympy" +version = "1.12" +description = "Computer algebra system (CAS) in Python" +optional = true +python-versions = ">=3.8" +files = [ {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, ] -tabulate = [ + +[package.dependencies] +mpmath = ">=0.19" + +[[package]] +name = "tabulate" +version = "0.9.0" +description = "Pretty-print tabular data" +optional = false +python-versions = ">=3.7" +files = [ {file = "tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f"}, {file = "tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c"}, ] -tblib = [ + +[package.extras] +widechars = ["wcwidth"] + +[[package]] +name = "tblib" +version = "2.0.0" +description = "Traceback serialization library." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "tblib-2.0.0-py3-none-any.whl", hash = "sha256:9100bfa016b047d5b980d66e7efed952fbd20bd85b56110aaf473cb97d18709a"}, {file = "tblib-2.0.0.tar.gz", hash = "sha256:a6df30f272c08bf8be66e0775fad862005d950a6b8449b94f7c788731d70ecd7"}, ] -tenacity = [ + +[[package]] +name = "tenacity" +version = "8.2.3" +description = "Retry code until it succeeds" +optional = false +python-versions = ">=3.7" +files = [ {file = "tenacity-8.2.3-py3-none-any.whl", hash = "sha256:ce510e327a630c9e1beaf17d42e6ffacc88185044ad85cf74c0a8887c6a0f88c"}, {file = "tenacity-8.2.3.tar.gz", hash = "sha256:5398ef0d78e63f40007c1fb4c0bff96e1911394d2fa8d194f77619c05ff6cc8a"}, ] -termcolor = [ + +[package.extras] +doc = ["reno", "sphinx", "tornado (>=4.5)"] + +[[package]] +name = "termcolor" +version = "2.3.0" +description = "ANSI color formatting for output in terminal" +optional = false +python-versions = ">=3.7" +files = [ {file = "termcolor-2.3.0-py3-none-any.whl", hash = "sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475"}, {file = "termcolor-2.3.0.tar.gz", hash = "sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a"}, ] -text-unidecode = [ + +[package.extras] +tests = ["pytest", "pytest-cov"] + +[[package]] +name = "text-unidecode" +version = "1.3" +description = "The most basic Text::Unidecode port" +optional = false +python-versions = "*" +files = [ {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"}, {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, ] -tokenizers = [ + +[[package]] +name = "tokenizers" +version = "0.13.3" +description = "Fast and Customizable Tokenizers" +optional = true +python-versions = "*" +files = [ {file = "tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:f3835c5be51de8c0a092058a4d4380cb9244fb34681fd0a295fbf0a52a5fdf33"}, {file = "tokenizers-0.13.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4ef4c3e821730f2692489e926b184321e887f34fb8a6b80b8096b966ba663d07"}, {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5fd1a6a25353e9aa762e2aae5a1e63883cad9f4e997c447ec39d071020459bc"}, @@ -8315,122 +7609,360 @@ tokenizers = [ {file = "tokenizers-0.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:bc0a6f1ba036e482db6453571c9e3e60ecd5489980ffd95d11dc9f960483d783"}, {file = "tokenizers-0.13.3.tar.gz", hash = "sha256:2e546dbb68b623008a5442353137fbb0123d311a6d7ba52f2667c8862a75af2e"}, ] -toml = [ + +[package.extras] +dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] +docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] +testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] + +[[package]] +name = "toml" +version = "0.10.2" +description = "Python Library for Tom's Obvious, Minimal Language" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ {file = "toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b"}, {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -tomli = [ + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = 
"sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] -tomli-w = [ + +[[package]] +name = "tomli-w" +version = "1.0.0" +description = "A lil' TOML writer" +optional = false +python-versions = ">=3.7" +files = [ {file = "tomli_w-1.0.0-py3-none-any.whl", hash = "sha256:9f2a07e8be30a0729e533ec968016807069991ae2fd921a78d42f429ae5f4463"}, {file = "tomli_w-1.0.0.tar.gz", hash = "sha256:f463434305e0336248cac9c2dc8076b707d8a12d019dd349f5c1e382dd1ae1b9"}, ] -tomlkit = [ + +[[package]] +name = "tomlkit" +version = "0.12.1" +description = "Style preserving TOML library" +optional = false +python-versions = ">=3.7" +files = [ {file = "tomlkit-0.12.1-py3-none-any.whl", hash = "sha256:712cbd236609acc6a3e2e97253dfc52d4c2082982a88f61b640ecf0817eab899"}, {file = "tomlkit-0.12.1.tar.gz", hash = "sha256:38e1ff8edb991273ec9f6181244a6a391ac30e9f5098e7535640ea6be97a7c86"}, ] -tqdm = [ + +[[package]] +name = "tqdm" +version = "4.66.1" +description = "Fast, Extensible Progress Meter" +optional = false +python-versions = ">=3.7" +files = [ {file = "tqdm-4.66.1-py3-none-any.whl", hash = "sha256:d302b3c5b53d47bce91fea46679d9c3c6508cf6332229aa1e7d8653723793386"}, {file = "tqdm-4.66.1.tar.gz", hash = "sha256:d88e651f9db8d8551a62556d3cff9e3034274ca5d66e93197cf2490e2dcb69c7"}, ] -typeapi = [ + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout", "pytest-xdist"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + +[[package]] +name = "typeapi" +version = "2.1.1" +description = "" +optional = false +python-versions = ">=3.6.3,<4.0.0" +files = [ {file = "typeapi-2.1.1-py3-none-any.whl", hash = "sha256:ef41577f316bfd362572e727ba349dab80a7362318a80fc72e6a807017d04c5c"}, {file = "typeapi-2.1.1.tar.gz", hash = "sha256:49b3c1d3382e27dccbb59132a3a823c61954f679a0c61f119fd6d8470073a298"}, ] -types-awscrt = [ + +[package.dependencies] +typing-extensions = ">=3.0.0" + +[[package]] +name = "types-awscrt" +version = "0.19.1" +description = "Type annotations and code completion for awscrt" +optional = false +python-versions = ">=3.7,<4.0" +files = [ {file = "types_awscrt-0.19.1-py3-none-any.whl", hash = "sha256:68fffeb75396e9e7614cd930b2d52295f680230774750907bcafb56f11514043"}, {file = "types_awscrt-0.19.1.tar.gz", hash = "sha256:61833aa140e724a9098025610f4b8cde3dcf65b842631d7447378f9f5db4e1fd"}, ] -types-cachetools = [ + +[[package]] +name = "types-cachetools" +version = "5.3.0.6" +description = "Typing stubs for cachetools" +optional = false +python-versions = "*" +files = [ {file = "types-cachetools-5.3.0.6.tar.gz", hash = "sha256:595f0342d246c8ba534f5a762cf4c2f60ecb61e8002b8b2277fd5cf791d4e851"}, {file = "types_cachetools-5.3.0.6-py3-none-any.whl", hash = "sha256:f7f8a25bfe306f2e6bc2ad0a2f949d9e72f2d91036d509c36d3810bf728bc6e1"}, ] -types-click = [ + +[[package]] +name = "types-click" +version = "7.1.8" +description = "Typing stubs for click" +optional = false +python-versions = "*" +files = [ {file = "types-click-7.1.8.tar.gz", hash = "sha256:b6604968be6401dc516311ca50708a0a28baa7a0cb840efd7412f0dbbff4e092"}, {file = "types_click-7.1.8-py3-none-any.whl", hash = "sha256:8cb030a669e2e927461be9827375f83c16b8178c365852c060a34e24871e7e81"}, ] -types-deprecated = [ + +[[package]] +name = "types-deprecated" +version = "1.2.9.3" +description = "Typing stubs 
for Deprecated" +optional = false +python-versions = "*" +files = [ {file = "types-Deprecated-1.2.9.3.tar.gz", hash = "sha256:ef87327adf3e3c4a4c7d8e06e58f6476710d3466ecfb53c49efb080804a70ef3"}, {file = "types_Deprecated-1.2.9.3-py3-none-any.whl", hash = "sha256:24da9210763e5e1b3d0d4f6f8bba9ad3bb6af3fe7f6815fc37e3ede4681704f5"}, ] -types-protobuf = [ + +[[package]] +name = "types-protobuf" +version = "4.24.0.1" +description = "Typing stubs for protobuf" +optional = false +python-versions = "*" +files = [ {file = "types-protobuf-4.24.0.1.tar.gz", hash = "sha256:90adea3b693d6a40d8ef075c58fe6b5cc6e01fe1496301a7e6fc70398dcff92e"}, {file = "types_protobuf-4.24.0.1-py3-none-any.whl", hash = "sha256:df203a204e4ae97d4cca4c9cf725262579dd7857a19f9e7fc74871ccfa073c01"}, ] -types-psutil = [ + +[[package]] +name = "types-psutil" +version = "5.9.5.16" +description = "Typing stubs for psutil" +optional = false +python-versions = "*" +files = [ {file = "types-psutil-5.9.5.16.tar.gz", hash = "sha256:4e9b219efb625d3d04f6bf106934f87cab49aa41a94b0a3b3089403f47a79228"}, {file = "types_psutil-5.9.5.16-py3-none-any.whl", hash = "sha256:fec713104d5d143afea7b976cfa691ca1840f5d19e8714a5d02a96ebd061363e"}, ] -types-psycopg2 = [ + +[[package]] +name = "types-psycopg2" +version = "2.9.21.14" +description = "Typing stubs for psycopg2" +optional = false +python-versions = "*" +files = [ {file = "types-psycopg2-2.9.21.14.tar.gz", hash = "sha256:bf73a0ac4da4e278c89bf1b01fc596d5a5ac7a356cfe6ac0249f47b9e259f868"}, {file = "types_psycopg2-2.9.21.14-py3-none-any.whl", hash = "sha256:cd9c5350631f3bc6184ec8d48f2ed31d4ea660f89d0fffe78239450782f383c5"}, ] -types-python-dateutil = [ + +[[package]] +name = "types-python-dateutil" +version = "2.8.19.14" +description = "Typing stubs for python-dateutil" +optional = false +python-versions = "*" +files = [ {file = "types-python-dateutil-2.8.19.14.tar.gz", hash = "sha256:1f4f10ac98bb8b16ade9dbee3518d9ace017821d94b057a425b069f834737f4b"}, {file = "types_python_dateutil-2.8.19.14-py3-none-any.whl", hash = "sha256:f977b8de27787639986b4e28963263fd0e5158942b3ecef91b9335c130cb1ce9"}, ] -types-pyyaml = [ + +[[package]] +name = "types-pyyaml" +version = "6.0.12.11" +description = "Typing stubs for PyYAML" +optional = false +python-versions = "*" +files = [ {file = "types-PyYAML-6.0.12.11.tar.gz", hash = "sha256:7d340b19ca28cddfdba438ee638cd4084bde213e501a3978738543e27094775b"}, {file = "types_PyYAML-6.0.12.11-py3-none-any.whl", hash = "sha256:a461508f3096d1d5810ec5ab95d7eeecb651f3a15b71959999988942063bf01d"}, ] -types-requests = [ + +[[package]] +name = "types-requests" +version = "2.31.0.2" +description = "Typing stubs for requests" +optional = false +python-versions = "*" +files = [ {file = "types-requests-2.31.0.2.tar.gz", hash = "sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40"}, {file = "types_requests-2.31.0.2-py3-none-any.whl", hash = "sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a"}, ] -types-s3transfer = [ + +[package.dependencies] +types-urllib3 = "*" + +[[package]] +name = "types-s3transfer" +version = "0.6.2" +description = "Type annotations and code completion for s3transfer" +optional = false +python-versions = ">=3.7,<4.0" +files = [ {file = "types_s3transfer-0.6.2-py3-none-any.whl", hash = "sha256:1068877b6e59be5226fa3006ae64371ac9d5bc590dfdbd9c66fd0a075d3254ac"}, {file = "types_s3transfer-0.6.2.tar.gz", hash = "sha256:4ba9b483796fdcd026aa162ee03bdcedd2bf7d08e9387c820dcdd158b0102057"}, ] -types-setuptools = [ + +[[package]] +name = 
"types-setuptools" +version = "68.1.0.1" +description = "Typing stubs for setuptools" +optional = false +python-versions = "*" +files = [ {file = "types-setuptools-68.1.0.1.tar.gz", hash = "sha256:271ed8da44885cd9a701c86e48cc6d3cc988052260e72b3ce26c26b3028f86ed"}, {file = "types_setuptools-68.1.0.1-py3-none-any.whl", hash = "sha256:a9a0d2ca1da8a15924890d464adcee4004deb07b6a99bd0b1881eac5c73cb3a7"}, ] -types-simplejson = [ + +[[package]] +name = "types-simplejson" +version = "3.19.0.2" +description = "Typing stubs for simplejson" +optional = false +python-versions = "*" +files = [ {file = "types-simplejson-3.19.0.2.tar.gz", hash = "sha256:ebc81f886f89d99d6b80c726518aa2228bc77c26438f18fd81455e4f79f8ee1b"}, {file = "types_simplejson-3.19.0.2-py3-none-any.whl", hash = "sha256:8ba093dc7884f59b3e62aed217144085e675a269debc32678fd80e0b43b2b86f"}, ] -types-sqlalchemy = [ + +[[package]] +name = "types-sqlalchemy" +version = "1.4.53.38" +description = "Typing stubs for SQLAlchemy" +optional = false +python-versions = "*" +files = [ {file = "types-SQLAlchemy-1.4.53.38.tar.gz", hash = "sha256:5bb7463537e04e1aa5a3557eb725930df99226dcfd3c9bf93008025bfe5c169e"}, {file = "types_SQLAlchemy-1.4.53.38-py3-none-any.whl", hash = "sha256:7e60e74f823931cc9a9e8adb0a4c05e5533e6708b8a266807893a739faf4eaaa"}, ] -types-tqdm = [ + +[[package]] +name = "types-tqdm" +version = "4.66.0.2" +description = "Typing stubs for tqdm" +optional = false +python-versions = "*" +files = [ {file = "types-tqdm-4.66.0.2.tar.gz", hash = "sha256:9553a5e44c1d485fce19f505b8bd65c0c3e87e870678d1f2ed764ae59a55d45f"}, {file = "types_tqdm-4.66.0.2-py3-none-any.whl", hash = "sha256:13dddd38908834abdf0acdc2b70cab7ac4bcc5ad7356ced450471662e58a0ffc"}, ] -types-urllib3 = [ + +[[package]] +name = "types-urllib3" +version = "1.26.25.14" +description = "Typing stubs for urllib3" +optional = false +python-versions = "*" +files = [ {file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"}, {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"}, ] -typing-extensions = [ + +[[package]] +name = "typing-extensions" +version = "4.7.1" +description = "Backported and Experimental Type Hints for Python 3.7+" +optional = false +python-versions = ">=3.7" +files = [ {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] -tzdata = [ + +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, ] -uc-micro-py = [ + +[[package]] +name = "uc-micro-py" +version = "1.0.2" +description = "Micro subset of unicode data files for linkify-it-py projects." 
+optional = false +python-versions = ">=3.7" +files = [ {file = "uc-micro-py-1.0.2.tar.gz", hash = "sha256:30ae2ac9c49f39ac6dce743bd187fcd2b574b16ca095fa74cd9396795c954c54"}, {file = "uc_micro_py-1.0.2-py3-none-any.whl", hash = "sha256:8c9110c309db9d9e87302e2f4ad2c3152770930d88ab385cd544e7a7e75f3de0"}, ] -unicodecsv = [ + +[package.extras] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "unicodecsv" +version = "0.14.1" +description = "Python2's stdlib csv module is nice, but it doesn't support unicode. This module is a drop-in replacement which *does*." +optional = false +python-versions = "*" +files = [ {file = "unicodecsv-0.14.1.tar.gz", hash = "sha256:018c08037d48649a0412063ff4eda26eaa81eff1546dbffa51fa5293276ff7fc"}, ] -uritemplate = [ + +[[package]] +name = "uritemplate" +version = "4.1.1" +description = "Implementation of RFC 6570 URI Templates" +optional = false +python-versions = ">=3.6" +files = [ {file = "uritemplate-4.1.1-py2.py3-none-any.whl", hash = "sha256:830c08b8d99bdd312ea4ead05994a38e8936266f84b9a7878232db50b044e02e"}, {file = "uritemplate-4.1.1.tar.gz", hash = "sha256:4346edfc5c3b79f694bccd6d6099a322bbeb628dbf2cd86eea55a456ce5124f0"}, ] -urllib3 = [ + +[[package]] +name = "urllib3" +version = "1.26.16" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +files = [ {file = "urllib3-1.26.16-py2.py3-none-any.whl", hash = "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f"}, {file = "urllib3-1.26.16.tar.gz", hash = "sha256:8f135f6502756bde6b2a9b28989df5fbe87c9970cecaa69041edcce7f0589b14"}, ] -validators = [ + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] + +[[package]] +name = "validators" +version = "0.21.0" +description = "Python Data Validation for Humans™" +optional = true +python-versions = ">=3.8,<4.0" +files = [ {file = "validators-0.21.0-py3-none-any.whl", hash = "sha256:3470db6f2384c49727ee319afa2e97aec3f8fad736faa6067e0fd7f9eaf2c551"}, {file = "validators-0.21.0.tar.gz", hash = "sha256:245b98ab778ed9352a7269c6a8f6c2a839bed5b2a7e3e60273ce399d247dd4b3"}, ] -watchdog = [ + +[[package]] +name = "watchdog" +version = "3.0.0" +description = "Filesystem events monitoring" +optional = false +python-versions = ">=3.7" +files = [ {file = "watchdog-3.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:336adfc6f5cc4e037d52db31194f7581ff744b67382eb6021c868322e32eef41"}, {file = "watchdog-3.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a70a8dcde91be523c35b2bf96196edc5730edb347e374c7de7cd20c43ed95397"}, {file = "watchdog-3.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:adfdeab2da79ea2f76f87eb42a3ab1966a5313e5a69a0213a3cc06ef692b0e96"}, @@ -8459,23 +7991,79 @@ watchdog = [ {file = "watchdog-3.0.0-py3-none-win_ia64.whl", hash = "sha256:5d9f3a10e02d7371cd929b5d8f11e87d4bad890212ed3901f9b4d68767bee759"}, {file = "watchdog-3.0.0.tar.gz", hash = "sha256:4d98a320595da7a7c5a18fc48cb633c2e73cda78f93cac2ef42d42bf609a33f9"}, ] -wcwidth = [ + +[package.extras] +watchmedo = ["PyYAML (>=3.10)"] + +[[package]] +name = "wcwidth" +version = "0.2.6" +description = "Measures the displayed width of unicode strings in a terminal" +optional = false +python-versions = "*" +files = [ {file 
= "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"}, {file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"}, ] -weaviate-client = [ + +[[package]] +name = "weaviate-client" +version = "3.23.2" +description = "A python native Weaviate client" +optional = true +python-versions = ">=3.8" +files = [ {file = "weaviate-client-3.23.2.tar.gz", hash = "sha256:1c8c94df032dd2fa5a4ea615fc69ccb983ffad5cc02974f78c793839e61ac150"}, {file = "weaviate_client-3.23.2-py3-none-any.whl", hash = "sha256:88ffc38cca07806d64726cc74bc194c7da50b222aa4e2cd129f4c1f5e53e9b61"}, ] -werkzeug = [ + +[package.dependencies] +authlib = ">=1.1.0" +requests = ">=2.28.0,<=2.31.0" +tqdm = ">=4.59.0,<5.0.0" +validators = ">=0.18.2,<=0.21.0" + +[package.extras] +grpc = ["grpcio", "grpcio-tools"] + +[[package]] +name = "werkzeug" +version = "2.3.7" +description = "The comprehensive WSGI web application library." +optional = false +python-versions = ">=3.8" +files = [ {file = "werkzeug-2.3.7-py3-none-any.whl", hash = "sha256:effc12dba7f3bd72e605ce49807bbe692bd729c3bb122a3b91747a6ae77df528"}, {file = "werkzeug-2.3.7.tar.gz", hash = "sha256:2b8c0e447b4b9dbcc85dd97b6eeb4dcbaf6c8b6c3be0bd654e25553e0a2157d8"}, ] -wheel = [ + +[package.dependencies] +MarkupSafe = ">=2.1.1" + +[package.extras] +watchdog = ["watchdog (>=2.3)"] + +[[package]] +name = "wheel" +version = "0.41.2" +description = "A built-package format for Python" +optional = false +python-versions = ">=3.7" +files = [ {file = "wheel-0.41.2-py3-none-any.whl", hash = "sha256:75909db2664838d015e3d9139004ee16711748a52c8f336b52882266540215d8"}, {file = "wheel-0.41.2.tar.gz", hash = "sha256:0c5ac5ff2afb79ac23ab82bab027a0be7b5dbcf2e54dc50efe4bf507de1f7985"}, ] -wrapt = [ + +[package.extras] +test = ["pytest (>=6.0.0)", "setuptools (>=65)"] + +[[package]] +name = "wrapt" +version = "1.15.0" +description = "Module for decorators, wrappers and monkey patching." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" +files = [ {file = "wrapt-1.15.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:ca1cccf838cd28d5a0883b342474c630ac48cac5df0ee6eacc9c7290f76b11c1"}, {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:e826aadda3cae59295b95343db8f3d965fb31059da7de01ee8d1c40a60398b29"}, {file = "wrapt-1.15.0-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:5fc8e02f5984a55d2c653f5fea93531e9836abbd84342c1d1e17abc4a15084c2"}, @@ -8552,15 +8140,45 @@ wrapt = [ {file = "wrapt-1.15.0-py3-none-any.whl", hash = "sha256:64b1df0f83706b4ef4cfb4fb0e4c2669100fd7ecacfb59e091fad300d4e04640"}, {file = "wrapt-1.15.0.tar.gz", hash = "sha256:d06730c6aed78cee4126234cf2d071e01b44b915e725a6cb439a879ec9754a3a"}, ] -wtforms = [ + +[[package]] +name = "wtforms" +version = "3.0.1" +description = "Form validation and rendering for Python web development." +optional = false +python-versions = ">=3.7" +files = [ {file = "WTForms-3.0.1-py3-none-any.whl", hash = "sha256:837f2f0e0ca79481b92884962b914eba4e72b7a2daaf1f939c890ed0124b834b"}, {file = "WTForms-3.0.1.tar.gz", hash = "sha256:6b351bbb12dd58af57ffef05bc78425d08d1914e0fd68ee14143b7ade023c5bc"}, ] -yapf = [ + +[package.dependencies] +MarkupSafe = "*" + +[package.extras] +email = ["email-validator"] + +[[package]] +name = "yapf" +version = "0.33.0" +description = "A formatter for Python code." 
+optional = false +python-versions = "*" +files = [ {file = "yapf-0.33.0-py2.py3-none-any.whl", hash = "sha256:4c2b59bd5ffe46f3a7da48df87596877189148226ce267c16e8b44240e51578d"}, {file = "yapf-0.33.0.tar.gz", hash = "sha256:da62bdfea3df3673553351e6246abed26d9fe6780e548a5af9e70f6d2b4f5b9a"}, ] -yarl = [ + +[package.dependencies] +tomli = ">=2.0.1" + +[[package]] +name = "yarl" +version = "1.9.2" +description = "Yet another URL library" +optional = false +python-versions = ">=3.7" +files = [ {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8c2ad583743d16ddbdf6bb14b5cd76bf43b0d0006e918809d5d4ddf7bde8dd82"}, {file = "yarl-1.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:82aa6264b36c50acfb2424ad5ca537a2060ab6de158a5bd2a72a032cc75b9eb8"}, {file = "yarl-1.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c0c77533b5ed4bcc38e943178ccae29b9bcf48ffd1063f5821192f23a1bd27b9"}, @@ -8636,7 +8254,47 @@ yarl = [ {file = "yarl-1.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:61016e7d582bc46a5378ffdd02cd0314fb8ba52f40f9cf4d9a5e7dbef88dee18"}, {file = "yarl-1.9.2.tar.gz", hash = "sha256:04ab9d4b9f587c06d801c2abfe9317b77cdf996c65a90d5e84ecc45010823571"}, ] -zipp = [ + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + +[[package]] +name = "zipp" +version = "3.16.2" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.8" +files = [ {file = "zipp-3.16.2-py3-none-any.whl", hash = "sha256:679e51dd4403591b2d6838a48de3d283f3d188412a9782faadf845f298736ba0"}, {file = "zipp-3.16.2.tar.gz", hash = "sha256:ebc15946aa78bd63458992fc81ec3b6f7b1e92d51c35e6de1c3804e73b799147"}, ] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + +[extras] +athena = ["botocore", "pyarrow", "pyathena", "s3fs"] +az = ["adlfs"] +bigquery = ["gcsfs", "google-cloud-bigquery", "grpcio", "pyarrow"] +cli = ["cron-descriptor", "pipdeptree"] +dbt = ["dbt-athena-community", "dbt-bigquery", "dbt-core", "dbt-duckdb", "dbt-redshift", "dbt-snowflake"] +duckdb = ["duckdb"] +filesystem = ["botocore", "s3fs"] +gcp = ["gcsfs", "google-cloud-bigquery", "grpcio"] +gs = ["gcsfs"] +motherduck = ["duckdb", "pyarrow"] +mssql = ["pyodbc"] +parquet = ["pyarrow"] +postgres = ["psycopg2-binary", "psycopg2cffi"] +qdrant = ["qdrant-client"] +redshift = ["psycopg2-binary", "psycopg2cffi"] +s3 = ["botocore", "s3fs"] +snowflake = ["snowflake-connector-python"] +weaviate = ["weaviate-client"] + +[metadata] +lock-version = "2.0" +python-versions = ">=3.8.1,<3.13" +content-hash = "0bac5ade4c724f804899880ac258e6bb9212bb310de4c1664ae2a46838b0e837" From eba666e806920da7fa63ec8761ee915f035f3be1 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 22 Nov 2023 16:32:23 +0100 Subject: [PATCH 04/10] disable isort --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 9055170e8e..49982de2c6 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ dev: has-poetry lint: ./check-package.sh poetry run black ./ --diff --exclude=".*syntax_error.py" - poetry run isort ./ --diff + # poetry run isort ./ --diff poetry run mypy --config-file mypy.ini dlt tests poetry run flake8 
--max-line-length=200 dlt poetry run flake8 --max-line-length=200 tests --exclude tests/reflection/module_cases @@ -57,7 +57,7 @@ lint: format: poetry run black ./ --exclude=".*syntax_error.py" - poetry run isort ./ + # poetry run isort ./ test-and-lint-snippets: poetry run mypy --config-file mypy.ini docs/website docs/examples From 75373f7f26aa502af218fd832d1d30e9e0d681bd Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 22 Nov 2023 16:46:03 +0100 Subject: [PATCH 05/10] update flake8 config --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 7d16160004..9469001572 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [flake8] -ignore=E1,E2,E3,E4,F401,W391,W292,E501,E731,F811 +ignore=E1,E2,E3,E4,F401,W391,W292,E501,E731,F811,W503,E704, W504 banned-modules = datetime = use dlt.common.pendulum json = use dlt.common.json decimal = use dlt.common.decimal From c3ddbaa6e61c44a3809e625c802cb4c7632934a3 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 22 Nov 2023 16:46:37 +0100 Subject: [PATCH 06/10] format all files --- dlt/__init__.py | 10 +- dlt/cli/_dlt.py | 462 ++++++++++--- dlt/cli/config_toml_writer.py | 34 +- dlt/cli/deploy_command.py | 298 +++++--- dlt/cli/deploy_command_helpers.py | 132 +++- dlt/cli/echo.py | 2 +- dlt/cli/init_command.py | 260 +++++-- dlt/cli/pipeline_command.py | 127 +++- dlt/cli/pipeline_files.py | 80 ++- dlt/cli/requirements.py | 1 + dlt/cli/source_detection.py | 44 +- dlt/cli/telemetry_command.py | 4 +- dlt/cli/utils.py | 15 +- dlt/common/arithmetics.py | 15 +- dlt/common/configuration/__init__.py | 2 +- dlt/common/configuration/accessors.py | 12 +- dlt/common/configuration/container.py | 6 +- dlt/common/configuration/exceptions.py | 86 ++- dlt/common/configuration/inject.py | 38 +- dlt/common/configuration/paths.py | 11 +- .../configuration/providers/__init__.py | 10 +- dlt/common/configuration/providers/airflow.py | 3 +- dlt/common/configuration/providers/context.py | 5 +- .../configuration/providers/dictionary.py | 9 +- dlt/common/configuration/providers/environ.py | 6 +- .../configuration/providers/google_secrets.py | 26 +- .../configuration/providers/provider.py | 5 +- dlt/common/configuration/providers/toml.py | 48 +- dlt/common/configuration/resolve.py | 180 +++-- dlt/common/configuration/specs/__init__.py | 46 +- .../configuration/specs/api_credentials.py | 2 - .../configuration/specs/aws_credentials.py | 18 +- .../configuration/specs/azure_credentials.py | 12 +- .../configuration/specs/base_configuration.py | 86 ++- .../specs/config_providers_context.py | 51 +- .../specs/config_section_context.py | 38 +- .../specs/connection_string_credentials.py | 14 +- dlt/common/configuration/specs/exceptions.py | 32 +- .../configuration/specs/gcp_credentials.py | 57 +- .../configuration/specs/known_sections.py | 6 +- .../configuration/specs/run_configuration.py | 8 +- dlt/common/configuration/utils.py | 43 +- dlt/common/data_types/__init__.py | 4 +- dlt/common/data_types/type_helpers.py | 11 +- dlt/common/data_types/typing.py | 14 +- dlt/common/data_writers/__init__.py | 14 +- dlt/common/data_writers/buffered.py | 28 +- dlt/common/data_writers/escape.py | 30 +- dlt/common/data_writers/exceptions.py | 9 +- dlt/common/data_writers/writers.py | 77 ++- dlt/common/destination/__init__.py | 6 +- dlt/common/destination/capabilities.py | 13 +- dlt/common/destination/reference.py | 156 +++-- dlt/common/exceptions.py | 82 ++- dlt/common/git.py | 42 +- dlt/common/json/__init__.py | 79 +-- dlt/common/json/_orjson.py | 14 +- 
dlt/common/json/_simplejson.py | 27 +- dlt/common/libs/pyarrow.py | 66 +- dlt/common/libs/pydantic.py | 159 ++++- dlt/common/normalizers/__init__.py | 6 +- dlt/common/normalizers/configuration.py | 4 +- dlt/common/normalizers/exceptions.py | 5 +- dlt/common/normalizers/json/__init__.py | 7 +- dlt/common/normalizers/json/relational.py | 89 ++- dlt/common/normalizers/naming/__init__.py | 5 +- dlt/common/normalizers/naming/direct.py | 2 +- dlt/common/normalizers/naming/duck_case.py | 7 +- dlt/common/normalizers/naming/exceptions.py | 6 +- dlt/common/normalizers/naming/naming.py | 29 +- dlt/common/normalizers/naming/snake_case.py | 11 +- dlt/common/normalizers/typing.py | 2 +- dlt/common/normalizers/utils.py | 27 +- dlt/common/pendulum.py | 2 +- dlt/common/pipeline.py | 99 ++- dlt/common/reflection/function_visitor.py | 1 + dlt/common/reflection/spec.py | 13 +- dlt/common/reflection/utils.py | 16 +- dlt/common/runners/__init__.py | 10 +- dlt/common/runners/configuration.py | 8 +- dlt/common/runners/pool_runner.py | 16 +- dlt/common/runners/runnable.py | 6 +- dlt/common/runners/stdout.py | 14 +- dlt/common/runners/synth_pickle.py | 3 +- dlt/common/runners/venv.py | 12 +- dlt/common/runtime/collector.py | 103 ++- dlt/common/runtime/exec_info.py | 16 +- dlt/common/runtime/json_logging.py | 99 ++- dlt/common/runtime/logger.py | 18 +- dlt/common/runtime/prometheus.py | 4 +- dlt/common/runtime/segment.py | 37 +- dlt/common/runtime/sentry.py | 17 +- dlt/common/runtime/slack.py | 11 +- dlt/common/runtime/telemetry.py | 13 +- dlt/common/schema/__init__.py | 29 +- dlt/common/schema/detections.py | 2 +- dlt/common/schema/exceptions.py | 66 +- dlt/common/schema/schema.py | 289 ++++++-- dlt/common/schema/typing.py | 77 ++- dlt/common/schema/utils.py | 317 +++++---- dlt/common/source.py | 3 +- dlt/common/storages/__init__.py | 17 +- dlt/common/storages/configuration.py | 73 +- dlt/common/storages/data_item_storage.py | 22 +- dlt/common/storages/exceptions.py | 52 +- dlt/common/storages/file_storage.py | 52 +- dlt/common/storages/fsspec_filesystem.py | 67 +- dlt/common/storages/live_schema_storage.py | 5 +- dlt/common/storages/load_storage.py | 218 ++++-- dlt/common/storages/normalize_storage.py | 16 +- dlt/common/storages/schema_storage.py | 57 +- dlt/common/storages/transactional_file.py | 16 +- dlt/common/storages/versioned_storage.py | 25 +- dlt/common/time.py | 15 +- dlt/common/typing.py | 58 +- dlt/common/utils.py | 73 +- dlt/common/validation.py | 93 ++- dlt/common/wei.py | 11 +- dlt/destinations/exceptions.py | 59 +- dlt/destinations/impl/athena/athena.py | 149 ++-- dlt/destinations/impl/athena/configuration.py | 2 +- dlt/destinations/impl/athena/factory.py | 1 - dlt/destinations/impl/bigquery/bigquery.py | 143 ++-- .../impl/bigquery/configuration.py | 16 +- dlt/destinations/impl/bigquery/factory.py | 7 +- dlt/destinations/impl/bigquery/sql_client.py | 78 ++- dlt/destinations/impl/duckdb/configuration.py | 25 +- dlt/destinations/impl/duckdb/duck.py | 53 +- dlt/destinations/impl/duckdb/factory.py | 5 +- dlt/destinations/impl/duckdb/sql_client.py | 43 +- dlt/destinations/impl/dummy/__init__.py | 8 +- dlt/destinations/impl/dummy/configuration.py | 10 +- dlt/destinations/impl/dummy/dummy.py | 40 +- dlt/destinations/impl/dummy/factory.py | 6 +- .../impl/filesystem/configuration.py | 13 +- dlt/destinations/impl/filesystem/factory.py | 1 - .../impl/filesystem/filesystem.py | 87 ++- .../impl/motherduck/configuration.py | 18 +- dlt/destinations/impl/motherduck/factory.py | 10 +- 
.../impl/motherduck/motherduck.py | 6 +- .../impl/motherduck/sql_client.py | 24 +- dlt/destinations/impl/mssql/__init__.py | 2 +- dlt/destinations/impl/mssql/configuration.py | 8 +- dlt/destinations/impl/mssql/factory.py | 1 - dlt/destinations/impl/mssql/mssql.py | 78 ++- dlt/destinations/impl/mssql/sql_client.py | 41 +- dlt/destinations/impl/postgres/__init__.py | 5 +- .../impl/postgres/configuration.py | 6 +- dlt/destinations/impl/postgres/factory.py | 6 +- dlt/destinations/impl/postgres/postgres.py | 52 +- dlt/destinations/impl/postgres/sql_client.py | 52 +- dlt/destinations/impl/qdrant/configuration.py | 5 +- dlt/destinations/impl/qdrant/factory.py | 1 - .../impl/qdrant/qdrant_adapter.py | 7 +- dlt/destinations/impl/qdrant/qdrant_client.py | 196 ++++-- .../impl/redshift/configuration.py | 5 +- dlt/destinations/impl/redshift/factory.py | 11 +- dlt/destinations/impl/redshift/redshift.py | 118 +++- dlt/destinations/impl/snowflake/__init__.py | 1 - .../impl/snowflake/configuration.py | 41 +- dlt/destinations/impl/snowflake/factory.py | 13 +- dlt/destinations/impl/snowflake/snowflake.py | 138 ++-- dlt/destinations/impl/snowflake/sql_client.py | 34 +- dlt/destinations/impl/weaviate/ci_naming.py | 1 + .../impl/weaviate/configuration.py | 16 +- dlt/destinations/impl/weaviate/exceptions.py | 10 +- dlt/destinations/impl/weaviate/factory.py | 11 +- dlt/destinations/impl/weaviate/naming.py | 15 +- .../impl/weaviate/weaviate_adapter.py | 8 +- .../impl/weaviate/weaviate_client.py | 123 ++-- dlt/destinations/insert_job_client.py | 5 +- dlt/destinations/job_client_impl.py | 239 +++++-- dlt/destinations/job_impl.py | 11 +- dlt/destinations/path_utils.py | 38 +- dlt/destinations/sql_client.py | 51 +- dlt/destinations/sql_jobs.py | 167 ++++- dlt/destinations/type_mapping.py | 42 +- dlt/destinations/typing.py | 25 +- dlt/extract/__init__.py | 12 +- dlt/extract/decorators.py | 184 +++-- dlt/extract/exceptions.py | 229 +++++-- dlt/extract/extract.py | 32 +- dlt/extract/extractors.py | 106 ++- dlt/extract/hints.py | 78 ++- dlt/extract/incremental/__init__.py | 175 +++-- dlt/extract/incremental/exceptions.py | 13 +- dlt/extract/incremental/transform.py | 104 ++- dlt/extract/incremental/typing.py | 1 + dlt/extract/pipe.py | 178 ++++- dlt/extract/resource.py | 137 +++- dlt/extract/source.py | 91 ++- dlt/extract/storage.py | 27 +- dlt/extract/typing.py | 19 +- dlt/extract/utils.py | 39 +- dlt/extract/validation.py | 47 +- dlt/extract/wrappers.py | 2 +- dlt/helpers/airflow_helper.py | 65 +- dlt/helpers/dbt/__init__.py | 23 +- dlt/helpers/dbt/configuration.py | 4 +- dlt/helpers/dbt/dbt_utils.py | 50 +- dlt/helpers/dbt/exceptions.py | 4 +- dlt/helpers/dbt/runner.py | 87 ++- dlt/helpers/dbt_cloud/client.py | 14 +- dlt/helpers/streamlit_helper.py | 80 ++- dlt/load/configuration.py | 6 +- dlt/load/exceptions.py | 24 +- dlt/load/load.py | 280 ++++++-- dlt/normalize/__init__.py | 2 +- dlt/normalize/configuration.py | 24 +- dlt/normalize/exceptions.py | 1 + dlt/normalize/items_normalizers.py | 129 ++-- dlt/normalize/normalize.py | 147 +++- dlt/pipeline/__init__.py | 34 +- dlt/pipeline/dbt.py | 32 +- dlt/pipeline/deprecations.py | 5 +- dlt/pipeline/exceptions.py | 68 +- dlt/pipeline/helpers.py | 79 ++- dlt/pipeline/mark.py | 2 +- dlt/pipeline/pipeline.py | 529 +++++++++++---- dlt/pipeline/progress.py | 8 +- dlt/pipeline/state_sync.py | 50 +- dlt/pipeline/trace.py | 75 ++- dlt/pipeline/track.py | 15 +- dlt/reflection/names.py | 4 +- dlt/reflection/script_inspector.py | 40 +- dlt/reflection/script_visitor.py | 9 +- 
dlt/sources/config.py | 2 +- dlt/sources/credentials.py | 7 +- dlt/sources/filesystem.py | 7 +- dlt/sources/helpers/requests/__init__.py | 30 +- dlt/sources/helpers/requests/retry.py | 50 +- dlt/sources/helpers/requests/session.py | 21 +- dlt/sources/helpers/transform.py | 4 + docs/examples/archive/_helpers.py | 10 +- docs/examples/archive/credentials/explicit.py | 14 +- docs/examples/archive/dbt_run_jaffle.py | 13 +- docs/examples/archive/discord_iterator.py | 1 - docs/examples/archive/google_sheets.py | 4 +- docs/examples/archive/quickstart.py | 39 +- docs/examples/archive/rasa_example.py | 8 +- docs/examples/archive/read_table.py | 8 +- docs/examples/archive/restore_pipeline.py | 2 +- docs/examples/archive/singer_tap_example.py | 13 +- .../archive/singer_tap_jsonl_example.py | 4 +- .../examples/archive/sources/google_sheets.py | 44 +- docs/examples/archive/sources/jsonl.py | 11 +- .../examples/archive/sources/rasa/__init__.py | 2 +- docs/examples/archive/sources/rasa/rasa.py | 8 +- docs/examples/archive/sources/singer_tap.py | 31 +- docs/examples/archive/sources/sql_query.py | 49 +- docs/examples/chess/chess.py | 21 +- docs/examples/chess/chess_dbt.py | 1 - docs/examples/chess_production/chess.py | 28 +- docs/examples/incremental_loading/zendesk.py | 8 +- docs/examples/nested_data/nested_data.py | 14 +- docs/examples/transformers/pokemon.py | 8 +- docs/website/docs/conftest.py | 40 +- .../transformations/dbt/dbt-snippets.py | 7 +- .../chess_production/code/chess-snippets.py | 16 +- .../code/load_arrow-snippets.py | 18 +- .../code/zendesk-snippets.py | 16 +- .../nested_data/code/nested_data-snippets.py | 18 +- .../transformers/code/pokemon-snippets.py | 10 +- docs/website/docs/getting-started-snippets.py | 106 ++- docs/website/docs/intro-snippets.py | 13 +- .../performance-snippets.py | 14 +- docs/website/docs/utils.py | 7 +- docs/website/pydoc_markdown_dlt.py | 2 +- tests/cases.py | 252 +++---- .../cases/deploy_pipeline/debug_pipeline.py | 13 +- .../cases/deploy_pipeline/dummy_pipeline.py | 8 +- tests/cli/common/test_cli_invoke.py | 91 ++- tests/cli/common/test_telemetry_command.py | 30 +- tests/cli/conftest.py | 2 +- tests/cli/test_config_toml_writer.py | 57 +- tests/cli/test_deploy_command.py | 85 ++- tests/cli/test_init_command.py | 153 ++++- tests/cli/test_pipeline_command.py | 37 +- tests/cli/utils.py | 8 +- tests/common/cases/modules/uniq_mod_121.py | 2 + tests/common/configuration/test_accessors.py | 84 ++- .../configuration/test_configuration.py | 432 ++++++++---- tests/common/configuration/test_container.py | 11 +- .../common/configuration/test_credentials.py | 47 +- .../configuration/test_environ_provider.py | 33 +- tests/common/configuration/test_inject.py | 57 +- tests/common/configuration/test_providers.py | 1 + tests/common/configuration/test_sections.py | 59 +- tests/common/configuration/test_spec_union.py | 56 +- .../configuration/test_toml_provider.py | 138 +++- tests/common/configuration/utils.py | 60 +- .../data_writers/test_buffered_writer.py | 25 +- .../common/data_writers/test_data_writers.py | 54 +- tests/common/data_writers/utils.py | 22 +- .../common/normalizers/custom_normalizers.py | 7 +- .../normalizers/test_import_normalizers.py | 23 +- .../normalizers/test_json_relational.py | 448 ++++++------ tests/common/normalizers/test_naming.py | 103 ++- .../normalizers/test_naming_duck_case.py | 7 +- .../normalizers/test_naming_snake_case.py | 10 +- tests/common/reflection/test_reflect_spec.py | 169 ++++- tests/common/runners/test_pipes.py | 33 +- 
tests/common/runners/test_runnable.py | 14 +- tests/common/runners/test_runners.py | 20 +- tests/common/runners/test_venv.py | 4 +- tests/common/runners/utils.py | 10 +- tests/common/runtime/test_collector.py | 2 +- tests/common/runtime/test_logging.py | 30 +- tests/common/runtime/test_signals.py | 1 - tests/common/runtime/test_telemetry.py | 20 +- tests/common/schema/test_coercion.py | 111 ++- tests/common/schema/test_detections.py | 24 +- tests/common/schema/test_filtering.py | 34 +- tests/common/schema/test_inference.py | 81 ++- tests/common/schema/test_merges.py | 123 ++-- tests/common/schema/test_schema.py | 320 ++++++--- tests/common/schema/test_schema_contract.py | 240 ++++--- tests/common/schema/test_versioning.py | 4 +- tests/common/scripts/args.py | 2 +- tests/common/scripts/counter.py | 2 +- tests/common/scripts/cwd.py | 2 +- tests/common/scripts/long_lines.py | 2 +- tests/common/scripts/long_lines_fails.py | 2 +- tests/common/scripts/no_stdout_exception.py | 2 +- .../scripts/no_stdout_no_stderr_with_fail.py | 2 +- tests/common/scripts/raising_counter.py | 2 +- .../common/scripts/stdout_encode_exception.py | 2 +- tests/common/scripts/stdout_encode_result.py | 1 + tests/common/storages/test_file_storage.py | 39 +- tests/common/storages/test_loader_storage.py | 61 +- .../common/storages/test_local_filesystem.py | 6 +- .../common/storages/test_normalize_storage.py | 13 +- tests/common/storages/test_schema_storage.py | 74 +- .../storages/test_transactional_file.py | 10 +- .../common/storages/test_versioned_storage.py | 6 +- tests/common/storages/utils.py | 30 +- tests/common/test_arithmetics.py | 3 +- tests/common/test_destination.py | 71 +- tests/common/test_git.py | 17 +- tests/common/test_json.py | 49 +- tests/common/test_pipeline_state.py | 6 +- tests/common/test_time.py | 15 +- tests/common/test_typing.py | 43 +- tests/common/test_utils.py | 149 ++-- tests/common/test_validation.py | 40 +- tests/common/test_wei.py | 23 +- tests/common/utils.py | 18 +- tests/conftest.py | 50 +- tests/destinations/test_path_utils.py | 30 +- tests/extract/cases/eth_source/source.py | 1 + .../section_source/external_resources.py | 16 +- .../cases/section_source/named_module.py | 1 + tests/extract/conftest.py | 8 +- tests/extract/test_decorators.py | 170 +++-- tests/extract/test_extract.py | 22 +- tests/extract/test_extract_pipe.py | 97 +-- tests/extract/test_incremental.py | 636 +++++++++++------- tests/extract/test_sources.py | 266 +++++--- tests/extract/test_utils.py | 8 +- tests/extract/test_validation.py | 36 +- tests/extract/utils.py | 19 +- tests/helpers/airflow_tests/conftest.py | 2 +- .../airflow_tests/test_airflow_provider.py | 79 +-- .../airflow_tests/test_airflow_wrapper.py | 368 ++++++---- .../test_join_airflow_scheduler.py | 162 +++-- tests/helpers/airflow_tests/utils.py | 8 +- .../helpers/dbt_cloud_tests/test_dbt_cloud.py | 4 +- .../helpers/dbt_tests/local/test_dbt_utils.py | 71 +- .../local/test_runner_destinations.py | 80 ++- tests/helpers/dbt_tests/local/utils.py | 16 +- .../dbt_tests/test_runner_dbt_versions.py | 133 ++-- tests/helpers/dbt_tests/utils.py | 31 +- .../providers/test_google_secrets_provider.py | 64 +- .../test_streamlit_show_resources.py | 4 +- tests/libs/test_parquet_writer.py | 69 +- tests/libs/test_pyarrow.py | 36 +- tests/libs/test_pydantic.py | 164 +++-- .../athena_iceberg/test_athena_iceberg.py | 25 +- tests/load/bigquery/test_bigquery_client.py | 117 +++- .../bigquery/test_bigquery_table_builder.py | 3 +- tests/load/conftest.py | 18 +- 
tests/load/duckdb/test_duckdb_client.py | 39 +- .../load/duckdb/test_duckdb_table_builder.py | 7 +- tests/load/duckdb/test_motherduck_client.py | 10 +- tests/load/filesystem/test_aws_credentials.py | 34 +- .../load/filesystem/test_azure_credentials.py | 59 +- .../load/filesystem/test_filesystem_client.py | 88 ++- .../load/filesystem/test_filesystem_common.py | 32 +- tests/load/filesystem/utils.py | 13 +- tests/load/mssql/test_mssql_credentials.py | 21 +- tests/load/mssql/test_mssql_table_builder.py | 6 +- tests/load/pipeline/conftest.py | 7 +- tests/load/pipeline/test_arrow_loading.py | 61 +- tests/load/pipeline/test_athena.py | 113 +++- tests/load/pipeline/test_dbt_helper.py | 60 +- tests/load/pipeline/test_drop.py | 175 +++-- tests/load/pipeline/test_duckdb.py | 30 +- .../load/pipeline/test_filesystem_pipeline.py | 50 +- tests/load/pipeline/test_merge_disposition.py | 190 ++++-- tests/load/pipeline/test_pipelines.py | 350 +++++++--- tests/load/pipeline/test_redshift.py | 13 +- .../load/pipeline/test_replace_disposition.py | 233 ++++--- tests/load/pipeline/test_restore_state.py | 194 ++++-- tests/load/pipeline/test_stage_loading.py | 127 +++- .../test_write_disposition_changes.py | 124 ++-- tests/load/pipeline/utils.py | 59 +- tests/load/postgres/test_postgres_client.py | 52 +- .../postgres/test_postgres_table_builder.py | 22 +- tests/load/qdrant/test_pipeline.py | 47 +- tests/load/qdrant/utils.py | 10 +- tests/load/redshift/test_redshift_client.py | 33 +- .../redshift/test_redshift_table_builder.py | 31 +- .../snowflake/test_snowflake_configuration.py | 86 ++- .../snowflake/test_snowflake_table_builder.py | 11 +- tests/load/test_dummy_client.py | 207 +++--- tests/load/test_insert_job_client.py | 133 +++- tests/load/test_job_client.py | 285 +++++--- tests/load/test_sql_client.py | 258 +++++-- tests/load/utils.py | 347 +++++++--- tests/load/weaviate/test_naming.py | 7 +- tests/load/weaviate/test_pipeline.py | 52 +- tests/load/weaviate/test_weaviate_client.py | 107 ++- tests/normalize/mock_rasa_json_normalizer.py | 15 +- tests/normalize/test_normalize.py | 283 ++++++-- .../cases/github_pipeline/github_extract.py | 4 +- .../cases/github_pipeline/github_pipeline.py | 17 +- tests/pipeline/conftest.py | 10 +- tests/pipeline/test_arrow_sources.py | 85 ++- tests/pipeline/test_dlt_versions.py | 63 +- tests/pipeline/test_pipeline.py | 294 +++++--- tests/pipeline/test_pipeline_extra.py | 72 +- .../test_pipeline_file_format_resolver.py | 15 +- tests/pipeline/test_pipeline_state.py | 133 +++- tests/pipeline/test_pipeline_trace.py | 117 ++-- tests/pipeline/test_schema_contracts.py | 329 ++++----- tests/pipeline/test_schema_updates.py | 37 +- tests/pipeline/utils.py | 34 +- tests/reflection/module_cases/__init__.py | 2 +- tests/reflection/module_cases/all_imports.py | 2 +- .../module_cases/executes_resource.py | 3 +- .../reflection/module_cases/import_as_type.py | 2 + tests/reflection/module_cases/no_pkg.py | 2 +- tests/reflection/module_cases/raises.py | 2 +- .../module_cases/stripe_analytics/__init__.py | 2 +- .../stripe_analytics/stripe_analytics.py | 2 +- .../module_cases/stripe_analytics_pipeline.py | 2 +- tests/reflection/test_script_inspector.py | 14 +- tests/sources/helpers/test_requests.py | 107 +-- tests/tools/clean_redshift.py | 2 +- tests/tools/create_storages.py | 9 +- tests/utils.py | 36 +- 451 files changed, 16939 insertions(+), 7726 deletions(-) diff --git a/dlt/__init__.py b/dlt/__init__.py index 728343bdd6..e2a6b1a3a7 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -29,7 +29,15 
@@ from dlt import sources from dlt.extract.decorators import source, resource, transformer, defer -from dlt.pipeline import pipeline as _pipeline, run, attach, Pipeline, dbt, current as _current, mark as _mark +from dlt.pipeline import ( + pipeline as _pipeline, + run, + attach, + Pipeline, + dbt, + current as _current, + mark as _mark, +) from dlt.pipeline import progress from dlt import destinations diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index dfda2966b9..158a4a4b1c 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -14,13 +14,28 @@ from dlt.cli import utils from dlt.pipeline.exceptions import CannotRestorePipelineException -from dlt.cli.init_command import init_command, list_verified_sources_command, DLT_INIT_DOCS_URL, DEFAULT_VERIFIED_SOURCES_REPO +from dlt.cli.init_command import ( + init_command, + list_verified_sources_command, + DLT_INIT_DOCS_URL, + DEFAULT_VERIFIED_SOURCES_REPO, +) from dlt.cli.pipeline_command import pipeline_command, DLT_PIPELINE_COMMAND_DOCS_URL -from dlt.cli.telemetry_command import DLT_TELEMETRY_DOCS_URL, change_telemetry_status_command, telemetry_status_command +from dlt.cli.telemetry_command import ( + DLT_TELEMETRY_DOCS_URL, + change_telemetry_status_command, + telemetry_status_command, +) try: from dlt.cli import deploy_command - from dlt.cli.deploy_command import PipelineWasNotRun, DLT_DEPLOY_DOCS_URL, DeploymentMethods, COMMAND_DEPLOY_REPO_LOCATION, SecretFormats + from dlt.cli.deploy_command import ( + PipelineWasNotRun, + DLT_DEPLOY_DOCS_URL, + DeploymentMethods, + COMMAND_DEPLOY_REPO_LOCATION, + SecretFormats, + ) except ModuleNotFoundError: pass @@ -36,7 +51,13 @@ def on_exception(ex: Exception, info: str) -> None: @utils.track_command("init", False, "source_name", "destination_name") -def init_command_wrapper(source_name: str, destination_name: str, use_generic_template: bool, repo_location: str, branch: str) -> int: +def init_command_wrapper( + source_name: str, + destination_name: str, + use_generic_template: bool, + repo_location: str, + branch: str, +) -> int: try: init_command(source_name, destination_name, use_generic_template, repo_location, branch) except Exception as ex: @@ -56,7 +77,12 @@ def list_verified_sources_command_wrapper(repo_location: str, branch: str) -> in @utils.track_command("deploy", False, "deployment_method") -def deploy_command_wrapper(pipeline_script_path: str, deployment_method: str, repo_location: str, branch: Optional[str] = None, **kwargs: Any +def deploy_command_wrapper( + pipeline_script_path: str, + deployment_method: str, + repo_location: str, + branch: Optional[str] = None, + **kwargs: Any, ) -> int: try: utils.ensure_git_command("deploy") @@ -65,35 +91,41 @@ def deploy_command_wrapper(pipeline_script_path: str, deployment_method: str, re return -1 from git import InvalidGitRepositoryError, NoSuchPathError + try: deploy_command.deploy_command( pipeline_script_path=pipeline_script_path, deployment_method=deployment_method, repo_location=repo_location, branch=branch, - **kwargs + **kwargs, ) except (CannotRestorePipelineException, PipelineWasNotRun) as ex: - fmt.note("You must run the pipeline locally successfully at least once in order to deploy it.") + fmt.note( + "You must run the pipeline locally successfully at least once in order to deploy it." + ) on_exception(ex, DLT_DEPLOY_DOCS_URL) return -2 except InvalidGitRepositoryError: click.secho( "No git repository found for pipeline script %s." 
% fmt.bold(pipeline_script_path), err=True, - fg="red" + fg="red", ) fmt.note("If you do not have a repository yet, you can do either of:") - fmt.note("- Run the following command to initialize new repository: %s" % fmt.bold("git init")) - fmt.note("- Add your local code to Github as described here: %s" % fmt.bold("https://docs.github.com/en/get-started/importing-your-projects-to-github/importing-source-code-to-github/adding-locally-hosted-code-to-github")) + fmt.note( + "- Run the following command to initialize new repository: %s" % fmt.bold("git init") + ) + fmt.note( + "- Add your local code to Github as described here: %s" + % fmt.bold( + "https://docs.github.com/en/get-started/importing-your-projects-to-github/importing-source-code-to-github/adding-locally-hosted-code-to-github" + ) + ) fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_DEPLOY_DOCS_URL)) return -3 except NoSuchPathError as path_ex: - click.secho( - "The pipeline script does not exist\n%s" % str(path_ex), - err=True, - fg="red" - ) + click.secho("The pipeline script does not exist\n%s" % str(path_ex), err=True, fg="red") return -4 except Exception as ex: on_exception(ex, DLT_DEPLOY_DOCS_URL) @@ -103,14 +135,17 @@ def deploy_command_wrapper(pipeline_script_path: str, deployment_method: str, re @utils.track_command("pipeline", True, "operation") def pipeline_command_wrapper( - operation: str, pipeline_name: str, pipelines_dir: str, verbosity: int, **command_kwargs: Any + operation: str, pipeline_name: str, pipelines_dir: str, verbosity: int, **command_kwargs: Any ) -> int: try: pipeline_command(operation, pipeline_name, pipelines_dir, verbosity, **command_kwargs) return 0 except CannotRestorePipelineException as ex: click.secho(str(ex), err=True, fg="red") - click.secho("Try command %s to restore the pipeline state from destination" % fmt.bold(f"dlt pipeline {pipeline_name} sync")) + click.secho( + "Try command %s to restore the pipeline state from destination" + % fmt.bold(f"dlt pipeline {pipeline_name} sync") + ) return -1 except Exception as ex: on_exception(ex, DLT_PIPELINE_COMMAND_DOCS_URL) @@ -155,21 +190,31 @@ def telemetry_change_status_command_wrapper(enabled: bool) -> int: ACTION_EXECUTED = False + def print_help(parser: argparse.ArgumentParser) -> None: if not ACTION_EXECUTED: parser.print_help() class TelemetryAction(argparse.Action): - def __init__(self, option_strings: Sequence[str], dest: Any = argparse.SUPPRESS, default: Any = argparse.SUPPRESS, help: str = None) -> None: # noqa + def __init__( + self, + option_strings: Sequence[str], + dest: Any = argparse.SUPPRESS, + default: Any = argparse.SUPPRESS, + help: str = None, # noqa + ) -> None: super(TelemetryAction, self).__init__( - option_strings=option_strings, - dest=dest, - default=default, - nargs=0, - help=help + option_strings=option_strings, dest=dest, default=default, nargs=0, help=help ) - def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Any, option_string: str = None) -> None: + + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: Any, + option_string: str = None, + ) -> None: global ACTION_EXECUTED ACTION_EXECUTED = True @@ -177,129 +222,347 @@ def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespac class NonInteractiveAction(argparse.Action): - def __init__(self, option_strings: Sequence[str], dest: Any = argparse.SUPPRESS, default: Any = argparse.SUPPRESS, help: str = None) -> None: # noqa + def __init__( + 
self, + option_strings: Sequence[str], + dest: Any = argparse.SUPPRESS, + default: Any = argparse.SUPPRESS, + help: str = None, # noqa + ) -> None: super(NonInteractiveAction, self).__init__( - option_strings=option_strings, - dest=dest, - default=default, - nargs=0, - help=help + option_strings=option_strings, dest=dest, default=default, nargs=0, help=help ) - def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Any, option_string: str = None) -> None: + + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: Any, + option_string: str = None, + ) -> None: fmt.ALWAYS_CHOOSE_DEFAULT = True class DebugAction(argparse.Action): - def __init__(self, option_strings: Sequence[str], dest: Any = argparse.SUPPRESS, default: Any = argparse.SUPPRESS, help: str = None) -> None: # noqa + def __init__( + self, + option_strings: Sequence[str], + dest: Any = argparse.SUPPRESS, + default: Any = argparse.SUPPRESS, + help: str = None, # noqa + ) -> None: super(DebugAction, self).__init__( - option_strings=option_strings, - dest=dest, - default=default, - nargs=0, - help=help + option_strings=option_strings, dest=dest, default=default, nargs=0, help=help ) - def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Any, option_string: str = None) -> None: + + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: Any, + option_string: str = None, + ) -> None: global DEBUG_FLAG # will show stack traces (and maybe more debug things) DEBUG_FLAG = True def main() -> int: - parser = argparse.ArgumentParser(description="Creates, adds, inspects and deploys dlt pipelines.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--version', action="version", version='%(prog)s {version}'.format(version=__version__)) - parser.add_argument('--disable-telemetry', action=TelemetryAction, help="Disables telemetry before command is executed") - parser.add_argument('--enable-telemetry', action=TelemetryAction, help="Enables telemetry before command is executed") - parser.add_argument('--non-interactive', action=NonInteractiveAction, help="Non interactive mode. Default choices are automatically made for confirmations and prompts.") - parser.add_argument('--debug', action=DebugAction, help="Displays full stack traces on exceptions.") + parser = argparse.ArgumentParser( + description="Creates, adds, inspects and deploys dlt pipelines.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--version", action="version", version="%(prog)s {version}".format(version=__version__) + ) + parser.add_argument( + "--disable-telemetry", + action=TelemetryAction, + help="Disables telemetry before command is executed", + ) + parser.add_argument( + "--enable-telemetry", + action=TelemetryAction, + help="Enables telemetry before command is executed", + ) + parser.add_argument( + "--non-interactive", + action=NonInteractiveAction, + help=( + "Non interactive mode. Default choices are automatically made for confirmations and" + " prompts." + ), + ) + parser.add_argument( + "--debug", action=DebugAction, help="Displays full stack traces on exceptions." 
+ ) subparsers = parser.add_subparsers(dest="command") - init_cmd = subparsers.add_parser("init", help="Creates a pipeline project in the current folder by adding existing verified source or creating a new one from template.") - init_cmd.add_argument("--list-verified-sources", "-l", default=False, action="store_true", help="List available verified sources") - init_cmd.add_argument("source", nargs='?', help="Name of data source for which to create a pipeline. Adds existing verified source or creates a new pipeline template if verified source for your data source is not yet implemented.") - init_cmd.add_argument("destination", nargs='?', help="Name of a destination ie. bigquery or redshift") - init_cmd.add_argument("--location", default=DEFAULT_VERIFIED_SOURCES_REPO, help="Advanced. Uses a specific url or local path to verified sources repository.") - init_cmd.add_argument("--branch", default=None, help="Advanced. Uses specific branch of the init repository to fetch the template.") - init_cmd.add_argument("--generic", default=False, action="store_true", help="When present uses a generic template with all the dlt loading code present will be used. Otherwise a debug template is used that can be immediately run to get familiar with the dlt sources.") + init_cmd = subparsers.add_parser( + "init", + help=( + "Creates a pipeline project in the current folder by adding existing verified source or" + " creating a new one from template." + ), + ) + init_cmd.add_argument( + "--list-verified-sources", + "-l", + default=False, + action="store_true", + help="List available verified sources", + ) + init_cmd.add_argument( + "source", + nargs="?", + help=( + "Name of data source for which to create a pipeline. Adds existing verified source or" + " creates a new pipeline template if verified source for your data source is not yet" + " implemented." + ), + ) + init_cmd.add_argument( + "destination", nargs="?", help="Name of a destination ie. bigquery or redshift" + ) + init_cmd.add_argument( + "--location", + default=DEFAULT_VERIFIED_SOURCES_REPO, + help="Advanced. Uses a specific url or local path to verified sources repository.", + ) + init_cmd.add_argument( + "--branch", + default=None, + help="Advanced. Uses specific branch of the init repository to fetch the template.", + ) + init_cmd.add_argument( + "--generic", + default=False, + action="store_true", + help=( + "When present uses a generic template with all the dlt loading code present will be" + " used. Otherwise a debug template is used that can be immediately run to get familiar" + " with the dlt sources." + ), + ) # deploy command requires additional dependencies try: # make sure the name is defined _ = deploy_command - deploy_comm = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False) - deploy_comm.add_argument("--location", default=COMMAND_DEPLOY_REPO_LOCATION, help="Advanced. Uses a specific url or local path to pipelines repository.") - deploy_comm.add_argument("--branch", help="Advanced. Uses specific branch of the deploy repository to fetch the template.") + deploy_comm = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, add_help=False + ) + deploy_comm.add_argument( + "--location", + default=COMMAND_DEPLOY_REPO_LOCATION, + help="Advanced. Uses a specific url or local path to pipelines repository.", + ) + deploy_comm.add_argument( + "--branch", + help="Advanced. 
Uses specific branch of the deploy repository to fetch the template.", + ) - deploy_cmd = subparsers.add_parser("deploy", help="Creates a deployment package for a selected pipeline script") - deploy_cmd.add_argument("pipeline_script_path", metavar="pipeline-script-path", help="Path to a pipeline script") + deploy_cmd = subparsers.add_parser( + "deploy", help="Creates a deployment package for a selected pipeline script" + ) + deploy_cmd.add_argument( + "pipeline_script_path", metavar="pipeline-script-path", help="Path to a pipeline script" + ) deploy_sub_parsers = deploy_cmd.add_subparsers(dest="deployment_method") # deploy github actions - deploy_github_cmd = deploy_sub_parsers.add_parser(DeploymentMethods.github_actions.value, help="Deploys the pipeline to Github Actions", parents=[deploy_comm]) - deploy_github_cmd.add_argument("--schedule", required=True, help="A schedule with which to run the pipeline, in cron format. Example: '*/30 * * * *' will run the pipeline every 30 minutes. Remember to enclose the scheduler expression in quotation marks!") - deploy_github_cmd.add_argument("--run-manually", default=True, action="store_true", help="Allows the pipeline to be run manually form Github Actions UI.") - deploy_github_cmd.add_argument("--run-on-push", default=False, action="store_true", help="Runs the pipeline with every push to the repository.") + deploy_github_cmd = deploy_sub_parsers.add_parser( + DeploymentMethods.github_actions.value, + help="Deploys the pipeline to Github Actions", + parents=[deploy_comm], + ) + deploy_github_cmd.add_argument( + "--schedule", + required=True, + help=( + "A schedule with which to run the pipeline, in cron format. Example: '*/30 * * * *'" + " will run the pipeline every 30 minutes. Remember to enclose the scheduler" + " expression in quotation marks!" 
+ ), + ) + deploy_github_cmd.add_argument( + "--run-manually", + default=True, + action="store_true", + help="Allows the pipeline to be run manually form Github Actions UI.", + ) + deploy_github_cmd.add_argument( + "--run-on-push", + default=False, + action="store_true", + help="Runs the pipeline with every push to the repository.", + ) # deploy airflow composer - deploy_airflow_cmd = deploy_sub_parsers.add_parser(DeploymentMethods.airflow_composer.value, help="Deploys the pipeline to Airflow", parents=[deploy_comm]) - deploy_airflow_cmd.add_argument("--secrets-format", default=SecretFormats.toml.value, choices=[v.value for v in SecretFormats], required=False, help="Format of the secrets") + deploy_airflow_cmd = deploy_sub_parsers.add_parser( + DeploymentMethods.airflow_composer.value, + help="Deploys the pipeline to Airflow", + parents=[deploy_comm], + ) + deploy_airflow_cmd.add_argument( + "--secrets-format", + default=SecretFormats.toml.value, + choices=[v.value for v in SecretFormats], + required=False, + help="Format of the secrets", + ) except NameError: # create placeholder command - deploy_cmd = subparsers.add_parser("deploy", help='Install additional dependencies with pip install "dlt[cli]" to create deployment packages', add_help=False) + deploy_cmd = subparsers.add_parser( + "deploy", + help=( + 'Install additional dependencies with pip install "dlt[cli]" to create deployment' + " packages" + ), + add_help=False, + ) deploy_cmd.add_argument("--help", "-h", nargs="?", const=True) - deploy_cmd.add_argument("pipeline_script_path", metavar="pipeline-script-path", nargs=argparse.REMAINDER) + deploy_cmd.add_argument( + "pipeline_script_path", metavar="pipeline-script-path", nargs=argparse.REMAINDER + ) schema = subparsers.add_parser("schema", help="Shows, converts and upgrades schemas") - schema.add_argument("file", help="Schema file name, in yaml or json format, will autodetect based on extension") - schema.add_argument("--format", choices=["json", "yaml"], default="yaml", help="Display schema in this format") - schema.add_argument("--remove-defaults", action="store_true", help="Does not show default hint values") + schema.add_argument( + "file", help="Schema file name, in yaml or json format, will autodetect based on extension" + ) + schema.add_argument( + "--format", choices=["json", "yaml"], default="yaml", help="Display schema in this format" + ) + schema.add_argument( + "--remove-defaults", action="store_true", help="Does not show default hint values" + ) - pipe_cmd = subparsers.add_parser("pipeline", help="Operations on pipelines that were ran locally") - pipe_cmd.add_argument("--list-pipelines", "-l", default=False, action="store_true", help="List local pipelines") - pipe_cmd.add_argument("pipeline_name", nargs='?', help="Pipeline name") + pipe_cmd = subparsers.add_parser( + "pipeline", help="Operations on pipelines that were ran locally" + ) + pipe_cmd.add_argument( + "--list-pipelines", "-l", default=False, action="store_true", help="List local pipelines" + ) + pipe_cmd.add_argument("pipeline_name", nargs="?", help="Pipeline name") pipe_cmd.add_argument("--pipelines-dir", help="Pipelines working directory", default=None) - pipe_cmd.add_argument("--verbose", "-v", action='count', default=0, help="Provides more information for certain commands.", dest="verbosity") + pipe_cmd.add_argument( + "--verbose", + "-v", + action="count", + default=0, + help="Provides more information for certain commands.", + dest="verbosity", + ) pipeline_subparsers = 
pipe_cmd.add_subparsers(dest="operation", required=False) pipe_cmd_sync_parent = argparse.ArgumentParser(add_help=False) - pipe_cmd_sync_parent.add_argument("--destination", help="Sync from this destination when local pipeline state is missing.") - pipe_cmd_sync_parent.add_argument("--dataset-name", help="Dataset name to sync from when local pipeline state is missing.") + pipe_cmd_sync_parent.add_argument( + "--destination", help="Sync from this destination when local pipeline state is missing." + ) + pipe_cmd_sync_parent.add_argument( + "--dataset-name", help="Dataset name to sync from when local pipeline state is missing." + ) - pipeline_subparsers.add_parser("info", help="Displays state of the pipeline, use -v or -vv for more info") - pipeline_subparsers.add_parser("show", help="Generates and launches Streamlit app with the loading status and dataset explorer") - pipeline_subparsers.add_parser("failed-jobs", help="Displays information on all the failed loads in all completed packages, failed jobs and associated error messages") - pipeline_subparsers.add_parser("drop-pending-packages", help="Deletes all extracted and normalized packages including those that are partially loaded.") + pipeline_subparsers.add_parser( + "info", help="Displays state of the pipeline, use -v or -vv for more info" + ) + pipeline_subparsers.add_parser( + "show", + help="Generates and launches Streamlit app with the loading status and dataset explorer", + ) + pipeline_subparsers.add_parser( + "failed-jobs", + help=( + "Displays information on all the failed loads in all completed packages, failed jobs" + " and associated error messages" + ), + ) + pipeline_subparsers.add_parser( + "drop-pending-packages", + help=( + "Deletes all extracted and normalized packages including those that are partially" + " loaded." + ), + ) pipeline_subparsers.add_parser( "sync", - help="Drops the local state of the pipeline and resets all the schemas and restores it from destination. The destination state, data and schemas are left intact.", - parents=[pipe_cmd_sync_parent] + help=( + "Drops the local state of the pipeline and resets all the schemas and restores it from" + " destination. The destination state, data and schemas are left intact." + ), + parents=[pipe_cmd_sync_parent], + ) + pipeline_subparsers.add_parser( + "trace", help="Displays last run trace, use -v or -vv for more info" ) - pipeline_subparsers.add_parser("trace", help="Displays last run trace, use -v or -vv for more info") pipe_cmd_schema = pipeline_subparsers.add_parser("schema", help="Displays default schema") - pipe_cmd_schema.add_argument("--format", choices=["json", "yaml"], default="yaml", help="Display schema in this format") - pipe_cmd_schema.add_argument("--remove-defaults", action="store_true", help="Does not show default hint values") + pipe_cmd_schema.add_argument( + "--format", choices=["json", "yaml"], default="yaml", help="Display schema in this format" + ) + pipe_cmd_schema.add_argument( + "--remove-defaults", action="store_true", help="Does not show default hint values" + ) pipe_cmd_drop = pipeline_subparsers.add_parser( "drop", help="Selectively drop tables and reset state", parents=[pipe_cmd_sync_parent], - epilog=f"See {DLT_PIPELINE_COMMAND_DOCS_URL}#selectively-drop-tables-and-reset-state for more info" + epilog=( + f"See {DLT_PIPELINE_COMMAND_DOCS_URL}#selectively-drop-tables-and-reset-state for more" + " info" + ), + ) + pipe_cmd_drop.add_argument( + "resources", + nargs="*", + help=( + "One or more resources to drop. 
Can be exact resource name(s) or regex pattern(s)." + " Regex patterns must start with re:" + ), + ) + pipe_cmd_drop.add_argument( + "--drop-all", + action="store_true", + default=False, + help="Drop all resources found in schema. Supersedes [resources] argument.", + ) + pipe_cmd_drop.add_argument( + "--state-paths", nargs="*", help="State keys or json paths to drop", default=() + ) + pipe_cmd_drop.add_argument( + "--schema", + help="Schema name to drop from (if other than default schema).", + dest="schema_name", + ) + pipe_cmd_drop.add_argument( + "--state-only", + action="store_true", + help="Only wipe state for matching resources without dropping tables.", + default=False, ) - pipe_cmd_drop.add_argument("resources", nargs="*", help="One or more resources to drop. Can be exact resource name(s) or regex pattern(s). Regex patterns must start with re:") - pipe_cmd_drop.add_argument("--drop-all", action="store_true", default=False, help="Drop all resources found in schema. Supersedes [resources] argument.") - pipe_cmd_drop.add_argument("--state-paths", nargs="*", help="State keys or json paths to drop", default=()) - pipe_cmd_drop.add_argument("--schema", help="Schema name to drop from (if other than default schema).", dest="schema_name") - pipe_cmd_drop.add_argument("--state-only", action="store_true", help="Only wipe state for matching resources without dropping tables.", default=False) - pipe_cmd_package = pipeline_subparsers.add_parser("load-package", help="Displays information on load package, use -v or -vv for more info") - pipe_cmd_package.add_argument("load_id", metavar="load-id", nargs='?', help="Load id of completed or normalized package. Defaults to the most recent package.") + pipe_cmd_package = pipeline_subparsers.add_parser( + "load-package", help="Displays information on load package, use -v or -vv for more info" + ) + pipe_cmd_package.add_argument( + "load_id", + metavar="load-id", + nargs="?", + help="Load id of completed or normalized package. Defaults to the most recent package.", + ) subparsers.add_parser("telemetry", help="Shows telemetry status") args = parser.parse_args() if Venv.is_virtual_env() and not Venv.is_venv_activated(): - fmt.warning("You are running dlt installed in the global environment, however you have virtual environment activated. The dlt command will not see dependencies from virtual environment. You should uninstall the dlt from global environment and install it in the current virtual environment instead.") + fmt.warning( + "You are running dlt installed in the global environment, however you have virtual" + " environment activated. The dlt command will not see dependencies from virtual" + " environment. You should uninstall the dlt from global environment and install it in" + " the current virtual environment instead." 
+ ) if args.command == "schema": return schema_command_wrapper(args.file, args.format, args.remove_defaults) @@ -311,7 +574,7 @@ def main() -> int: if not command_kwargs.get("pipeline_name"): pipe_cmd.print_usage() return -1 - command_kwargs['operation'] = args.operation or "info" + command_kwargs["operation"] = args.operation or "info" del command_kwargs["command"] del command_kwargs["list_pipelines"] return pipeline_command_wrapper(**command_kwargs) @@ -323,7 +586,9 @@ def main() -> int: init_cmd.print_usage() return -1 else: - return init_command_wrapper(args.source, args.destination, args.generic, args.location, args.branch) + return init_command_wrapper( + args.source, args.destination, args.generic, args.location, args.branch + ) elif args.command == "deploy": try: deploy_args = vars(args) @@ -332,12 +597,17 @@ def main() -> int: deployment_method=deploy_args.pop("deployment_method"), repo_location=deploy_args.pop("location"), branch=deploy_args.pop("branch"), - **deploy_args + **deploy_args, ) except (NameError, KeyError): - fmt.warning("Please install additional command line dependencies to use deploy command:") + fmt.warning( + "Please install additional command line dependencies to use deploy command:" + ) fmt.secho('pip install "dlt[cli]"', bold=True) - fmt.echo("We ask you to install those dependencies separately to keep our core library small and make it work everywhere.") + fmt.echo( + "We ask you to install those dependencies separately to keep our core library small" + " and make it work everywhere." + ) return -1 elif args.command == "telemetry": return telemetry_status_command_wrapper() diff --git a/dlt/cli/config_toml_writer.py b/dlt/cli/config_toml_writer.py index ca2e74fd15..8cf831d725 100644 --- a/dlt/cli/config_toml_writer.py +++ b/dlt/cli/config_toml_writer.py @@ -5,7 +5,11 @@ from collections.abc import Sequence as C_Sequence from dlt.common import pendulum -from dlt.common.configuration.specs import BaseConfiguration, is_base_configuration_inner_hint, extract_inner_hint +from dlt.common.configuration.specs import ( + BaseConfiguration, + is_base_configuration_inner_hint, + extract_inner_hint, +) from dlt.common.data_types import py_type_to_sc_type from dlt.common.typing import AnyType, is_final_type, is_optional_type @@ -53,13 +57,15 @@ def write_value( hint: AnyType, overwrite_existing: bool, default_value: Any = None, - is_default_of_interest: bool = False + is_default_of_interest: bool = False, ) -> None: # skip if table contains the name already if name in toml_table and not overwrite_existing: return # do not dump final and optional fields if they are not of special interest - if (is_final_type(hint) or is_optional_type(hint) or default_value is not None) and not is_default_of_interest: + if ( + is_final_type(hint) or is_optional_type(hint) or default_value is not None + ) and not is_default_of_interest: return # get the inner hint to generate cool examples hint = extract_inner_hint(hint) @@ -84,10 +90,19 @@ def write_spec(toml_table: TOMLTable, config: BaseConfiguration, overwrite_exist default_value = getattr(config, name, None) # check if field is of particular interest and should be included if it has default is_default_of_interest = name in config.__config_gen_annotations__ - write_value(toml_table, name, hint, overwrite_existing, default_value=default_value, is_default_of_interest=is_default_of_interest) + write_value( + toml_table, + name, + hint, + overwrite_existing, + default_value=default_value, + is_default_of_interest=is_default_of_interest, + ) 
-def write_values(toml: TOMLContainer, values: Iterable[WritableConfigValue], overwrite_existing: bool) -> None: +def write_values( + toml: TOMLContainer, values: Iterable[WritableConfigValue], overwrite_existing: bool +) -> None: for value in values: toml_table: TOMLTable = toml # type: ignore for section in value.sections: @@ -98,4 +113,11 @@ def write_values(toml: TOMLContainer, values: Iterable[WritableConfigValue], ove else: toml_table = toml_table[section] # type: ignore - write_value(toml_table, value.name, value.hint, overwrite_existing, default_value=value.default_value, is_default_of_interest=True) + write_value( + toml_table, + value.name, + value.hint, + overwrite_existing, + default_value=value.default_value, + is_default_of_interest=True, + ) diff --git a/dlt/cli/deploy_command.py b/dlt/cli/deploy_command.py index a7bdf2e0e7..8dd744f86c 100644 --- a/dlt/cli/deploy_command.py +++ b/dlt/cli/deploy_command.py @@ -11,8 +11,16 @@ from dlt.cli import utils from dlt.cli import echo as fmt -from dlt.cli.deploy_command_helpers import (PipelineWasNotRun, BaseDeployment, ask_files_overwrite, generate_pip_freeze, github_origin_to_url, serialize_templated_yaml, - wrap_template_str, get_schedule_description) +from dlt.cli.deploy_command_helpers import ( + PipelineWasNotRun, + BaseDeployment, + ask_files_overwrite, + generate_pip_freeze, + github_origin_to_url, + serialize_templated_yaml, + wrap_template_str, + get_schedule_description, +) from dlt.version import DLT_PKG_NAME @@ -20,7 +28,9 @@ REQUIREMENTS_GITHUB_ACTION = "requirements_github_action.txt" DLT_DEPLOY_DOCS_URL = "https://dlthub.com/docs/walkthroughs/deploy-a-pipeline" -DLT_AIRFLOW_GCP_DOCS_URL = "https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer" +DLT_AIRFLOW_GCP_DOCS_URL = ( + "https://dlthub.com/docs/walkthroughs/deploy-a-pipeline/deploy-with-airflow-composer" +) AIRFLOW_GETTING_STARTED = "https://airflow.apache.org/docs/apache-airflow/stable/start.html" AIRFLOW_DAG_TEMPLATE_SCRIPT = "dag_template.py" AIRFLOW_CLOUDBUILD_YAML = "cloudbuild.yaml" @@ -38,9 +48,13 @@ class SecretFormats(Enum): toml = "toml" -def deploy_command(pipeline_script_path: str, deployment_method: str, repo_location: str, branch: Optional[str] = None, **kwargs: Any +def deploy_command( + pipeline_script_path: str, + deployment_method: str, + repo_location: str, + branch: Optional[str] = None, + **kwargs: Any, ) -> None: - # get current repo local folder deployment_class: Type[BaseDeployment] = None if deployment_method == DeploymentMethods.github_actions.value: @@ -48,10 +62,15 @@ def deploy_command(pipeline_script_path: str, deployment_method: str, repo_locat elif deployment_method == DeploymentMethods.airflow_composer.value: deployment_class = AirflowDeployment else: - raise ValueError(f"Deployment method '{deployment_method}' is not supported. Only {', '.join([m.value for m in DeploymentMethods])} are available.'") + raise ValueError( + f"Deployment method '{deployment_method}' is not supported. 
Only" + f" {', '.join([m.value for m in DeploymentMethods])} are available.'" + ) # command no longer needed kwargs.pop("command", None) - deployment_class(pipeline_script_path=pipeline_script_path, location=repo_location, branch=branch, **kwargs).run_deployment() + deployment_class( + pipeline_script_path=pipeline_script_path, location=repo_location, branch=branch, **kwargs + ).run_deployment() class GithubActionDeployment(BaseDeployment): @@ -77,22 +96,25 @@ def _generate_workflow(self, *args: Optional[Any]) -> None: if self.schedule_description is None: # TODO: move that check to _dlt and some intelligent help message on missing arg raise ValueError( - f"Setting 'schedule' for '{self.deployment_method}' is required! Use deploy command as 'dlt deploy chess.py {self.deployment_method} --schedule \"*/30 * * * *\"'." + f"Setting 'schedule' for '{self.deployment_method}' is required! Use deploy command" + f" as 'dlt deploy chess.py {self.deployment_method} --schedule \"*/30 * * * *\"'." ) workflow = self._create_new_workflow() serialized_workflow = serialize_templated_yaml(workflow) serialized_workflow_name = f"run_{self.state['pipeline_name']}_workflow.yml" - self.artifacts['serialized_workflow'] = serialized_workflow - self.artifacts['serialized_workflow_name'] = serialized_workflow_name + self.artifacts["serialized_workflow"] = serialized_workflow + self.artifacts["serialized_workflow_name"] = serialized_workflow_name # pip freeze special requirements file - with self.template_storage.open_file(os.path.join(self.deployment_method, "requirements_blacklist.txt")) as f: + with self.template_storage.open_file( + os.path.join(self.deployment_method, "requirements_blacklist.txt") + ) as f: requirements_blacklist = f.readlines() requirements_txt = generate_pip_freeze(requirements_blacklist, REQUIREMENTS_GITHUB_ACTION) requirements_txt_name = REQUIREMENTS_GITHUB_ACTION # if repo_storage.has_file(utils.REQUIREMENTS_TXT): - self.artifacts['requirements_txt'] = requirements_txt - self.artifacts['requirements_txt_name'] = requirements_txt_name + self.artifacts["requirements_txt"] = requirements_txt + self.artifacts["requirements_txt_name"] = requirements_txt_name def _make_modification(self) -> None: if not self.repo_storage.has_folder(utils.GITHUB_WORKFLOWS_DIR): @@ -100,15 +122,21 @@ def _make_modification(self) -> None: self.repo_storage.save( os.path.join(utils.GITHUB_WORKFLOWS_DIR, self.artifacts["serialized_workflow_name"]), - self.artifacts["serialized_workflow"] + self.artifacts["serialized_workflow"], + ) + self.repo_storage.save( + self.artifacts["requirements_txt_name"], self.artifacts["requirements_txt"] ) - self.repo_storage.save(self.artifacts["requirements_txt_name"], self.artifacts["requirements_txt"]) def _create_new_workflow(self) -> Any: - with self.template_storage.open_file(os.path.join(self.deployment_method, "run_pipeline_workflow.yml")) as f: + with self.template_storage.open_file( + os.path.join(self.deployment_method, "run_pipeline_workflow.yml") + ) as f: workflow = yaml.safe_load(f) # customize the workflow - workflow["name"] = f"Run {self.state['pipeline_name']} pipeline from {self.pipeline_script_path}" + workflow["name"] = ( + f"Run {self.state['pipeline_name']} pipeline from {self.pipeline_script_path}" + ) if self.run_on_push is False: del workflow["on"]["push"] if self.run_manually is False: @@ -137,51 +165,98 @@ def _create_new_workflow(self) -> Any: return workflow def _echo_instructions(self, *args: Optional[Any]) -> None: - fmt.echo("Your %s deployment for 
pipeline %s in script %s is ready!" % ( - fmt.bold(self.deployment_method), fmt.bold(self.state["pipeline_name"]), fmt.bold(self.pipeline_script_path) - )) + fmt.echo( + "Your %s deployment for pipeline %s in script %s is ready!" + % ( + fmt.bold(self.deployment_method), + fmt.bold(self.state["pipeline_name"]), + fmt.bold(self.pipeline_script_path), + ) + ) # It contains all relevant configurations and references to credentials that are needed to run the pipeline - fmt.echo("* A github workflow file %s was created in %s." % ( - fmt.bold(self.artifacts["serialized_workflow_name"]), fmt.bold(utils.GITHUB_WORKFLOWS_DIR) - )) - fmt.echo("* The schedule with which the pipeline is run is: %s.%s%s" % ( - fmt.bold(self.schedule_description), - " You can also run the pipeline manually." if self.run_manually else "", - " Pipeline will also run on each push to the repository." if self.run_on_push else "", - )) fmt.echo( - "* The dependencies that will be used to run the pipeline are stored in %s. If you change add more dependencies, remember to refresh your deployment by running the same 'deploy' command again." % fmt.bold( - self.artifacts['requirements_txt_name']) + "* A github workflow file %s was created in %s." + % ( + fmt.bold(self.artifacts["serialized_workflow_name"]), + fmt.bold(utils.GITHUB_WORKFLOWS_DIR), + ) + ) + fmt.echo( + "* The schedule with which the pipeline is run is: %s.%s%s" + % ( + fmt.bold(self.schedule_description), + " You can also run the pipeline manually." if self.run_manually else "", + ( + " Pipeline will also run on each push to the repository." + if self.run_on_push + else "" + ), + ) + ) + fmt.echo( + "* The dependencies that will be used to run the pipeline are stored in %s. If you" + " change add more dependencies, remember to refresh your deployment by running the same" + " 'deploy' command again." + % fmt.bold(self.artifacts["requirements_txt_name"]) ) fmt.echo() if len(self.secret_envs) == 0 and len(self.envs) == 0: fmt.echo("1. Your pipeline does not seem to need any secrets.") else: - fmt.echo("You should now add the secrets to github repository secrets, commit and push the pipeline files to github.") - fmt.echo("1. Add the following secret values (typically stored in %s): \n%s\nin %s" % ( - fmt.bold(make_dlt_settings_path(SECRETS_TOML)), - fmt.bold("\n".join(self.env_prov.get_key_name(s_v.key, *s_v.sections) for s_v in self.secret_envs)), - fmt.bold(github_origin_to_url(self.origin, "/settings/secrets/actions")) - )) + fmt.echo( + "You should now add the secrets to github repository secrets, commit and push the" + " pipeline files to github." + ) + fmt.echo( + "1. Add the following secret values (typically stored in %s): \n%s\nin %s" + % ( + fmt.bold(make_dlt_settings_path(SECRETS_TOML)), + fmt.bold( + "\n".join( + self.env_prov.get_key_name(s_v.key, *s_v.sections) + for s_v in self.secret_envs + ) + ), + fmt.bold(github_origin_to_url(self.origin, "/settings/secrets/actions")), + ) + ) fmt.echo() self._echo_secrets() - fmt.echo("2. Add stage deployment files to commit. Use your Git UI or the following command") - new_req_path = self.repo_storage.from_relative_path_to_wd(self.artifacts['requirements_txt_name']) - new_workflow_path = self.repo_storage.from_relative_path_to_wd(os.path.join(utils.GITHUB_WORKFLOWS_DIR, self.artifacts['serialized_workflow_name'])) - fmt.echo(fmt.bold( - f"git add {new_req_path} {new_workflow_path}")) + fmt.echo( + "2. Add stage deployment files to commit. 
Use your Git UI or the following command" + ) + new_req_path = self.repo_storage.from_relative_path_to_wd( + self.artifacts["requirements_txt_name"] + ) + new_workflow_path = self.repo_storage.from_relative_path_to_wd( + os.path.join(utils.GITHUB_WORKFLOWS_DIR, self.artifacts["serialized_workflow_name"]) + ) + fmt.echo(fmt.bold(f"git add {new_req_path} {new_workflow_path}")) fmt.echo() fmt.echo("3. Commit the files above. Use your Git UI or the following command") - fmt.echo(fmt.bold(f"git commit -m 'run {self.state['pipeline_name']} pipeline with github action'")) + fmt.echo( + fmt.bold( + f"git commit -m 'run {self.state['pipeline_name']} pipeline with github action'" + ) + ) if is_dirty(self.repo): - fmt.warning("You have modified files in your repository. Do not forget to push changes to your pipeline script as well!") + fmt.warning( + "You have modified files in your repository. Do not forget to push changes to your" + " pipeline script as well!" + ) fmt.echo() fmt.echo("4. Push changes to github. Use your Git UI or the following command") fmt.echo(fmt.bold("git push origin")) fmt.echo() fmt.echo("5. Your pipeline should be running! You can monitor it here:") - fmt.echo(fmt.bold(github_origin_to_url(self.origin, f"/actions/workflows/{self.artifacts['serialized_workflow_name']}"))) + fmt.echo( + fmt.bold( + github_origin_to_url( + self.origin, f"/actions/workflows/{self.artifacts['serialized_workflow_name']}" + ) + ) + ) class AirflowDeployment(BaseDeployment): @@ -206,11 +281,15 @@ def _generate_workflow(self, *args: Optional[Any]) -> None: dag_script_name = f"dag_{self.state['pipeline_name']}.py" self.artifacts["dag_script_name"] = dag_script_name - cloudbuild_file = self.template_storage.load(os.path.join(self.deployment_method, AIRFLOW_CLOUDBUILD_YAML)) + cloudbuild_file = self.template_storage.load( + os.path.join(self.deployment_method, AIRFLOW_CLOUDBUILD_YAML) + ) self.artifacts["cloudbuild_file"] = cloudbuild_file # TODO: rewrite dag file to at least set the schedule - dag_file = self.template_storage.load(os.path.join(self.deployment_method, AIRFLOW_DAG_TEMPLATE_SCRIPT)) + dag_file = self.template_storage.load( + os.path.join(self.deployment_method, AIRFLOW_DAG_TEMPLATE_SCRIPT) + ) self.artifacts["dag_file"] = dag_file # ask user if to overwrite the files @@ -227,61 +306,92 @@ def _make_modification(self) -> None: # save cloudbuild.yaml only if not exist to allow to run the deploy command for many different pipelines dest_cloud_build = os.path.join(utils.AIRFLOW_BUILD_FOLDER, AIRFLOW_CLOUDBUILD_YAML) if not self.repo_storage.has_file(dest_cloud_build): - self.repo_storage.save( - dest_cloud_build, - self.artifacts["cloudbuild_file"] - ) + self.repo_storage.save(dest_cloud_build, self.artifacts["cloudbuild_file"]) else: - fmt.warning(f"{AIRFLOW_CLOUDBUILD_YAML} already created. Delete the file and run the deploy command again to re-create.") + fmt.warning( + f"{AIRFLOW_CLOUDBUILD_YAML} already created. Delete the file and run the deploy" + " command again to re-create." + ) dest_dag_script = os.path.join(utils.AIRFLOW_DAGS_FOLDER, self.artifacts["dag_script_name"]) - self.repo_storage.save( - dest_dag_script, - self.artifacts["dag_file"] - ) - + self.repo_storage.save(dest_dag_script, self.artifacts["dag_file"]) def _echo_instructions(self, *args: Optional[Any]) -> None: - fmt.echo("Your %s deployment for pipeline %s is ready!" % ( - fmt.bold(self.deployment_method), fmt.bold(self.state["pipeline_name"]), - )) - fmt.echo("* The airflow %s file was created in %s." 
% ( - fmt.bold(AIRFLOW_CLOUDBUILD_YAML), fmt.bold(utils.AIRFLOW_BUILD_FOLDER) - )) - fmt.echo("* The %s script was created in %s." % ( - fmt.bold(self.artifacts["dag_script_name"]), fmt.bold(utils.AIRFLOW_DAGS_FOLDER) - )) + fmt.echo( + "Your %s deployment for pipeline %s is ready!" + % ( + fmt.bold(self.deployment_method), + fmt.bold(self.state["pipeline_name"]), + ) + ) + fmt.echo( + "* The airflow %s file was created in %s." + % (fmt.bold(AIRFLOW_CLOUDBUILD_YAML), fmt.bold(utils.AIRFLOW_BUILD_FOLDER)) + ) + fmt.echo( + "* The %s script was created in %s." + % (fmt.bold(self.artifacts["dag_script_name"]), fmt.bold(utils.AIRFLOW_DAGS_FOLDER)) + ) fmt.echo() fmt.echo("You must prepare your DAG first:") - fmt.echo("1. Import your sources in %s, configure the DAG ans tasks as needed." % (fmt.bold(self.artifacts["dag_script_name"]))) - fmt.echo("2. Test the DAG with Airflow locally .\nSee Airflow getting started: %s" % (fmt.bold(AIRFLOW_GETTING_STARTED))) + fmt.echo( + "1. Import your sources in %s, configure the DAG ans tasks as needed." + % (fmt.bold(self.artifacts["dag_script_name"])) + ) + fmt.echo( + "2. Test the DAG with Airflow locally .\nSee Airflow getting started: %s" + % (fmt.bold(AIRFLOW_GETTING_STARTED)) + ) fmt.echo() - fmt.echo("If you are planning run the pipeline with Google Cloud Composer, follow the next instructions:\n") - fmt.echo("1. Read this doc and set up the Environment: %s" % ( - fmt.bold(DLT_AIRFLOW_GCP_DOCS_URL) - )) - fmt.echo("2. Set _BUCKET_NAME up in %s/%s file. " % ( - fmt.bold(utils.AIRFLOW_BUILD_FOLDER), fmt.bold(AIRFLOW_CLOUDBUILD_YAML), - )) + fmt.echo( + "If you are planning run the pipeline with Google Cloud Composer, follow the next" + " instructions:\n" + ) + fmt.echo( + "1. Read this doc and set up the Environment: %s" % (fmt.bold(DLT_AIRFLOW_GCP_DOCS_URL)) + ) + fmt.echo( + "2. Set _BUCKET_NAME up in %s/%s file. " + % ( + fmt.bold(utils.AIRFLOW_BUILD_FOLDER), + fmt.bold(AIRFLOW_CLOUDBUILD_YAML), + ) + ) if len(self.secret_envs) == 0 and len(self.envs) == 0: fmt.echo("3. Your pipeline does not seem to need any secrets.") else: if self.secrets_format == SecretFormats.env.value: - fmt.echo("3. Add the following secret values (typically stored in %s): \n%s\n%s\nin ENVIRONMENT VARIABLES using Google Composer UI" % ( - fmt.bold(make_dlt_settings_path(SECRETS_TOML)), - fmt.bold("\n".join(self.env_prov.get_key_name(s_v.key, *s_v.sections) for s_v in self.secret_envs)), - fmt.bold("\n".join(self.env_prov.get_key_name(v.key, *v.sections) for v in self.envs)), - )) + fmt.echo( + "3. Add the following secret values (typically stored in %s): \n%s\n%s\nin" + " ENVIRONMENT VARIABLES using Google Composer UI" + % ( + fmt.bold(make_dlt_settings_path(SECRETS_TOML)), + fmt.bold( + "\n".join( + self.env_prov.get_key_name(s_v.key, *s_v.sections) + for s_v in self.secret_envs + ) + ), + fmt.bold( + "\n".join( + self.env_prov.get_key_name(v.key, *v.sections) for v in self.envs + ) + ), + ) + ) fmt.echo() # if fmt.confirm("Do you want to list the environment variables in the format suitable for Airflow?", default=True): self._echo_secrets() self._echo_envs() elif self.secrets_format == SecretFormats.toml.value: # build toml - fmt.echo(f"3. Add the following toml-string in the Google Composer UI as the {SECRETS_TOML_KEY} variable.") + fmt.echo( + "3. Add the following toml-string in the Google Composer UI as the" + f" {SECRETS_TOML_KEY} variable." 
+                )
                 fmt.echo()
                 toml_provider = StringTomlProvider("")
                 for s_v in self.secret_envs:
@@ -294,18 +404,34 @@ def _echo_instructions(self, *args: Optional[Any]) -> None:
         fmt.echo("4. Add dlt package below using Google Composer UI.")
         fmt.echo(fmt.bold(self.artifacts["requirements_txt"]))
-        fmt.note("You may need to add more packages ie. when your source requires additional dependencies")
+        fmt.note(
+            "You may need to add more packages, e.g. when your source requires additional"
+            " dependencies"
+        )
         fmt.echo("5. Commit and push the pipeline files to github:")
-        fmt.echo("a. Add stage deployment files to commit. Use your Git UI or the following command")
+        fmt.echo(
+            "a. Add stage deployment files to commit. Use your Git UI or the following command"
+        )
 
-        dag_script_path = self.repo_storage.from_relative_path_to_wd(os.path.join(utils.AIRFLOW_DAGS_FOLDER, self.artifacts["dag_script_name"]))
-        cloudbuild_path = self.repo_storage.from_relative_path_to_wd(os.path.join(utils.AIRFLOW_BUILD_FOLDER, AIRFLOW_CLOUDBUILD_YAML))
+        dag_script_path = self.repo_storage.from_relative_path_to_wd(
+            os.path.join(utils.AIRFLOW_DAGS_FOLDER, self.artifacts["dag_script_name"])
+        )
+        cloudbuild_path = self.repo_storage.from_relative_path_to_wd(
+            os.path.join(utils.AIRFLOW_BUILD_FOLDER, AIRFLOW_CLOUDBUILD_YAML)
+        )
         fmt.echo(fmt.bold(f"git add {dag_script_path} {cloudbuild_path}"))
         fmt.echo("b. Commit the files above. Use your Git UI or the following command")
-        fmt.echo(fmt.bold(f"git commit -m 'initiate {self.state['pipeline_name']} pipeline with Airflow'"))
+        fmt.echo(
+            fmt.bold(
+                f"git commit -m 'initiate {self.state['pipeline_name']} pipeline with Airflow'"
+            )
+        )
         if is_dirty(self.repo):
-            fmt.warning("You have modified files in your repository. Do not forget to push changes to your pipeline script as well!")
+            fmt.warning(
+                "You have modified files in your repository. Do not forget to push changes to your"
+                " pipeline script as well!"
+            )
         fmt.echo("c. Push changes to github. Use your Git UI or the following command")
         fmt.echo(fmt.bold("git push origin"))
         fmt.echo("6. 
You should see your pipeline in Airflow.") diff --git a/dlt/cli/deploy_command_helpers.py b/dlt/cli/deploy_command_helpers.py index 81852f3ce1..5065ba1cfc 100644 --- a/dlt/cli/deploy_command_helpers.py +++ b/dlt/cli/deploy_command_helpers.py @@ -6,6 +6,7 @@ from itertools import chain from typing import List, Optional, Sequence, Tuple, Any, Dict from astunparse import unparse + # optional dependencies import pipdeptree import cron_descriptor @@ -77,20 +78,36 @@ def _prepare_deployment(self) -> None: # make sure the repo has origin self.origin = self._get_origin() # convert to path relative to repo - self.repo_pipeline_script_path = self.repo_storage.from_wd_to_relative_path(self.pipeline_script_path) + self.repo_pipeline_script_path = self.repo_storage.from_wd_to_relative_path( + self.pipeline_script_path + ) # load a pipeline script and extract full_refresh and pipelines_dir args self.pipeline_script = self.repo_storage.load(self.repo_pipeline_script_path) - fmt.echo("Looking up the deployment template scripts in %s...\n" % fmt.bold(self.repo_location)) - self.template_storage = git.get_fresh_repo_files(self.repo_location, get_dlt_repos_dir(), branch=self.branch) + fmt.echo( + "Looking up the deployment template scripts in %s...\n" % fmt.bold(self.repo_location) + ) + self.template_storage = git.get_fresh_repo_files( + self.repo_location, get_dlt_repos_dir(), branch=self.branch + ) self.working_directory = os.path.split(self.pipeline_script_path)[0] def _get_origin(self) -> str: try: origin = get_origin(self.repo) if "github.com" not in origin: - raise CliCommandException("deploy", f"Your current repository origin is not set to github but to {origin}.\nYou must change it to be able to run the pipelines with github actions: https://docs.github.com/en/get-started/getting-started-with-git/managing-remote-repositories") + raise CliCommandException( + "deploy", + f"Your current repository origin is not set to github but to {origin}.\nYou" + " must change it to be able to run the pipelines with github actions:" + " https://docs.github.com/en/get-started/getting-started-with-git/managing-remote-repositories", + ) except ValueError: - raise CliCommandException("deploy", "Your current repository has no origin set. Please set it up to be able to run the pipelines with github actions: https://docs.github.com/en/get-started/importing-your-projects-to-github/importing-source-code-to-github/adding-locally-hosted-code-to-github") + raise CliCommandException( + "deploy", + "Your current repository has no origin set. 
Please set it up to be able to run the" + " pipelines with github actions:" + " https://docs.github.com/en/get-started/importing-your-projects-to-github/importing-source-code-to-github/adding-locally-hosted-code-to-github", + ) return origin @@ -104,14 +121,18 @@ def run_deployment(self) -> None: pipeline_name: str = None pipelines_dir: str = None - uniq_possible_pipelines = {t[0]:t for t in possible_pipelines} + uniq_possible_pipelines = {t[0]: t for t in possible_pipelines} if len(uniq_possible_pipelines) == 1: pipeline_name, pipelines_dir = possible_pipelines[0] elif len(uniq_possible_pipelines) > 1: choices = list(uniq_possible_pipelines.keys()) - choices_str = "".join([str(i+1) for i in range(len(choices))]) + choices_str = "".join([str(i + 1) for i in range(len(choices))]) choices_selection = [f"{idx+1}-{name}" for idx, name in enumerate(choices)] - sel = fmt.prompt("Several pipelines found in script, please select one: " + ", ".join(choices_selection), choices=choices_str) + sel = fmt.prompt( + "Several pipelines found in script, please select one: " + + ", ".join(choices_selection), + choices=choices_str, + ) pipeline_name, pipelines_dir = uniq_possible_pipelines[choices[int(sel) - 1]] if pipelines_dir: @@ -126,11 +147,17 @@ def run_deployment(self) -> None: self.pipeline_name = dlt.config.get("pipeline_name") if not self.pipeline_name: self.pipeline_name = get_default_pipeline_name(self.pipeline_script_path) - fmt.warning(f"Using default pipeline name {self.pipeline_name}. The pipeline name is not passed as argument to dlt.pipeline nor configured via config provides ie. config.toml") + fmt.warning( + f"Using default pipeline name {self.pipeline_name}. The pipeline name" + " is not passed as argument to dlt.pipeline nor configured via config" + " provides ie. 
config.toml" + ) # fmt.echo("Generating deployment for pipeline %s" % fmt.bold(self.pipeline_name)) # attach to pipeline name, get state and trace - pipeline = dlt.attach(pipeline_name=self.pipeline_name, pipelines_dir=self.pipelines_dir) + pipeline = dlt.attach( + pipeline_name=self.pipeline_name, pipelines_dir=self.pipelines_dir + ) self.state, trace = get_state_and_trace(pipeline) self._update_envs(trace) @@ -148,12 +175,26 @@ def _update_envs(self, trace: PipelineTrace) -> None: for resolved_value in trace.resolved_config_values: if resolved_value.is_secret_hint: # generate special forms for all secrets - self.secret_envs.append(LookupTrace(self.env_prov.name, tuple(resolved_value.sections), resolved_value.key, resolved_value.value)) + self.secret_envs.append( + LookupTrace( + self.env_prov.name, + tuple(resolved_value.sections), + resolved_value.key, + resolved_value.value, + ) + ) # fmt.echo(f"{resolved_value.key}:{resolved_value.value}{type(resolved_value.value)} in {resolved_value.sections} is SECRET") else: # move all config values that are not in config.toml into env if resolved_value.provider_name != self.config_prov.name: - self.envs.append(LookupTrace(self.env_prov.name, tuple(resolved_value.sections), resolved_value.key, resolved_value.value)) + self.envs.append( + LookupTrace( + self.env_prov.name, + tuple(resolved_value.sections), + resolved_value.key, + resolved_value.value, + ) + ) # fmt.echo(f"{resolved_value.key} in {resolved_value.sections} moved to CONFIG") def _echo_secrets(self) -> None: @@ -189,12 +230,20 @@ def get_state_and_trace(pipeline: Pipeline) -> Tuple[TPipelineState, PipelineTra # trace must exist and end with a successful loading step trace = pipeline.last_trace if trace is None or len(trace.steps) == 0: - raise PipelineWasNotRun("Pipeline run trace could not be found. Please run the pipeline at least once locally.") + raise PipelineWasNotRun( + "Pipeline run trace could not be found. Please run the pipeline at least once locally." + ) last_step = trace.steps[-1] if last_step.step_exception is not None: - raise PipelineWasNotRun(f"The last pipeline run ended with error. Please make sure that pipeline runs correctly before deployment.\n{last_step.step_exception}") + raise PipelineWasNotRun( + "The last pipeline run ended with error. Please make sure that pipeline runs correctly" + f" before deployment.\n{last_step.step_exception}" + ) if not isinstance(last_step.step_info, LoadInfo): - raise PipelineWasNotRun("The last pipeline run did not reach the load step. Please run the pipeline locally until it loads data into destination.") + raise PipelineWasNotRun( + "The last pipeline run did not reach the load step. Please run the pipeline locally" + " until it loads data into destination." 
+ ) return pipeline.state, trace @@ -202,7 +251,10 @@ def get_state_and_trace(pipeline: Pipeline) -> Tuple[TPipelineState, PipelineTra def get_visitors(pipeline_script: str, pipeline_script_path: str) -> PipelineScriptVisitor: visitor = utils.parse_init_script("deploy", pipeline_script, pipeline_script_path) if n.RUN not in visitor.known_calls: - raise CliCommandException("deploy", f"The pipeline script {pipeline_script_path} does not seem to run the pipeline.") + raise CliCommandException( + "deploy", + f"The pipeline script {pipeline_script_path} does not seem to run the pipeline.", + ) return visitor @@ -215,22 +267,40 @@ def parse_pipeline_info(visitor: PipelineScriptVisitor) -> List[Tuple[str, Optio if f_r_node: f_r_value = evaluate_node_literal(f_r_node) if f_r_value is None: - fmt.warning(f"The value of `full_refresh` in call to `dlt.pipeline` cannot be determined from {unparse(f_r_node).strip()}. We assume that you know what you are doing :)") + fmt.warning( + "The value of `full_refresh` in call to `dlt.pipeline` cannot be" + f" determined from {unparse(f_r_node).strip()}. We assume that you know" + " what you are doing :)" + ) if f_r_value is True: - if fmt.confirm("The value of 'full_refresh' is set to True. Do you want to abort to set it to False?", default=True): + if fmt.confirm( + "The value of 'full_refresh' is set to True. Do you want to abort to set it" + " to False?", + default=True, + ): raise CliCommandException("deploy", "Please set the full_refresh to False") p_d_node = call_args.arguments.get("pipelines_dir") if p_d_node: pipelines_dir = evaluate_node_literal(p_d_node) if pipelines_dir is None: - raise CliCommandException("deploy", f"The value of 'pipelines_dir' argument in call to `dlt_pipeline` cannot be determined from {unparse(p_d_node).strip()}. Pipeline working dir will be found. Pass it directly with --pipelines-dir option.") + raise CliCommandException( + "deploy", + "The value of 'pipelines_dir' argument in call to `dlt_pipeline` cannot be" + f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" + " be found. Pass it directly with --pipelines-dir option.", + ) p_n_node = call_args.arguments.get("pipeline_name") if p_n_node: pipeline_name = evaluate_node_literal(p_n_node) if pipeline_name is None: - raise CliCommandException("deploy", f"The value of 'pipeline_name' argument in call to `dlt_pipeline` cannot be determined from {unparse(p_d_node).strip()}. Pipeline working dir will be found. Pass it directly with --pipeline-name option.") + raise CliCommandException( + "deploy", + "The value of 'pipeline_name' argument in call to `dlt_pipeline` cannot be" + f" determined from {unparse(p_d_node).strip()}. Pipeline working dir will" + " be found. 
Pass it directly with --pipeline-name option.", + ) pipelines.append((pipeline_name, pipelines_dir)) return pipelines @@ -240,8 +310,8 @@ def str_representer(dumper: yaml.Dumper, data: str) -> yaml.ScalarNode: # format multiline strings as blocks with the exception of placeholders # that will be expanded as yaml if len(data.splitlines()) > 1 and "{{ toYaml" not in data: # check for multiline string - return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='|') - return dumper.represent_scalar('tag:yaml.org,2002:str', data) + return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") + return dumper.represent_scalar("tag:yaml.org,2002:str", data) def wrap_template_str(s: str) -> str: @@ -253,17 +323,14 @@ def serialize_templated_yaml(tree: StrAny) -> str: try: yaml.add_representer(str, str_representer) # pretty serialize yaml - serialized: str = yaml.dump(tree, allow_unicode=True, default_flow_style=False, sort_keys=False) + serialized: str = yaml.dump( + tree, allow_unicode=True, default_flow_style=False, sort_keys=False + ) # removes apostrophes around the template - serialized = re.sub(r"'([\s\n]*?\${{.+?}})'", - r"\1", - serialized, - flags=re.DOTALL) + serialized = re.sub(r"'([\s\n]*?\${{.+?}})'", r"\1", serialized, flags=re.DOTALL) # print(serialized) # fix the new lines in templates ending }} - serialized = re.sub(r"(\${{.+)\n.+(}})", - r"\1 \2", - serialized) + serialized = re.sub(r"(\${{.+)\n.+(}})", r"\1 \2", serialized) return serialized finally: yaml.add_representer(str, old_representer) @@ -292,7 +359,10 @@ def generate_pip_freeze(requirements_blacklist: List[str], requirements_file_nam conflicts = pipdeptree.conflicting_deps(tree) cycles = pipdeptree.cyclic_deps(tree) if conflicts: - fmt.warning(f"Unable to create dependencies for the github action. Please edit {requirements_file_name} yourself") + fmt.warning( + "Unable to create dependencies for the github action. 
Please edit" + f" {requirements_file_name} yourself" + ) pipdeptree.render_conflicts_text(conflicts) pipdeptree.render_cycles_text(cycles) fmt.echo() diff --git a/dlt/cli/echo.py b/dlt/cli/echo.py index 41c9fc1f7f..bd9cf24f64 100644 --- a/dlt/cli/echo.py +++ b/dlt/cli/echo.py @@ -20,7 +20,6 @@ def always_choose(always_choose_default: bool, always_choose_value: Any) -> Iter ALWAYS_CHOOSE_VALUE = _always_choose_value - echo = click.echo secho = click.secho style = click.style @@ -65,5 +64,6 @@ def prompt(text: str, choices: Iterable[str], default: Optional[Any] = None) -> click_choices = click.Choice(choices) return click.prompt(text, type=click_choices, default=default) + def text_input(text: str) -> str: return click.prompt(text) # type: ignore[no-any-return] diff --git a/dlt/cli/init_command.py b/dlt/cli/init_command.py index 4cec1706b9..c2c7bdd7ff 100644 --- a/dlt/cli/init_command.py +++ b/dlt/cli/init_command.py @@ -8,7 +8,12 @@ from dlt.common import git from dlt.common.configuration.paths import get_dlt_settings_dir, make_dlt_settings_path from dlt.common.configuration.specs import known_sections -from dlt.common.configuration.providers import CONFIG_TOML, SECRETS_TOML, ConfigTomlProvider, SecretsTomlProvider +from dlt.common.configuration.providers import ( + CONFIG_TOML, + SECRETS_TOML, + ConfigTomlProvider, + SecretsTomlProvider, +) from dlt.common.pipeline import get_dlt_repos_dir from dlt.common.source import _SOURCES from dlt.version import DLT_PKG_NAME, __version__ @@ -24,7 +29,11 @@ from dlt.cli import echo as fmt, pipeline_files as files_ops, source_detection from dlt.cli import utils from dlt.cli.config_toml_writer import WritableConfigValue, write_values -from dlt.cli.pipeline_files import VerifiedSourceFiles, TVerifiedSourceFileEntry, TVerifiedSourceFileIndex +from dlt.cli.pipeline_files import ( + VerifiedSourceFiles, + TVerifiedSourceFileEntry, + TVerifiedSourceFileIndex, +) from dlt.cli.exceptions import CliCommandException from dlt.cli.requirements import SourceRequirements @@ -34,7 +43,9 @@ SOURCES_MODULE_NAME = "sources" -def _get_template_files(command_module: ModuleType, use_generic_template: bool) -> Tuple[str, List[str]]: +def _get_template_files( + command_module: ModuleType, use_generic_template: bool +) -> Tuple[str, List[str]]: template_files: List[str] = command_module.TEMPLATE_FILES pipeline_script: str = command_module.PIPELINE_SCRIPT if use_generic_template: @@ -48,22 +59,41 @@ def _select_source_files( remote_modified: Dict[str, TVerifiedSourceFileEntry], remote_deleted: Dict[str, TVerifiedSourceFileEntry], conflict_modified: Sequence[str], - conflict_deleted: Sequence[str] + conflict_deleted: Sequence[str], ) -> Tuple[str, Dict[str, TVerifiedSourceFileEntry], Dict[str, TVerifiedSourceFileEntry]]: # some files were changed and cannot be updated (or are created without index) - fmt.echo("Existing files for %s source were changed and cannot be automatically updated" % fmt.bold(source_name)) + fmt.echo( + "Existing files for %s source were changed and cannot be automatically updated" + % fmt.bold(source_name) + ) if conflict_modified: - fmt.echo("Following files are MODIFIED locally and CONFLICT with incoming changes: %s" % fmt.bold(", ".join(conflict_modified))) + fmt.echo( + "Following files are MODIFIED locally and CONFLICT with incoming changes: %s" + % fmt.bold(", ".join(conflict_modified)) + ) if conflict_deleted: - fmt.echo("Following files are DELETED locally and CONFLICT with incoming changes: %s" % fmt.bold(", ".join(conflict_deleted))) + 
fmt.echo( + "Following files are DELETED locally and CONFLICT with incoming changes: %s" + % fmt.bold(", ".join(conflict_deleted)) + ) can_update_files = set(remote_modified.keys()) - set(conflict_modified) can_delete_files = set(remote_deleted.keys()) - set(conflict_deleted) if len(can_update_files) > 0 or len(can_delete_files) > 0: if len(can_update_files) > 0: - fmt.echo("Following files can be automatically UPDATED: %s" % fmt.bold(", ".join(can_update_files))) + fmt.echo( + "Following files can be automatically UPDATED: %s" + % fmt.bold(", ".join(can_update_files)) + ) if len(can_delete_files) > 0: - fmt.echo("Following files can be automatically DELETED: %s" % fmt.bold(", ".join(can_delete_files))) - prompt = "Should incoming changes be Skipped, Applied (local changes will be lost) or Merged (%s UPDATED | %s DELETED | all local changes remain)?" % (fmt.bold(",".join(can_update_files)), fmt.bold(",".join(can_delete_files))) + fmt.echo( + "Following files can be automatically DELETED: %s" + % fmt.bold(", ".join(can_delete_files)) + ) + prompt = ( + "Should incoming changes be Skipped, Applied (local changes will be lost) or Merged (%s" + " UPDATED | %s DELETED | all local changes remain)?" + % (fmt.bold(",".join(can_update_files)), fmt.bold(",".join(can_delete_files))) + ) choices = "sam" else: prompt = "Should incoming changes be Skipped or Applied?" @@ -78,8 +108,8 @@ def _select_source_files( elif resolution == "m": # update what we can fmt.echo("Merging the incoming changes. No files with local changes were modified.") - remote_modified = {n:e for n, e in remote_modified.items() if n in can_update_files} - remote_deleted = {n:e for n, e in remote_deleted.items() if n in can_delete_files} + remote_modified = {n: e for n, e in remote_modified.items() if n in can_update_files} + remote_deleted = {n: e for n, e in remote_deleted.items() if n in can_delete_files} else: # fully overwrite, leave all files to be copied fmt.echo("Applying all incoming changes to local files.") @@ -96,7 +126,9 @@ def _get_dependency_system(dest_storage: FileStorage) -> str: return None -def _list_verified_sources(repo_location: str, branch: str = None) -> Dict[str, VerifiedSourceFiles]: +def _list_verified_sources( + repo_location: str, branch: str = None +) -> Dict[str, VerifiedSourceFiles]: clone_storage = git.get_fresh_repo_files(repo_location, get_dlt_repos_dir(), branch=branch) sources_storage = FileStorage(clone_storage.make_full_path(SOURCES_MODULE_NAME)) @@ -110,41 +142,73 @@ def _list_verified_sources(repo_location: str, branch: str = None) -> Dict[str, return sources -def _welcome_message(source_name: str, destination_name: str, source_files: VerifiedSourceFiles, dependency_system: str, is_new_source: bool) -> None: +def _welcome_message( + source_name: str, + destination_name: str, + source_files: VerifiedSourceFiles, + dependency_system: str, + is_new_source: bool, +) -> None: fmt.echo() if source_files.is_template: fmt.echo("Your new pipeline %s is ready to be customized!" % fmt.bold(source_name)) - fmt.echo("* Review and change how dlt loads your data in %s" % fmt.bold(source_files.dest_pipeline_script)) + fmt.echo( + "* Review and change how dlt loads your data in %s" + % fmt.bold(source_files.dest_pipeline_script) + ) else: if is_new_source: fmt.echo("Verified source %s was added to your project!" 
% fmt.bold(source_name)) - fmt.echo("* See the usage examples and code snippets to copy from %s" % fmt.bold(source_files.dest_pipeline_script)) + fmt.echo( + "* See the usage examples and code snippets to copy from %s" + % fmt.bold(source_files.dest_pipeline_script) + ) else: - fmt.echo("Verified source %s was updated to the newest version!" % fmt.bold(source_name)) + fmt.echo( + "Verified source %s was updated to the newest version!" % fmt.bold(source_name) + ) if is_new_source: - fmt.echo("* Add credentials for %s and other secrets in %s" % (fmt.bold(destination_name), fmt.bold(make_dlt_settings_path(SECRETS_TOML)))) + fmt.echo( + "* Add credentials for %s and other secrets in %s" + % (fmt.bold(destination_name), fmt.bold(make_dlt_settings_path(SECRETS_TOML))) + ) if dependency_system: fmt.echo("* Add the required dependencies to %s:" % fmt.bold(dependency_system)) compiled_requirements = source_files.requirements.compiled() for dep in compiled_requirements: fmt.echo(" " + fmt.bold(dep)) - fmt.echo(" If the dlt dependency is already added, make sure you install the extra for %s to it" % fmt.bold(destination_name)) + fmt.echo( + " If the dlt dependency is already added, make sure you install the extra for %s to it" + % fmt.bold(destination_name) + ) if dependency_system == utils.REQUIREMENTS_TXT: qs = "' '" - fmt.echo(" To install with pip: %s" % fmt.bold(f"pip3 install '{qs.join(compiled_requirements)}'")) + fmt.echo( + " To install with pip: %s" + % fmt.bold(f"pip3 install '{qs.join(compiled_requirements)}'") + ) elif dependency_system == utils.PYPROJECT_TOML: fmt.echo(" If you are using poetry you may issue the following command:") fmt.echo(fmt.bold(" poetry add %s -E %s" % (DLT_PKG_NAME, destination_name))) fmt.echo() else: - fmt.echo("* %s was created. Install it with:\npip3 install -r %s" % (fmt.bold(utils.REQUIREMENTS_TXT), utils.REQUIREMENTS_TXT)) + fmt.echo( + "* %s was created. 
Install it with:\npip3 install -r %s" + % (fmt.bold(utils.REQUIREMENTS_TXT), utils.REQUIREMENTS_TXT) + ) if is_new_source: - fmt.echo("* Read %s for more information" % fmt.bold("https://dlthub.com/docs/walkthroughs/create-a-pipeline")) + fmt.echo( + "* Read %s for more information" + % fmt.bold("https://dlthub.com/docs/walkthroughs/create-a-pipeline") + ) else: - fmt.echo("* Read %s for more information" % fmt.bold("https://dlthub.com/docs/walkthroughs/add-a-verified-source")) + fmt.echo( + "* Read %s for more information" + % fmt.bold("https://dlthub.com/docs/walkthroughs/add-a-verified-source") + ) def list_verified_sources_command(repo_location: str, branch: str = None) -> None: @@ -158,7 +222,13 @@ def list_verified_sources_command(repo_location: str, branch: str = None) -> Non fmt.echo(msg) -def init_command(source_name: str, destination_name: str, use_generic_template: bool, repo_location: str, branch: str = None) -> None: +def init_command( + source_name: str, + destination_name: str, + use_generic_template: bool, + repo_location: str, + branch: str = None, +) -> None: # try to import the destination and get config spec destination_reference = Destination.from_reference(destination_name) destination_spec = destination_reference.spec @@ -192,76 +262,115 @@ def init_command(source_name: str, destination_name: str, use_generic_template: source_files = files_ops.get_verified_source_files(sources_storage, source_name) # get file index from remote verified source files being copied remote_index = files_ops.get_remote_source_index( - source_files.storage.storage_path, source_files.files, source_files.requirements.dlt_version_constraint() + source_files.storage.storage_path, + source_files.files, + source_files.requirements.dlt_version_constraint(), ) # diff local and remote index to get modified and deleted files - remote_new, remote_modified, remote_deleted = files_ops.gen_index_diff(local_index, remote_index) + remote_new, remote_modified, remote_deleted = files_ops.gen_index_diff( + local_index, remote_index + ) # find files that are modified locally - conflict_modified, conflict_deleted = files_ops.find_conflict_files(local_index, remote_new, remote_modified, remote_deleted, dest_storage) + conflict_modified, conflict_deleted = files_ops.find_conflict_files( + local_index, remote_new, remote_modified, remote_deleted, dest_storage + ) # add new to modified remote_modified.update(remote_new) if conflict_modified or conflict_deleted: # select source files that can be copied/updated _, remote_modified, remote_deleted = _select_source_files( - source_name, - remote_modified, - remote_deleted, - conflict_modified, - conflict_deleted + source_name, remote_modified, remote_deleted, conflict_modified, conflict_deleted ) if not remote_deleted and not remote_modified: fmt.echo("No files to update, exiting") return if remote_index["is_dirty"]: - fmt.warning(f"The verified sources repository is dirty. {source_name} source files may not update correctly in the future.") + fmt.warning( + f"The verified sources repository is dirty. {source_name} source files may not" + " update correctly in the future." 
+ ) # add template files source_files.files.extend(template_files) else: - if not is_valid_schema_name(source_name): raise InvalidSchemaName(source_name) dest_pipeline_script = source_name + ".py" - source_files = VerifiedSourceFiles(True, init_storage, pipeline_script, dest_pipeline_script, template_files, SourceRequirements([]), "") + source_files = VerifiedSourceFiles( + True, + init_storage, + pipeline_script, + dest_pipeline_script, + template_files, + SourceRequirements([]), + "", + ) if dest_storage.has_file(dest_pipeline_script): fmt.warning("Pipeline script %s already exist, exiting" % dest_pipeline_script) return # add .dlt/*.toml files to be copied - source_files.files.extend([make_dlt_settings_path(CONFIG_TOML), make_dlt_settings_path(SECRETS_TOML)]) + source_files.files.extend( + [make_dlt_settings_path(CONFIG_TOML), make_dlt_settings_path(SECRETS_TOML)] + ) # add dlt extras line to requirements source_files.requirements.update_dlt_extras(destination_name) # Check compatibility with installed dlt if not source_files.requirements.is_installed_dlt_compatible(): - msg = f"This pipeline requires a newer version of dlt than your installed version ({source_files.requirements.current_dlt_version()}). " \ - f"Pipeline requires '{source_files.requirements.dlt_requirement_base}'" + msg = ( + "This pipeline requires a newer version of dlt than your installed version" + f" ({source_files.requirements.current_dlt_version()}). Pipeline requires" + f" '{source_files.requirements.dlt_requirement_base}'" + ) fmt.warning(msg) - if not fmt.confirm("Would you like to continue anyway? (you can update dlt after this step)", default=True): - fmt.echo(f'You can update dlt with: pip3 install -U "{source_files.requirements.dlt_requirement_base}"') + if not fmt.confirm( + "Would you like to continue anyway? (you can update dlt after this step)", default=True + ): + fmt.echo( + "You can update dlt with: pip3 install -U" + f' "{source_files.requirements.dlt_requirement_base}"' + ) return # read module source and parse it - visitor = utils.parse_init_script("init", source_files.storage.load(source_files.pipeline_script), source_files.pipeline_script) + visitor = utils.parse_init_script( + "init", + source_files.storage.load(source_files.pipeline_script), + source_files.pipeline_script, + ) if visitor.is_destination_imported: - raise CliCommandException("init", f"The pipeline script {source_files.pipeline_script} import a destination from dlt.destinations. You should specify destinations by name when calling dlt.pipeline or dlt.run in init scripts.") + raise CliCommandException( + "init", + f"The pipeline script {source_files.pipeline_script} import a destination from" + " dlt.destinations. You should specify destinations by name when calling dlt.pipeline" + " or dlt.run in init scripts.", + ) if n.PIPELINE not in visitor.known_calls: - raise CliCommandException("init", f"The pipeline script {source_files.pipeline_script} does not seem to initialize pipeline with dlt.pipeline. Please initialize pipeline explicitly in init scripts.") + raise CliCommandException( + "init", + f"The pipeline script {source_files.pipeline_script} does not seem to initialize" + " pipeline with dlt.pipeline. 
Please initialize pipeline explicitly in init scripts.", + ) # find all arguments in all calls to replace transformed_nodes = source_detection.find_call_arguments_to_replace( visitor, - [("destination", destination_name), ("pipeline_name", source_name), ("dataset_name", source_name + "_data")], - source_files.pipeline_script + [ + ("destination", destination_name), + ("pipeline_name", source_name), + ("dataset_name", source_name + "_data"), + ], + source_files.pipeline_script, ) # inspect the script inspect_pipeline_script( source_files.storage.storage_path, source_files.storage.to_relative_path(source_files.pipeline_script), - ignore_missing_imports=True + ignore_missing_imports=True, ) # detect all the required secrets and configs that should go into tomls files @@ -269,32 +378,57 @@ def init_command(source_name: str, destination_name: str, use_generic_template: # replace destination, pipeline_name and dataset_name in templates transformed_nodes = source_detection.find_call_arguments_to_replace( visitor, - [("destination", destination_name), ("pipeline_name", source_name), ("dataset_name", source_name + "_data")], - source_files.pipeline_script + [ + ("destination", destination_name), + ("pipeline_name", source_name), + ("dataset_name", source_name + "_data"), + ], + source_files.pipeline_script, ) # template sources are always in module starting with "pipeline" # for templates, place config and secrets into top level section - required_secrets, required_config, checked_sources = source_detection.detect_source_configs(_SOURCES, "pipeline", ()) + required_secrets, required_config, checked_sources = source_detection.detect_source_configs( + _SOURCES, "pipeline", () + ) # template has a strict rules where sources are placed for source_q_name, source_config in checked_sources.items(): if source_q_name not in visitor.known_sources_resources: - raise CliCommandException("init", f"The pipeline script {source_files.pipeline_script} imports a source/resource {source_config.f.__name__} from module {source_config.module.__name__}. In init scripts you must declare all sources and resources in single file.") + raise CliCommandException( + "init", + f"The pipeline script {source_files.pipeline_script} imports a source/resource" + f" {source_config.f.__name__} from module {source_config.module.__name__}. 
In" + " init scripts you must declare all sources and resources in single file.", + ) # rename sources and resources - transformed_nodes.extend(source_detection.find_source_calls_to_replace(visitor, source_name)) + transformed_nodes.extend( + source_detection.find_source_calls_to_replace(visitor, source_name) + ) else: # replace only destination for existing pipelines - transformed_nodes = source_detection.find_call_arguments_to_replace(visitor, [("destination", destination_name)], source_files.pipeline_script) + transformed_nodes = source_detection.find_call_arguments_to_replace( + visitor, [("destination", destination_name)], source_files.pipeline_script + ) # pipeline sources are in module with name starting from {pipeline_name} # for verified pipelines place in the specific source section - required_secrets, required_config, checked_sources = source_detection.detect_source_configs(_SOURCES, source_name, (known_sections.SOURCES, source_name)) + required_secrets, required_config, checked_sources = source_detection.detect_source_configs( + _SOURCES, source_name, (known_sections.SOURCES, source_name) + ) if len(checked_sources) == 0: - raise CliCommandException("init", f"The pipeline script {source_files.pipeline_script} is not creating or importing any sources or resources") + raise CliCommandException( + "init", + f"The pipeline script {source_files.pipeline_script} is not creating or importing any" + " sources or resources", + ) # add destination spec to required secrets - required_secrets["destinations:" + destination_name] = WritableConfigValue(destination_name, destination_spec, None, ("destination",)) + required_secrets["destinations:" + destination_name] = WritableConfigValue( + destination_name, destination_spec, None, ("destination",) + ) # add the global telemetry to required config - required_config["runtime.dlthub_telemetry"] = WritableConfigValue("dlthub_telemetry", bool, utils.get_telemetry_status(), ("runtime", )) + required_config["runtime.dlthub_telemetry"] = WritableConfigValue( + "dlthub_telemetry", bool, utils.get_telemetry_status(), ("runtime",) + ) # modify the script script_lines = rewrite_python_script(visitor.source_lines, transformed_nodes) @@ -305,9 +439,15 @@ def init_command(source_name: str, destination_name: str, use_generic_template: # ask for confirmation if is_new_source: if source_files.is_template: - fmt.echo("A verified source %s was not found. Using a template to create a new source and pipeline with name %s." % (fmt.bold(source_name), fmt.bold(source_name))) + fmt.echo( + "A verified source %s was not found. Using a template to create a new source and" + " pipeline with name %s." 
% (fmt.bold(source_name), fmt.bold(source_name)) + ) else: - fmt.echo("Cloning and configuring a verified source %s (%s)" % (fmt.bold(source_name), source_files.doc)) + fmt.echo( + "Cloning and configuring a verified source %s (%s)" + % (fmt.bold(source_name), source_files.doc) + ) if use_generic_template: fmt.warning("--generic parameter is meaningless if verified source is found") if not fmt.confirm("Do you want to proceed?", default=True): @@ -339,7 +479,9 @@ def init_command(source_name: str, destination_name: str, use_generic_template: for file_name in remote_deleted: if dest_storage.has_file(file_name): dest_storage.delete(file_name) - files_ops.save_verified_source_local_index(source_name, remote_index, remote_modified, remote_deleted) + files_ops.save_verified_source_local_index( + source_name, remote_index, remote_modified, remote_deleted + ) # create script if not dest_storage.has_file(source_files.dest_pipeline_script): dest_storage.save(source_files.dest_pipeline_script, dest_script_source) diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index 2d705dc1a3..c7b8dd53d4 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -18,7 +18,15 @@ DLT_PIPELINE_COMMAND_DOCS_URL = "https://dlthub.com/docs/reference/command-line-interface" -def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, verbosity: int, dataset_name: str = None, destination: TDestinationReferenceArg = None, **command_kwargs: Any) -> None: +def pipeline_command( + operation: str, + pipeline_name: str, + pipelines_dir: str, + verbosity: int, + dataset_name: str = None, + destination: TDestinationReferenceArg = None, + **command_kwargs: Any, +) -> None: if operation == "list": pipelines_dir = pipelines_dir or get_dlt_pipelines_dir() storage = FileStorage(pipelines_dir) @@ -39,16 +47,26 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver if operation not in {"sync", "drop"}: raise fmt.warning(str(e)) - if not fmt.confirm("Do you want to attempt to restore the pipeline state from destination?", default=False): + if not fmt.confirm( + "Do you want to attempt to restore the pipeline state from destination?", default=False + ): return - destination = destination or fmt.text_input(f"Enter destination name for pipeline {fmt.bold(pipeline_name)}") - dataset_name = dataset_name or fmt.text_input(f"Enter dataset name for pipeline {fmt.bold(pipeline_name)}") - p = dlt.pipeline(pipeline_name, pipelines_dir, destination=destination, dataset_name=dataset_name) + destination = destination or fmt.text_input( + f"Enter destination name for pipeline {fmt.bold(pipeline_name)}" + ) + dataset_name = dataset_name or fmt.text_input( + f"Enter dataset name for pipeline {fmt.bold(pipeline_name)}" + ) + p = dlt.pipeline( + pipeline_name, pipelines_dir, destination=destination, dataset_name=dataset_name + ) p.sync_destination() if p.first_run: # remote state was not found p._wipe_working_folder() - fmt.error(f"Pipeline {pipeline_name} was not found in dataset {dataset_name} in {destination}") + fmt.error( + f"Pipeline {pipeline_name} was not found in dataset {dataset_name} in {destination}" + ) return if operation == "sync": return # No need to sync again @@ -56,16 +74,24 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: extracted_files = p.list_extracted_resources() if extracted_files: - fmt.echo("Has %s extracted files ready to be normalized" % 
fmt.bold(str(len(extracted_files)))) + fmt.echo( + "Has %s extracted files ready to be normalized" + % fmt.bold(str(len(extracted_files))) + ) norm_packages = p.list_normalized_load_packages() if norm_packages: - fmt.echo("Has %s load packages ready to be loaded with following load ids:" % fmt.bold(str(len(norm_packages)))) + fmt.echo( + "Has %s load packages ready to be loaded with following load ids:" + % fmt.bold(str(len(norm_packages))) + ) for load_id in norm_packages: fmt.echo(load_id) # load first (oldest) package first_package_info = p.get_load_package_info(norm_packages[0]) if LoadStorage.is_package_partially_loaded(first_package_info): - fmt.warning("This package is partially loaded. Data in the destination may be modified.") + fmt.warning( + "This package is partially loaded. Data in the destination may be modified." + ) fmt.echo() return extracted_files, norm_packages @@ -77,7 +103,9 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: with signals.delayed_signals(): venv = Venv.restore_current() - for line in iter_stdout(venv, "streamlit", "run", streamlit_helper.__file__, pipeline_name): + for line in iter_stdout( + venv, "streamlit", "run", streamlit_helper.__file__, pipeline_name + ): fmt.echo(line) if operation == "info": @@ -105,24 +133,38 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.warning("This pipeline does not have a default schema") else: is_single_schema = len(p.schema_names) == 1 - for schema_name in p.schema_names: + for schema_name in p.schema_names: fmt.echo("Resources in schema: %s" % fmt.bold(schema_name)) schema = p.schemas[schema_name] data_tables = {t["name"]: t for t in schema.data_tables()} for resource_name, tables in group_tables_by_resource(data_tables).items(): res_state_slots = 0 if sources_state: - source_state = next(iter(sources_state.items()))[1] if is_single_schema else sources_state.get(schema_name) + source_state = ( + next(iter(sources_state.items()))[1] + if is_single_schema + else sources_state.get(schema_name) + ) if source_state: resource_state_ = resource_state(resource_name, source_state) res_state_slots = len(resource_state_) - fmt.echo("%s with %s table(s) and %s resource state slot(s)" % (fmt.bold(resource_name), fmt.bold(str(len(tables))), fmt.bold(str(res_state_slots)))) + fmt.echo( + "%s with %s table(s) and %s resource state slot(s)" + % ( + fmt.bold(resource_name), + fmt.bold(str(len(tables))), + fmt.bold(str(res_state_slots)), + ) + ) fmt.echo() fmt.echo("Working dir content:") _display_pending_packages() loaded_packages = p.list_completed_load_packages() if loaded_packages: - fmt.echo("Has %s completed load packages with following load ids:" % fmt.bold(str(len(loaded_packages)))) + fmt.echo( + "Has %s completed load packages with following load ids:" + % fmt.bold(str(len(loaded_packages))) + ) for load_id in loaded_packages: fmt.echo(load_id) fmt.echo() @@ -130,7 +172,10 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: if trace is None or len(trace.steps) == 0: fmt.echo("Pipeline does not have last run trace.") else: - fmt.echo("Pipeline has last run trace. Use 'dlt pipeline %s trace' to inspect " % pipeline_name) + fmt.echo( + "Pipeline has last run trace. 
Use 'dlt pipeline %s trace' to inspect " + % pipeline_name + ) if operation == "trace": trace = p.last_trace @@ -147,7 +192,13 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: failed_jobs = p.list_failed_jobs_in_package(load_id) if failed_jobs: for failed_job in p.list_failed_jobs_in_package(load_id): - fmt.echo("JOB: %s(%s)" % (fmt.bold(failed_job.job_file_info.job_id()), fmt.bold(failed_job.job_file_info.table_name))) + fmt.echo( + "JOB: %s(%s)" + % ( + fmt.bold(failed_job.job_file_info.job_id()), + fmt.bold(failed_job.job_file_info.table_name), + ) + ) fmt.echo("JOB file type: %s" % fmt.bold(failed_job.job_file_info.file_format)) fmt.echo("JOB file path: %s" % fmt.bold(failed_job.file_path)) if verbosity > 0: @@ -166,24 +217,32 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.echo("Pending packages deleted") if operation == "sync": - if fmt.confirm("About to drop the local state of the pipeline and reset all the schemas. The destination state, data and schemas are left intact. Proceed?", default=False): + if fmt.confirm( + "About to drop the local state of the pipeline and reset all the schemas. The" + " destination state, data and schemas are left intact. Proceed?", + default=False, + ): fmt.echo("Dropping local state") p = p.drop() fmt.echo("Restoring from destination") p.sync_destination() if operation == "load-package": - load_id = command_kwargs.get('load_id') + load_id = command_kwargs.get("load_id") if not load_id: packages = sorted(p.list_normalized_load_packages()) if not packages: packages = sorted(p.list_completed_load_packages()) if not packages: - raise CliCommandException("pipeline", "There are no load packages for that pipeline") + raise CliCommandException( + "pipeline", "There are no load packages for that pipeline" + ) load_id = packages[-1] package_info = p.get_load_package_info(load_id) - fmt.echo("Package %s found in %s" % (fmt.bold(load_id), fmt.bold(package_info.package_path))) + fmt.echo( + "Package %s found in %s" % (fmt.bold(load_id), fmt.bold(package_info.package_path)) + ) fmt.echo(package_info.asstr(verbosity)) if len(package_info.schema_update) > 0: if verbosity == 0: @@ -191,7 +250,9 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: else: tables = remove_defaults({"tables": package_info.schema_update}) # type: ignore fmt.echo(fmt.bold("Schema update:")) - fmt.echo(yaml.dump(tables, allow_unicode=True, default_flow_style=False, sort_keys=False)) + fmt.echo( + yaml.dump(tables, allow_unicode=True, default_flow_style=False, sort_keys=False) + ) if operation == "schema": if not p.default_schema_name: @@ -204,7 +265,10 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: if operation == "drop": drop = DropCommand(p, **command_kwargs) if drop.is_empty: - fmt.echo("Could not select any resources to drop and no resource/source state to reset. Use the command below to inspect the pipeline:") + fmt.echo( + "Could not select any resources to drop and no resource/source state to reset. 
Use" + " the command below to inspect the pipeline:" + ) fmt.echo(f"dlt pipeline -v {p.pipeline_name} info") if len(drop.info["warnings"]): fmt.echo("Additional warnings are available") @@ -212,12 +276,23 @@ def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: fmt.warning(warning) return - fmt.echo("About to drop the following data in dataset %s in destination %s:" % (fmt.bold(drop.info["dataset_name"]), fmt.bold(p.destination.name))) + fmt.echo( + "About to drop the following data in dataset %s in destination %s:" + % (fmt.bold(drop.info["dataset_name"]), fmt.bold(p.destination.name)) + ) fmt.echo("%s: %s" % (fmt.style("Selected schema", fg="green"), drop.info["schema_name"])) - fmt.echo("%s: %s" % (fmt.style("Selected resource(s)", fg="green"), drop.info["resource_names"])) + fmt.echo( + "%s: %s" % (fmt.style("Selected resource(s)", fg="green"), drop.info["resource_names"]) + ) fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"])) - fmt.echo("%s: %s" % (fmt.style("Resource(s) state to reset", fg="green"), drop.info["resource_states"])) - fmt.echo("%s: %s" % (fmt.style("Source state path(s) to reset", fg="green"), drop.info["state_paths"])) + fmt.echo( + "%s: %s" + % (fmt.style("Resource(s) state to reset", fg="green"), drop.info["resource_states"]) + ) + fmt.echo( + "%s: %s" + % (fmt.style("Source state path(s) to reset", fg="green"), drop.info["state_paths"]) + ) # for k, v in drop.info.items(): # fmt.echo("%s: %s" % (fmt.style(k, fg="green"), v)) for warning in drop.info["warnings"]: diff --git a/dlt/cli/pipeline_files.py b/dlt/cli/pipeline_files.py index acd3a95e80..49c0f71b21 100644 --- a/dlt/cli/pipeline_files.py +++ b/dlt/cli/pipeline_files.py @@ -65,17 +65,14 @@ def _load_dot_sources() -> TVerifiedSourcesFileIndex: raise FileNotFoundError(SOURCES_INIT_INFO_FILE) return index except FileNotFoundError: - return { - "engine_version": SOURCES_INIT_INFO_ENGINE_VERSION, - "sources": {} - } + return {"engine_version": SOURCES_INIT_INFO_ENGINE_VERSION, "sources": {}} def _merge_remote_index( local_index: TVerifiedSourceFileIndex, remote_index: TVerifiedSourceFileIndex, remote_modified: Dict[str, TVerifiedSourceFileEntry], - remote_deleted: Dict[str, TVerifiedSourceFileEntry] + remote_deleted: Dict[str, TVerifiedSourceFileEntry], ) -> TVerifiedSourceFileIndex: # update all modified files local_index["files"].update(remote_modified) @@ -92,13 +89,15 @@ def _merge_remote_index( def load_verified_sources_local_index(source_name: str) -> TVerifiedSourceFileIndex: - return _load_dot_sources()["sources"].get(source_name, { - "is_dirty": False, - "last_commit_sha": None, - "last_commit_timestamp": None, - "files": {}, - "dlt_version_constraint": ">=0.1.0" - } + return _load_dot_sources()["sources"].get( + source_name, + { + "is_dirty": False, + "last_commit_sha": None, + "last_commit_timestamp": None, + "files": {}, + "dlt_version_constraint": ">=0.1.0", + }, ) @@ -106,17 +105,17 @@ def save_verified_source_local_index( source_name: str, remote_index: TVerifiedSourceFileIndex, remote_modified: Dict[str, TVerifiedSourceFileEntry], - remote_deleted: Dict[str, TVerifiedSourceFileEntry] + remote_deleted: Dict[str, TVerifiedSourceFileEntry], ) -> None: - all_sources = _load_dot_sources() local_index = all_sources["sources"].setdefault(source_name, remote_index) _merge_remote_index(local_index, remote_index, remote_modified, remote_deleted) _save_dot_sources(all_sources) -def get_remote_source_index(repo_path: str, files: Sequence[str], 
dlt_version_constraint: str) -> TVerifiedSourceFileIndex: - +def get_remote_source_index( + repo_path: str, files: Sequence[str], dlt_version_constraint: str +) -> TVerifiedSourceFileIndex: with git.get_repo(repo_path) as repo: tree = repo.tree() commit_sha = repo.head.commit.hexsha @@ -136,7 +135,7 @@ def get_remote_source_index(repo_path: str, files: Sequence[str], dlt_version_co files_sha[file] = { "commit_sha": commit_sha, "git_sha": blob_sha3, - "sha3_256": hashlib.sha3_256(file_blob).hexdigest() + "sha3_256": hashlib.sha3_256(file_blob).hexdigest(), } return { @@ -144,26 +143,37 @@ def get_remote_source_index(repo_path: str, files: Sequence[str], dlt_version_co "last_commit_sha": commit_sha, "last_commit_timestamp": repo.head.commit.committed_datetime.isoformat(), "files": files_sha, - "dlt_version_constraint": dlt_version_constraint + "dlt_version_constraint": dlt_version_constraint, } def get_verified_source_names(sources_storage: FileStorage) -> List[str]: candidates: List[str] = [] - for name in [n for n in sources_storage.list_folder_dirs(".", to_root=False) if not any(fnmatch.fnmatch(n, ignore) for ignore in IGNORE_SOURCES)]: + for name in [ + n + for n in sources_storage.list_folder_dirs(".", to_root=False) + if not any(fnmatch.fnmatch(n, ignore) for ignore in IGNORE_SOURCES) + ]: # must contain at least one valid python script if any(f.endswith(".py") for f in sources_storage.list_folder_files(name, to_root=False)): candidates.append(name) return candidates -def get_verified_source_files(sources_storage: FileStorage, source_name: str) -> VerifiedSourceFiles: +def get_verified_source_files( + sources_storage: FileStorage, source_name: str +) -> VerifiedSourceFiles: if not sources_storage.has_folder(source_name): - raise VerifiedSourceRepoError(f"Verified source {source_name} could not be found in the repository", source_name) + raise VerifiedSourceRepoError( + f"Verified source {source_name} could not be found in the repository", source_name + ) # find example script example_script = f"{source_name}_pipeline.py" if not sources_storage.has_file(example_script): - raise VerifiedSourceRepoError(f"Pipeline example script {example_script} could not be found in the repository", source_name) + raise VerifiedSourceRepoError( + f"Pipeline example script {example_script} could not be found in the repository", + source_name, + ) # get all files recursively files: List[str] = [] for root, subdirs, _files in os.walk(sources_storage.make_full_path(source_name)): @@ -172,9 +182,15 @@ def get_verified_source_files(sources_storage: FileStorage, source_name: str) -> if any(fnmatch.fnmatch(subdir, ignore) for ignore in IGNORE_FILES): subdirs.remove(subdir) rel_root = sources_storage.to_relative_path(root) - files.extend([os.path.join(rel_root, file) for file in _files if all(not fnmatch.fnmatch(file, ignore) for ignore in IGNORE_FILES)]) + files.extend( + [ + os.path.join(rel_root, file) + for file in _files + if all(not fnmatch.fnmatch(file, ignore) for ignore in IGNORE_FILES) + ] + ) # read the docs - init_py = os.path.join(source_name, utils.MODULE_INIT) + init_py = os.path.join(source_name, utils.MODULE_INIT) docstring: str = "" if sources_storage.has_file(init_py): docstring = get_module_docstring(sources_storage.load(init_py)) @@ -187,14 +203,18 @@ def get_verified_source_files(sources_storage: FileStorage, source_name: str) -> else: requirements = SourceRequirements([]) # find requirements - return VerifiedSourceFiles(False, sources_storage, example_script, example_script, files, 
requirements, docstring) + return VerifiedSourceFiles( + False, sources_storage, example_script, example_script, files, requirements, docstring + ) def gen_index_diff( - local_index: TVerifiedSourceFileIndex, - remote_index: TVerifiedSourceFileIndex -) -> Tuple[Dict[str, TVerifiedSourceFileEntry], Dict[str, TVerifiedSourceFileEntry], Dict[str, TVerifiedSourceFileEntry]]: - + local_index: TVerifiedSourceFileIndex, remote_index: TVerifiedSourceFileIndex +) -> Tuple[ + Dict[str, TVerifiedSourceFileEntry], + Dict[str, TVerifiedSourceFileEntry], + Dict[str, TVerifiedSourceFileEntry], +]: deleted: Dict[str, TVerifiedSourceFileEntry] = {} modified: Dict[str, TVerifiedSourceFileEntry] = {} new: Dict[str, TVerifiedSourceFileEntry] = {} @@ -223,7 +243,7 @@ def find_conflict_files( remote_new: Dict[str, TVerifiedSourceFileEntry], remote_modified: Dict[str, TVerifiedSourceFileEntry], remote_deleted: Dict[str, TVerifiedSourceFileEntry], - dest_storage: FileStorage + dest_storage: FileStorage, ) -> Tuple[List[str], List[str]]: """Use files index from .sources to identify modified files via sha3 content hash""" diff --git a/dlt/cli/requirements.py b/dlt/cli/requirements.py index 79907ae01c..5b16f7a60f 100644 --- a/dlt/cli/requirements.py +++ b/dlt/cli/requirements.py @@ -7,6 +7,7 @@ class SourceRequirements: """Helper class to parse and manipulate entries in source's requirements.txt""" + dlt_requirement: Requirement """Final dlt requirement that may be updated with destination extras""" dlt_requirement_base: Requirement diff --git a/dlt/cli/source_detection.py b/dlt/cli/source_detection.py index 369663b82f..636615af61 100644 --- a/dlt/cli/source_detection.py +++ b/dlt/cli/source_detection.py @@ -14,7 +14,9 @@ from dlt.reflection.script_visitor import PipelineScriptVisitor -def find_call_arguments_to_replace(visitor: PipelineScriptVisitor, replace_nodes: List[Tuple[str, str]], init_script_name: str) -> List[Tuple[ast.AST, ast.AST]]: +def find_call_arguments_to_replace( + visitor: PipelineScriptVisitor, replace_nodes: List[Tuple[str, str]], init_script_name: str +) -> List[Tuple[ast.AST, ast.AST]]: # the input tuple (call argument name, replacement value) # the returned tuple (node, replacement value, node type) transformed_nodes: List[Tuple[ast.AST, ast.AST]] = [] @@ -26,7 +28,11 @@ def find_call_arguments_to_replace(visitor: PipelineScriptVisitor, replace_nodes dn_node: ast.AST = args.arguments.get(t_arg_name) if dn_node is not None: if not isinstance(dn_node, ast.Constant) or not isinstance(dn_node.value, str): - raise CliCommandException("init", f"The pipeline script {init_script_name} must pass the {t_arg_name} as string to '{arg_name}' function in line {dn_node.lineno}") + raise CliCommandException( + "init", + f"The pipeline script {init_script_name} must pass the {t_arg_name} as" + f" string to '{arg_name}' function in line {dn_node.lineno}", + ) else: transformed_nodes.append((dn_node, ast.Constant(value=t_value, kind=None))) replaced_args.add(t_arg_name) @@ -34,27 +40,39 @@ def find_call_arguments_to_replace(visitor: PipelineScriptVisitor, replace_nodes # there was at least one replacement for t_arg_name, _ in replace_nodes: if t_arg_name not in replaced_args: - raise CliCommandException("init", f"The pipeline script {init_script_name} is not explicitly passing the '{t_arg_name}' argument to 'pipeline' or 'run' function. 
In init script the default and configured values are not accepted.") + raise CliCommandException( + "init", + f"The pipeline script {init_script_name} is not explicitly passing the" + f" '{t_arg_name}' argument to 'pipeline' or 'run' function. In init script the" + " default and configured values are not accepted.", + ) return transformed_nodes -def find_source_calls_to_replace(visitor: PipelineScriptVisitor, pipeline_name: str) -> List[Tuple[ast.AST, ast.AST]]: +def find_source_calls_to_replace( + visitor: PipelineScriptVisitor, pipeline_name: str +) -> List[Tuple[ast.AST, ast.AST]]: transformed_nodes: List[Tuple[ast.AST, ast.AST]] = [] for source_def in visitor.known_sources_resources.values(): # append function name to be replaced - transformed_nodes.append((creates_func_def_name_node(source_def, visitor.source_lines), ast.Name(id=pipeline_name + "_" + source_def.name))) + transformed_nodes.append( + ( + creates_func_def_name_node(source_def, visitor.source_lines), + ast.Name(id=pipeline_name + "_" + source_def.name), + ) + ) for calls in visitor.known_sources_resources_calls.values(): for call in calls: - transformed_nodes.append((call.func, ast.Name(id=pipeline_name + "_" + unparse(call.func)))) + transformed_nodes.append( + (call.func, ast.Name(id=pipeline_name + "_" + unparse(call.func))) + ) return transformed_nodes def detect_source_configs( - sources: Dict[str, SourceInfo], - module_prefix: str, - section: Tuple[str, ...] + sources: Dict[str, SourceInfo], module_prefix: str, section: Tuple[str, ...] ) -> Tuple[Dict[str, WritableConfigValue], Dict[str, WritableConfigValue], Dict[str, SourceInfo]]: # all detected secrets with sections required_secrets: Dict[str, WritableConfigValue] = {} @@ -75,11 +93,15 @@ def detect_source_configs( if is_secret_hint(field_type): val_store = required_secrets # all configs that are required and do not have a default value must go to config.toml - elif not is_optional_type(field_type) and getattr(source_config, field_name) is None: + elif ( + not is_optional_type(field_type) and getattr(source_config, field_name) is None + ): val_store = required_config if val_store is not None: # we are sure that all resources come from single file so we can put them in single section - val_store[source_name + ":" + field_name] = WritableConfigValue(field_name, field_type, None, section) + val_store[source_name + ":" + field_name] = WritableConfigValue( + field_name, field_type, None, section + ) return required_secrets, required_config, checked_sources diff --git a/dlt/cli/telemetry_command.py b/dlt/cli/telemetry_command.py index 574005797a..bb451ea979 100644 --- a/dlt/cli/telemetry_command.py +++ b/dlt/cli/telemetry_command.py @@ -24,7 +24,9 @@ def telemetry_status_command() -> None: def change_telemetry_status_command(enabled: bool) -> None: # value to write - telemetry_value = [WritableConfigValue("dlthub_telemetry", bool, enabled, (RunConfiguration.__section__, ))] + telemetry_value = [ + WritableConfigValue("dlthub_telemetry", bool, enabled, (RunConfiguration.__section__,)) + ] # write local config config = ConfigTomlProvider(add_global_config=False) if not config.is_empty: diff --git a/dlt/cli/utils.py b/dlt/cli/utils.py index 996770b40d..5ea4471d7e 100644 --- a/dlt/cli/utils.py +++ b/dlt/cli/utils.py @@ -25,14 +25,20 @@ MODULE_INIT = "__init__.py" -def parse_init_script(command: str, script_source: str, init_script_name: str) -> PipelineScriptVisitor: +def parse_init_script( + command: str, script_source: str, init_script_name: str +) -> 
PipelineScriptVisitor: # parse the script first tree = ast.parse(source=script_source) set_ast_parents(tree) visitor = PipelineScriptVisitor(script_source) visitor.visit_passes(tree) if len(visitor.mod_aliases) == 0: - raise CliCommandException(command, f"The pipeline script {init_script_name} does not import dlt and does not seem to run any pipelines") + raise CliCommandException( + command, + f"The pipeline script {init_script_name} does not import dlt and does not seem to run" + " any pipelines", + ) return visitor @@ -45,8 +51,9 @@ def ensure_git_command(command: str) -> None: raise raise CliCommandException( command, - "'git' command is not available. Install and setup git with the following the guide %s" % "https://docs.github.com/en/get-started/quickstart/set-up-git", - imp_ex + "'git' command is not available. Install and setup git with the following the guide %s" + % "https://docs.github.com/en/get-started/quickstart/set-up-git", + imp_ex, ) from imp_ex diff --git a/dlt/common/arithmetics.py b/dlt/common/arithmetics.py index 5277acad4f..56d8fcd49b 100644 --- a/dlt/common/arithmetics.py +++ b/dlt/common/arithmetics.py @@ -1,7 +1,18 @@ -import decimal # noqa: I251 +import decimal # noqa: I251 from contextlib import contextmanager from typing import Iterator -from decimal import ROUND_HALF_UP, Decimal, Inexact, DivisionByZero, DefaultContext, InvalidOperation, localcontext, Context, Subnormal, ConversionSyntax # noqa: I251 +from decimal import ( # noqa: I251 + ROUND_HALF_UP, + Decimal, + Inexact, + DivisionByZero, + DefaultContext, + InvalidOperation, + localcontext, + Context, + Subnormal, + ConversionSyntax, +) DEFAULT_NUMERIC_PRECISION = 38 diff --git a/dlt/common/configuration/__init__.py b/dlt/common/configuration/__init__.py index a5ffd3e7b8..b7d868ff8b 100644 --- a/dlt/common/configuration/__init__.py +++ b/dlt/common/configuration/__init__.py @@ -7,7 +7,7 @@ ConfigFieldMissingException, ConfigValueCannotBeCoercedException, ConfigFileNotFoundException, - ConfigurationValueError + ConfigurationValueError, ) diff --git a/dlt/common/configuration/accessors.py b/dlt/common/configuration/accessors.py index cf71db7030..1d6ef221b9 100644 --- a/dlt/common/configuration/accessors.py +++ b/dlt/common/configuration/accessors.py @@ -15,8 +15,8 @@ DLT_CONFIG_VALUE = "config.value" TConfigAny = TypeVar("TConfigAny", bound=Any) -class _Accessor(abc.ABC): +class _Accessor(abc.ABC): def __getitem__(self, field: str) -> Any: value, traces = self._get_value(field) if value is None: @@ -100,7 +100,11 @@ def default_type(self) -> AnyType: @property def writable_provider(self) -> ConfigProvider: """find first writable provider that does not support secrets - should be config.toml""" - return next(p for p in self._get_providers_from_context() if p.is_writable and not p.supports_secrets) + return next( + p + for p in self._get_providers_from_context() + if p.is_writable and not p.supports_secrets + ) value: ClassVar[None] = ConfigValue "A placeholder that tells dlt to replace it with actual config value during the call to a source or resource decorated function." 
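# --- Illustrative usage sketch (not part of the diff above) ---
# The accessor changes above concern the `dlt.config.value` / `dlt.secrets.value`
# placeholders that get replaced with provider-resolved values when a decorated
# function is called. A minimal, hedged example; the source name and the
# `api_url` / `api_key` arguments below are hypothetical.
import dlt

@dlt.source
def hypothetical_source(api_url: str = dlt.config.value, api_key: str = dlt.secrets.value):
    # `api_url` is expected from config providers (e.g. config.toml or env vars),
    # `api_key` from secret-capable providers (e.g. secrets.toml or env vars)
    @dlt.resource
    def items():
        yield {"url": api_url, "has_key": api_key is not None}

    return items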
@@ -121,7 +125,9 @@ def default_type(self) -> AnyType: @property def writable_provider(self) -> ConfigProvider: """find first writable provider that supports secrets - should be secrets.toml""" - return next(p for p in self._get_providers_from_context() if p.is_writable and p.supports_secrets) + return next( + p for p in self._get_providers_from_context() if p.is_writable and p.supports_secrets + ) value: ClassVar[None] = ConfigValue "A placeholder that tells dlt to replace it with actual secret during the call to a source or resource decorated function." diff --git a/dlt/common/configuration/container.py b/dlt/common/configuration/container.py index 46d64f7a37..c410d18dd9 100644 --- a/dlt/common/configuration/container.py +++ b/dlt/common/configuration/container.py @@ -2,7 +2,10 @@ from typing import Dict, Iterator, Type, TypeVar from dlt.common.configuration.specs.base_configuration import ContainerInjectableContext -from dlt.common.configuration.exceptions import ContainerInjectableContextMangled, ContextDefaultCannotBeCreated +from dlt.common.configuration.exceptions import ( + ContainerInjectableContextMangled, + ContextDefaultCannotBeCreated, +) TConfiguration = TypeVar("TConfiguration", bound=ContainerInjectableContext) @@ -60,7 +63,6 @@ def __delitem__(self, spec: Type[TConfiguration]) -> None: def __contains__(self, spec: Type[TConfiguration]) -> bool: return spec in self.contexts - @contextmanager def injectable_context(self, config: TConfiguration) -> Iterator[TConfiguration]: """A context manager that will insert `config` into the container and restore the previous value when it gets out of scope.""" diff --git a/dlt/common/configuration/exceptions.py b/dlt/common/configuration/exceptions.py index f019565013..91aa3e7ad3 100644 --- a/dlt/common/configuration/exceptions.py +++ b/dlt/common/configuration/exceptions.py @@ -22,17 +22,22 @@ class ConfigurationValueError(ConfigurationException, ValueError): class ContainerException(DltException): """base exception for all exceptions related to injectable container""" + pass class ConfigProviderException(ConfigurationException): """base exceptions for all exceptions raised by config providers""" + pass class ConfigurationWrongTypeException(ConfigurationException): def __init__(self, _typ: type) -> None: - super().__init__(f"Invalid configuration instance type {_typ}. Configuration instances must derive from BaseConfiguration.") + super().__init__( + f"Invalid configuration instance type {_typ}. Configuration instances must derive from" + " BaseConfiguration." + ) class ConfigFieldMissingException(KeyError, ConfigurationException): @@ -45,42 +50,60 @@ def __init__(self, spec_name: str, traces: Mapping[str, Sequence[LookupTrace]]) super().__init__(spec_name) def __str__(self) -> str: - msg = f"Following fields are missing: {str(self.fields)} in configuration with spec {self.spec_name}\n" + msg = ( + f"Following fields are missing: {str(self.fields)} in configuration with spec" + f" {self.spec_name}\n" + ) for f, field_traces in self.traces.items(): msg += f'\tfor field "{f}" config providers and keys were tried in following order:\n' for tr in field_traces: - msg += f'\t\tIn {tr.provider} key {tr.key} was not found.\n' + msg += f"\t\tIn {tr.provider} key {tr.key} was not found.\n" # check if entry point is run with path. 
this is common problem so warn the user main_path = main_module_file_path() main_dir = os.path.dirname(main_path) abs_main_dir = os.path.abspath(main_dir) if abs_main_dir != os.getcwd(): # directory was specified - msg += "WARNING: dlt looks for .dlt folder in your current working directory and your cwd (%s) is different from directory of your pipeline script (%s).\n" % (os.getcwd(), abs_main_dir) - msg += "If you keep your secret files in the same folder as your pipeline script but run your script from some other folder, secrets/configs will not be found\n" - msg += "Please refer to https://dlthub.com/docs/general-usage/credentials for more information\n" + msg += ( + "WARNING: dlt looks for .dlt folder in your current working directory and your cwd" + " (%s) is different from directory of your pipeline script (%s).\n" + % (os.getcwd(), abs_main_dir) + ) + msg += ( + "If you keep your secret files in the same folder as your pipeline script but run" + " your script from some other folder, secrets/configs will not be found\n" + ) + msg += ( + "Please refer to https://dlthub.com/docs/general-usage/credentials for more" + " information\n" + ) return msg class UnmatchedConfigHintResolversException(ConfigurationException): """Raised when using `@resolve_type` on a field that doesn't exist in the spec""" + def __init__(self, spec_name: str, field_names: Sequence[str]) -> None: self.field_names = field_names self.spec_name = spec_name - example = f">>> class {spec_name}(BaseConfiguration)\n" + "\n".join(f">>> {name}: Any" for name in field_names) + example = f">>> class {spec_name}(BaseConfiguration)\n" + "\n".join( + f">>> {name}: Any" for name in field_names + ) msg = ( - f"The config spec {spec_name} has dynamic type resolvers for fields: {field_names} " - "but these fields are not defined in the spec.\n" - "When using @resolve_type() decorator, Add the fields with 'Any' or another common type hint, example:\n" - f"\n{example}" + f"The config spec {spec_name} has dynamic type resolvers for fields: {field_names} but" + " these fields are not defined in the spec.\nWhen using @resolve_type() decorator, Add" + f" the fields with 'Any' or another common type hint, example:\n\n{example}" ) super().__init__(msg) class FinalConfigFieldException(ConfigurationException): """rises when field was annotated as final ie Final[str] and the value is modified by config provider""" + def __init__(self, spec_name: str, field: str) -> None: - super().__init__(f"Field {field} in spec {spec_name} is final but is being changed by a config provider") + super().__init__( + f"Field {field} in spec {spec_name} is final but is being changed by a config provider" + ) class ConfigValueCannotBeCoercedException(ConfigurationValueError): @@ -90,7 +113,9 @@ def __init__(self, field_name: str, field_value: Any, hint: type) -> None: self.field_name = field_name self.field_value = field_value self.hint = hint - super().__init__('Configured value for field %s cannot be coerced into type %s' % (field_name, str(hint))) + super().__init__( + "Configured value for field %s cannot be coerced into type %s" % (field_name, str(hint)) + ) # class ConfigIntegrityException(ConfigurationException): @@ -116,7 +141,9 @@ class ConfigFieldMissingTypeHintException(ConfigurationException): def __init__(self, field_name: str, spec: Type[Any]) -> None: self.field_name = field_name self.typ_ = spec - super().__init__(f"Field {field_name} on configspec {spec} does not provide required type hint") + super().__init__( + f"Field {field_name} on configspec 
{spec} does not provide required type hint" + ) class ConfigFieldTypeHintNotSupported(ConfigurationException): @@ -125,25 +152,39 @@ class ConfigFieldTypeHintNotSupported(ConfigurationException): def __init__(self, field_name: str, spec: Type[Any], typ_: Type[Any]) -> None: self.field_name = field_name self.typ_ = spec - super().__init__(f"Field {field_name} on configspec {spec} has hint with unsupported type {typ_}") + super().__init__( + f"Field {field_name} on configspec {spec} has hint with unsupported type {typ_}" + ) class ValueNotSecretException(ConfigurationException): def __init__(self, provider_name: str, key: str) -> None: self.provider_name = provider_name self.key = key - super().__init__(f"Provider {provider_name} cannot hold secret values but key {key} with secret value is present") + super().__init__( + f"Provider {provider_name} cannot hold secret values but key {key} with secret value is" + " present" + ) class InvalidNativeValue(ConfigurationException): - def __init__(self, spec: Type[Any], native_value_type: Type[Any], embedded_sections: Tuple[str, ...], inner_exception: Exception) -> None: + def __init__( + self, + spec: Type[Any], + native_value_type: Type[Any], + embedded_sections: Tuple[str, ...], + inner_exception: Exception, + ) -> None: self.spec = spec self.native_value_type = native_value_type self.embedded_sections = embedded_sections self.inner_exception = inner_exception inner_msg = f" {self.inner_exception}" if inner_exception is not ValueError else "" super().__init__( - f"{spec.__name__} cannot parse the configuration value provided. The value is of type {native_value_type.__name__} and comes from the {embedded_sections} section(s).{inner_msg}") + f"{spec.__name__} cannot parse the configuration value provided. The value is of type" + f" {native_value_type.__name__} and comes from the" + f" {embedded_sections} section(s).{inner_msg}" + ) class ContainerInjectableContextMangled(ContainerException): @@ -151,7 +192,10 @@ def __init__(self, spec: Type[Any], existing_config: Any, expected_config: Any) self.spec = spec self.existing_config = existing_config self.expected_config = expected_config - super().__init__(f"When restoring context {spec.__name__}, instance {expected_config} was expected, instead instance {existing_config} was found.") + super().__init__( + f"When restoring context {spec.__name__}, instance {expected_config} was expected," + f" instead instance {existing_config} was found." + ) class ContextDefaultCannotBeCreated(ContainerException, KeyError): @@ -163,4 +207,6 @@ def __init__(self, spec: Type[Any]) -> None: class DuplicateConfigProviderException(ConfigProviderException): def __init__(self, provider_name: str) -> None: self.provider_name = provider_name - super().__init__(f"Provider with name {provider_name} already present in ConfigProvidersContext") + super().__init__( + f"Provider with name {provider_name} already present in ConfigProvidersContext" + ) diff --git a/dlt/common/configuration/inject.py b/dlt/common/configuration/inject.py index f50e947011..6478c3258c 100644 --- a/dlt/common/configuration/inject.py +++ b/dlt/common/configuration/inject.py @@ -34,8 +34,7 @@ def with_config( auto_pipeline_section: bool = False, include_defaults: bool = True, accept_partial: bool = False, -) -> TFun: - ... +) -> TFun: ... @overload @@ -48,8 +47,7 @@ def with_config( auto_pipeline_section: bool = False, include_defaults: bool = True, accept_partial: bool = False, -) -> Callable[[TFun], TFun]: - ... +) -> Callable[[TFun], TFun]: ... 
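# --- Illustrative usage sketch (not part of the diff above) ---
# How the `with_config` overloads above are typically applied: arguments marked with
# `dlt.config.value` are injected from config providers under the given sections.
# The section path ("sources", "chess") and the environment variable are hypothetical.
import os
import dlt
from dlt.common.configuration.inject import with_config

@with_config(sections=("sources", "chess"))
def get_players_url(url: str = dlt.config.value, limit: int = 10) -> str:
    # `url` must be resolved by a provider; `limit` falls back to its default
    return f"{url}?limit={limit}"

os.environ["SOURCES__CHESS__URL"] = "https://example.com/api/players"
print(get_players_url())  # https://example.com/api/players?limit=10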
def with_config( @@ -62,7 +60,7 @@ def with_config( include_defaults: bool = True, accept_partial: bool = False, initial_config: Optional[BaseConfiguration] = None, -) -> Callable[[TFun], TFun]: +) -> Callable[[TFun], TFun]: """Injects values into decorated function arguments following the specification in `spec` or by deriving one from function's signature. The synthesized spec contains the arguments marked with `dlt.secrets.value` and `dlt.config.value` which are required to be injected at runtime. @@ -87,7 +85,9 @@ def with_config( def decorator(f: TFun) -> TFun: SPEC: Type[BaseConfiguration] = None sig: Signature = inspect.signature(f) - kwargs_arg = next((p for p in sig.parameters.values() if p.kind == Parameter.VAR_KEYWORD), None) + kwargs_arg = next( + (p for p in sig.parameters.values() if p.kind == Parameter.VAR_KEYWORD), None + ) spec_arg: Parameter = None pipeline_name_arg: Parameter = None @@ -111,7 +111,6 @@ def decorator(f: TFun) -> TFun: pipeline_name_arg = p pipeline_name_arg_default = None if p.default == Parameter.empty else p.default - @wraps(f) def _wrap(*args: Any, **kwargs: Any) -> Any: # bind parameters to signature @@ -123,7 +122,7 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: else: # if section derivation function was provided then call it if section_f: - curr_sections: Tuple[str, ...] = (section_f(bound_args.arguments), ) + curr_sections: Tuple[str, ...] = (section_f(bound_args.arguments),) # sections may be a string elif isinstance(sections, str): curr_sections = (sections,) @@ -137,15 +136,25 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: config = bound_args.arguments.get(spec_arg.name, None) # resolve SPEC, also provide section_context with pipeline_name if pipeline_name_arg: - curr_pipeline_name = bound_args.arguments.get(pipeline_name_arg.name, pipeline_name_arg_default) + curr_pipeline_name = bound_args.arguments.get( + pipeline_name_arg.name, pipeline_name_arg_default + ) else: curr_pipeline_name = None - section_context = ConfigSectionContext(pipeline_name=curr_pipeline_name, sections=curr_sections, merge_style=sections_merge_style) + section_context = ConfigSectionContext( + pipeline_name=curr_pipeline_name, + sections=curr_sections, + merge_style=sections_merge_style, + ) # this may be called from many threads so make sure context is not mangled with _RESOLVE_LOCK: with inject_section(section_context): # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") - config = resolve_configuration(config or SPEC(), explicit_value=bound_args.arguments, accept_partial=accept_partial) + config = resolve_configuration( + config or SPEC(), + explicit_value=bound_args.arguments, + accept_partial=accept_partial, + ) resolved_params = dict(config) # overwrite or add resolved params for p in sig.parameters.values(): @@ -175,14 +184,17 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: return decorator if not callable(func): - raise ValueError("First parameter to the with_config must be callable ie. by using it as function decorator") + raise ValueError( + "First parameter to the with_config must be callable ie. by using it as function" + " decorator" + ) # We're called as @with_config without parens. 
return decorator(func) def last_config(**kwargs: Any) -> Any: - """Get configuration instance used to inject function arguments """ + """Get configuration instance used to inject function arguments""" return kwargs[_LAST_DLT_CONFIG] diff --git a/dlt/common/configuration/paths.py b/dlt/common/configuration/paths.py index f773a779f8..780896ab4b 100644 --- a/dlt/common/configuration/paths.py +++ b/dlt/common/configuration/paths.py @@ -27,11 +27,11 @@ def make_dlt_settings_path(path: str) -> str: def get_dlt_data_dir() -> str: - """ Gets default directory where pipelines' data will be stored - 1. in user home directory: ~/.dlt/ - 2. if current user is root: in /var/dlt/ - 3. if current user does not have a home directory: in /tmp/dlt/ - 4. if DLT_DATA_DIR is set in env then it is used + """Gets default directory where pipelines' data will be stored + 1. in user home directory: ~/.dlt/ + 2. if current user is root: in /var/dlt/ + 3. if current user does not have a home directory: in /tmp/dlt/ + 4. if DLT_DATA_DIR is set in env then it is used """ if "DLT_DATA_DIR" in os.environ: return os.environ["DLT_DATA_DIR"] @@ -49,5 +49,6 @@ def get_dlt_data_dir() -> str: # if home directory is available use ~/.dlt/pipelines return os.path.join(home, DOT_DLT) + def _get_user_home_dir() -> str: return os.path.expanduser("~") diff --git a/dlt/common/configuration/providers/__init__.py b/dlt/common/configuration/providers/__init__.py index 76268c14b1..3f5bc20cdc 100644 --- a/dlt/common/configuration/providers/__init__.py +++ b/dlt/common/configuration/providers/__init__.py @@ -1,7 +1,15 @@ from .provider import ConfigProvider from .environ import EnvironProvider from .dictionary import DictionaryProvider -from .toml import SecretsTomlProvider, ConfigTomlProvider, TomlFileProvider, CONFIG_TOML, SECRETS_TOML, StringTomlProvider, SECRETS_TOML_KEY +from .toml import ( + SecretsTomlProvider, + ConfigTomlProvider, + TomlFileProvider, + CONFIG_TOML, + SECRETS_TOML, + StringTomlProvider, + SECRETS_TOML_KEY, +) from .google_secrets import GoogleSecretsProvider from .context import ContextProvider diff --git a/dlt/common/configuration/providers/airflow.py b/dlt/common/configuration/providers/airflow.py index 3c3149adb1..99edf258d2 100644 --- a/dlt/common/configuration/providers/airflow.py +++ b/dlt/common/configuration/providers/airflow.py @@ -10,12 +10,13 @@ def __init__(self, only_secrets: bool = False, only_toml_fragments: bool = False @property def name(self) -> str: - return 'Airflow Secrets TOML Provider' + return "Airflow Secrets TOML Provider" def _look_vault(self, full_key: str, hint: type) -> str: """Get Airflow Variable with given `full_key`, return None if not found""" from airflow.models import Variable + with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): return Variable.get(full_key, default_var=None) # type: ignore diff --git a/dlt/common/configuration/providers/context.py b/dlt/common/configuration/providers/context.py index 84e26923a3..c6c1aac644 100644 --- a/dlt/common/configuration/providers/context.py +++ b/dlt/common/configuration/providers/context.py @@ -8,7 +8,6 @@ class ContextProvider(ConfigProvider): - NAME: ClassVar[str] = "Injectable Context" def __init__(self) -> None: @@ -18,7 +17,9 @@ def __init__(self) -> None: def name(self) -> str: return ContextProvider.NAME - def get_value(self, key: str, hint: Type[Any], pipeline_name: str = None, *sections: str) -> Tuple[Optional[Any], str]: + def get_value( + self, key: str, hint: Type[Any], pipeline_name: 
str = None, *sections: str + ) -> Tuple[Optional[Any], str]: assert sections == () # only context is a valid hint diff --git a/dlt/common/configuration/providers/dictionary.py b/dlt/common/configuration/providers/dictionary.py index 40a51eeb72..dffe5f0c71 100644 --- a/dlt/common/configuration/providers/dictionary.py +++ b/dlt/common/configuration/providers/dictionary.py @@ -7,7 +7,6 @@ class DictionaryProvider(ConfigProvider): - NAME: ClassVar[str] = "Dictionary Provider" def __init__(self) -> None: @@ -17,14 +16,16 @@ def __init__(self) -> None: def name(self) -> str: return self.NAME - def get_value(self, key: str, hint: Type[Any], pipeline_name: str, *sections: str) -> Tuple[Optional[Any], str]: + def get_value( + self, key: str, hint: Type[Any], pipeline_name: str, *sections: str + ) -> Tuple[Optional[Any], str]: full_path = sections + (key,) if pipeline_name: - full_path = (pipeline_name, ) + full_path + full_path = (pipeline_name,) + full_path full_key = get_key_name(key, "__", pipeline_name, *sections) node = self._values try: - for k in full_path: + for k in full_path: if not isinstance(node, dict): raise KeyError(k) node = node[k] diff --git a/dlt/common/configuration/providers/environ.py b/dlt/common/configuration/providers/environ.py index 7406a1207b..f83ea9a24d 100644 --- a/dlt/common/configuration/providers/environ.py +++ b/dlt/common/configuration/providers/environ.py @@ -8,8 +8,8 @@ SECRET_STORAGE_PATH: str = "/run/secrets/%s" -class EnvironProvider(ConfigProvider): +class EnvironProvider(ConfigProvider): @staticmethod def get_key_name(key: str, *sections: str) -> str: return get_key_name(key, "__", *sections).upper() @@ -18,7 +18,9 @@ def get_key_name(key: str, *sections: str) -> str: def name(self) -> str: return "Environment Variables" - def get_value(self, key: str, hint: Type[Any], pipeline_name: str, *sections: str) -> Tuple[Optional[Any], str]: + def get_value( + self, key: str, hint: Type[Any], pipeline_name: str, *sections: str + ) -> Tuple[Optional[Any], str]: # apply section to the key key = self.get_key_name(key, pipeline_name, *sections) if hint is TSecretValue: diff --git a/dlt/common/configuration/providers/google_secrets.py b/dlt/common/configuration/providers/google_secrets.py index ccf891a575..e6da3da0a8 100644 --- a/dlt/common/configuration/providers/google_secrets.py +++ b/dlt/common/configuration/providers/google_secrets.py @@ -9,7 +9,12 @@ class GoogleSecretsProvider(VaultTomlProvider): - def __init__(self, credentials: GcpServiceAccountCredentials, only_secrets: bool = True, only_toml_fragments: bool = True) -> None: + def __init__( + self, + credentials: GcpServiceAccountCredentials, + only_secrets: bool = True, + only_toml_fragments: bool = True, + ) -> None: self.credentials = credentials super().__init__(only_secrets, only_toml_fragments) @@ -26,7 +31,11 @@ def _look_vault(self, full_key: str, hint: type) -> str: from googleapiclient.discovery import build from googleapiclient.errors import HttpError except ModuleNotFoundError: - raise MissingDependencyException("GoogleSecretsProvider", ["google-api-python-client"], "We need google-api-python-client to build client for secretmanager v1") + raise MissingDependencyException( + "GoogleSecretsProvider", + ["google-api-python-client"], + "We need google-api-python-client to build client for secretmanager v1", + ) from dlt.common import logger resource_name = f"projects/{self.credentials.project_id}/secrets/{full_key}/versions/latest" @@ -42,10 +51,17 @@ def _look_vault(self, full_key: str, hint: type) 
-> str: # logger.warning(f"{self.credentials.client_email} has roles/secretmanager.secretAccessor role but {full_key} not found in Google Secrets: {error_doc['message']}[{error_doc['status']}]") return None elif error.resp.status == 403: - logger.warning(f"{self.credentials.client_email} does not have roles/secretmanager.secretAccessor role. It also does not have read permission to {full_key} or the key is not found in Google Secrets: {error_doc['message']}[{error_doc['status']}]") + logger.warning( + f"{self.credentials.client_email} does not have" + " roles/secretmanager.secretAccessor role. It also does not have read" + f" permission to {full_key} or the key is not found in Google Secrets:" + f" {error_doc['message']}[{error_doc['status']}]" + ) return None elif error.resp.status == 400: - logger.warning(f"Unable to read {full_key} : {error_doc['message']}[{error_doc['status']}]") + logger.warning( + f"Unable to read {full_key} : {error_doc['message']}[{error_doc['status']}]" + ) return None raise @@ -68,4 +84,4 @@ def _look_vault(self, full_key: str, hint: type) -> str: # has_required_role = True # break # if not has_required_role: - # print("no secrets read access") \ No newline at end of file + # print("no secrets read access") diff --git a/dlt/common/configuration/providers/provider.py b/dlt/common/configuration/providers/provider.py index c6bfea5dc3..405a42bcf0 100644 --- a/dlt/common/configuration/providers/provider.py +++ b/dlt/common/configuration/providers/provider.py @@ -5,9 +5,10 @@ class ConfigProvider(abc.ABC): - @abc.abstractmethod - def get_value(self, key: str, hint: Type[Any], pipeline_name: str, *sections: str) -> Tuple[Optional[Any], str]: + def get_value( + self, key: str, hint: Type[Any], pipeline_name: str, *sections: str + ) -> Tuple[Optional[Any], str]: pass def set_value(self, key: str, value: Any, pipeline_name: str, *sections: str) -> None: diff --git a/dlt/common/configuration/providers/toml.py b/dlt/common/configuration/providers/toml.py index 9e8b2a0059..3c4fa2c145 100644 --- a/dlt/common/configuration/providers/toml.py +++ b/dlt/common/configuration/providers/toml.py @@ -19,7 +19,7 @@ CONFIG_TOML = "config.toml" SECRETS_TOML = "secrets.toml" -SECRETS_TOML_KEY = 'dlt_secrets_toml' +SECRETS_TOML_KEY = "dlt_secrets_toml" class BaseTomlProvider(ConfigProvider): @@ -30,10 +30,12 @@ def __init__(self, toml_document: TOMLContainer) -> None: def get_key_name(key: str, *sections: str) -> str: return get_key_name(key, ".", *sections) - def get_value(self, key: str, hint: Type[Any], pipeline_name: str, *sections: str) -> Tuple[Optional[Any], str]: + def get_value( + self, key: str, hint: Type[Any], pipeline_name: str, *sections: str + ) -> Tuple[Optional[Any], str]: full_path = sections + (key,) if pipeline_name: - full_path = (pipeline_name, ) + full_path + full_path = (pipeline_name,) + full_path full_key = self.get_key_name(key, pipeline_name, *sections) node: Union[TOMLContainer, TOMLItem] = self._toml try: @@ -48,7 +50,7 @@ def get_value(self, key: str, hint: Type[Any], pipeline_name: str, *sections: st def set_value(self, key: str, value: Any, pipeline_name: str, *sections: str) -> None: if pipeline_name: - sections = (pipeline_name, ) + sections + sections = (pipeline_name,) + sections if isinstance(value, TOMLContainer): if key is None: @@ -85,7 +87,6 @@ def is_empty(self) -> bool: class StringTomlProvider(BaseTomlProvider): - def __init__(self, toml_string: str) -> None: super().__init__(StringTomlProvider.loads(toml_string)) @@ -141,12 +142,13 @@ def 
__init__(self, only_secrets: bool, only_toml_fragments: bool) -> None: super().__init__(tomlkit.document()) self._update_from_vault(SECRETS_TOML_KEY, None, AnyType, None, ()) - def get_value(self, key: str, hint: type, pipeline_name: str, *sections: str) -> Tuple[Optional[Any], str]: + def get_value( + self, key: str, hint: type, pipeline_name: str, *sections: str + ) -> Tuple[Optional[Any], str]: full_key = self.get_key_name(key, pipeline_name, *sections) value, _ = super().get_value(key, hint, pipeline_name, *sections) if value is None: - # only secrets hints are handled if self.only_secrets and not is_secret_hint(hint) and hint is not AnyType: return None, full_key @@ -156,7 +158,6 @@ def get_value(self, key: str, hint: type, pipeline_name: str, *sections: str) -> lookup_fk = self.get_key_name(SECRETS_TOML_KEY, pipeline_name) self._update_from_vault(lookup_fk, "", AnyType, pipeline_name, ()) - # generate auxiliary paths to get from vault for known_section in [known_sections.SOURCES, known_sections.DESTINATION]: @@ -164,7 +165,9 @@ def _look_at_idx(idx: int, full_path: Tuple[str, ...], pipeline_name: str) -> No lookup_key = full_path[idx] lookup_sections = full_path[:idx] lookup_fk = self.get_key_name(lookup_key, *lookup_sections) - self._update_from_vault(lookup_fk, lookup_key, AnyType, pipeline_name, lookup_sections) + self._update_from_vault( + lookup_fk, lookup_key, AnyType, pipeline_name, lookup_sections + ) def _lookup_paths(pipeline_name_: str, known_section_: str) -> None: with contextlib.suppress(ValueError): @@ -180,7 +183,9 @@ def _lookup_paths(pipeline_name_: str, known_section_: str) -> None: # first query the shortest paths so the longer paths can override it _lookup_paths(None, known_section) # check sources and sources. if pipeline_name: - _lookup_paths(pipeline_name, known_section) # check .sources and .sources. + _lookup_paths( + pipeline_name, known_section + ) # check .sources and .sources. value, _ = super().get_value(key, hint, pipeline_name, *sections) # skip checking the exact path if we check only toml fragments @@ -203,7 +208,9 @@ def supports_secrets(self) -> bool: def _look_vault(self, full_key: str, hint: type) -> str: pass - def _update_from_vault(self, full_key: str, key: str, hint: type, pipeline_name: str, sections: Tuple[str, ...]) -> None: + def _update_from_vault( + self, full_key: str, key: str, hint: type, pipeline_name: str, sections: Tuple[str, ...] + ) -> None: if full_key in self._vault_lookups: return # print(f"tries '{key}' {pipeline_name} | {sections} at '{full_key}'") @@ -216,8 +223,11 @@ def _update_from_vault(self, full_key: str, key: str, hint: type, pipeline_name: def is_empty(self) -> bool: return False + class TomlFileProvider(BaseTomlProvider): - def __init__(self, file_name: str, project_dir: str = None, add_global_config: bool = False) -> None: + def __init__( + self, file_name: str, project_dir: str = None, add_global_config: bool = False + ) -> None: """Creates config provider from a `toml` file The provider loads the `toml` file with specified name and from specified folder. 
If `add_global_config` flags is specified, @@ -236,7 +246,9 @@ def __init__(self, file_name: str, project_dir: str = None, add_global_config: b toml_document = self._read_toml_file(file_name, project_dir, add_global_config) super().__init__(toml_document) - def _read_toml_file(self, file_name: str, project_dir: str = None, add_global_config: bool = False) -> tomlkit.TOMLDocument: + def _read_toml_file( + self, file_name: str, project_dir: str = None, add_global_config: bool = False + ) -> tomlkit.TOMLDocument: self._file_name = file_name self._toml_path = os.path.join(project_dir or get_dlt_settings_dir(), file_name) self._add_global_config = add_global_config @@ -254,7 +266,9 @@ def global_config_path() -> str: return get_dlt_data_dir() def write_toml(self) -> None: - assert not self._add_global_config, "Will not write configs when `add_global_config` flag was set" + assert ( + not self._add_global_config + ), "Will not write configs when `add_global_config` flag was set" with open(self._toml_path, "w", encoding="utf-8") as f: tomlkit.dump(self._toml, f) @@ -269,7 +283,6 @@ def _read_toml(toml_path: str) -> tomlkit.TOMLDocument: class ConfigTomlProvider(TomlFileProvider): - def __init__(self, project_dir: str = None, add_global_config: bool = False) -> None: super().__init__(CONFIG_TOML, project_dir=project_dir, add_global_config=add_global_config) @@ -287,7 +300,6 @@ def is_writable(self) -> bool: class SecretsTomlProvider(TomlFileProvider): - def __init__(self, project_dir: str = None, add_global_config: bool = False) -> None: super().__init__(SECRETS_TOML, project_dir=project_dir, add_global_config=add_global_config) @@ -305,7 +317,9 @@ def is_writable(self) -> bool: class TomlProviderReadException(ConfigProviderException): - def __init__(self, provider_name: str, file_name: str, full_path: str, toml_exception: str) -> None: + def __init__( + self, provider_name: str, file_name: str, full_path: str, toml_exception: str + ) -> None: self.file_name = file_name self.full_path = full_path msg = f"A problem encountered when loading {provider_name} from {full_path}:\n" diff --git a/dlt/common/configuration/resolve.py b/dlt/common/configuration/resolve.py index 8e1af2831e..90376e17a8 100644 --- a/dlt/common/configuration/resolve.py +++ b/dlt/common/configuration/resolve.py @@ -3,22 +3,50 @@ from typing import Any, Dict, ContextManager, List, Optional, Sequence, Tuple, Type, TypeVar from dlt.common.configuration.providers.provider import ConfigProvider -from dlt.common.typing import AnyType, StrAny, TSecretValue, get_all_types_of_class_in_union, is_final_type, is_optional_type, is_union - -from dlt.common.configuration.specs.base_configuration import BaseConfiguration, CredentialsConfiguration, is_secret_hint, extract_inner_hint, is_context_inner_hint, is_base_configuration_inner_hint, is_valid_hint +from dlt.common.typing import ( + AnyType, + StrAny, + TSecretValue, + get_all_types_of_class_in_union, + is_final_type, + is_optional_type, + is_union, +) + +from dlt.common.configuration.specs.base_configuration import ( + BaseConfiguration, + CredentialsConfiguration, + is_secret_hint, + extract_inner_hint, + is_context_inner_hint, + is_base_configuration_inner_hint, + is_valid_hint, +) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.configuration.container import Container from 
dlt.common.configuration.utils import log_traces, deserialize_value from dlt.common.configuration.exceptions import ( - FinalConfigFieldException, LookupTrace, ConfigFieldMissingException, ConfigurationWrongTypeException, - ValueNotSecretException, InvalidNativeValue, UnmatchedConfigHintResolversException) + FinalConfigFieldException, + LookupTrace, + ConfigFieldMissingException, + ConfigurationWrongTypeException, + ValueNotSecretException, + InvalidNativeValue, + UnmatchedConfigHintResolversException, +) TConfiguration = TypeVar("TConfiguration", bound=BaseConfiguration) -def resolve_configuration(config: TConfiguration, *, sections: Tuple[str, ...] = (), explicit_value: Any = None, accept_partial: bool = False) -> TConfiguration: +def resolve_configuration( + config: TConfiguration, + *, + sections: Tuple[str, ...] = (), + explicit_value: Any = None, + accept_partial: bool = False +) -> TConfiguration: if not isinstance(config, BaseConfiguration): raise ConfigurationWrongTypeException(type(config)) @@ -26,7 +54,9 @@ def resolve_configuration(config: TConfiguration, *, sections: Tuple[str, ...] = # allows, for example, to store connection string or service.json in their native form in single env variable or under single vault key if config.__section__ and explicit_value is None: initial_hint = TSecretValue if isinstance(config, CredentialsConfiguration) else AnyType - explicit_value, traces = _resolve_single_value(config.__section__, initial_hint, AnyType, None, sections, ()) + explicit_value, traces = _resolve_single_value( + config.__section__, initial_hint, AnyType, None, sections, () + ) if isinstance(explicit_value, C_Mapping): # mappings cannot be used as explicit values, we want to enumerate mappings and request the fields' values one by one explicit_value = None @@ -62,7 +92,9 @@ def initialize_credentials(hint: Any, initial_value: Any) -> CredentialsConfigur return hint(initial_value) # type: ignore -def inject_section(section_context: ConfigSectionContext, merge_existing: bool = True) -> ContextManager[ConfigSectionContext]: +def inject_section( + section_context: ConfigSectionContext, merge_existing: bool = True +) -> ContextManager[ConfigSectionContext]: """Context manager that sets section specified in `section_context` to be used during configuration resolution. Optionally merges the context already in the container with the one provided Args: @@ -83,9 +115,14 @@ def inject_section(section_context: ConfigSectionContext, merge_existing: bool = return container.injectable_context(section_context) -def _maybe_parse_native_value(config: TConfiguration, explicit_value: Any, embedded_sections: Tuple[str, ...]) -> Any: + +def _maybe_parse_native_value( + config: TConfiguration, explicit_value: Any, embedded_sections: Tuple[str, ...] +) -> Any: # use initial value to resolve the whole configuration. 
if explicit value is a mapping it will be applied field by field later - if explicit_value and (not isinstance(explicit_value, C_Mapping) or isinstance(explicit_value, BaseConfiguration)): + if explicit_value and ( + not isinstance(explicit_value, C_Mapping) or isinstance(explicit_value, BaseConfiguration) + ): try: config.parse_native_representation(explicit_value) except ValueError as v_err: @@ -97,13 +134,14 @@ def _maybe_parse_native_value(config: TConfiguration, explicit_value: Any, embed explicit_value = None return explicit_value + def _resolve_configuration( - config: TConfiguration, - explicit_sections: Tuple[str, ...], - embedded_sections: Tuple[str, ...], - explicit_value: Any, - accept_partial: bool - ) -> TConfiguration: + config: TConfiguration, + explicit_sections: Tuple[str, ...], + embedded_sections: Tuple[str, ...], + explicit_value: Any, + accept_partial: bool, +) -> TConfiguration: # do not resolve twice if config.is_resolved(): return config @@ -114,7 +152,9 @@ def _resolve_configuration( explicit_value = _maybe_parse_native_value(config, explicit_value, embedded_sections) # if native representation didn't fully resolve the config, we try to resolve field by field if not config.is_resolved(): - _resolve_config_fields(config, explicit_value, explicit_sections, embedded_sections, accept_partial) + _resolve_config_fields( + config, explicit_value, explicit_sections, embedded_sections, accept_partial + ) # full configuration was resolved config.resolve() except ConfigFieldMissingException as cm_ex: @@ -134,13 +174,12 @@ def _resolve_configuration( def _resolve_config_fields( - config: BaseConfiguration, - explicit_values: StrAny, - explicit_sections: Tuple[str, ...], - embedded_sections: Tuple[str, ...], - accept_partial: bool - ) -> None: - + config: BaseConfiguration, + explicit_values: StrAny, + explicit_sections: Tuple[str, ...], + embedded_sections: Tuple[str, ...], + accept_partial: bool, +) -> None: fields = config.get_resolvable_fields() unresolved_fields: Dict[str, Sequence[LookupTrace]] = {} @@ -166,7 +205,11 @@ def _resolve_config_fields( current_value = None if is_union(hint): # if union contains a type of explicit value which is not a valid hint, return it as current value - if explicit_value and not is_valid_hint(type(explicit_value)) and get_all_types_of_class_in_union(hint, type(explicit_value)): + if ( + explicit_value + and not is_valid_hint(type(explicit_value)) + and get_all_types_of_class_in_union(hint, type(explicit_value)) + ): current_value, traces = explicit_value, [] else: specs_in_union = get_all_types_of_class_in_union(hint, BaseConfiguration) @@ -184,7 +227,7 @@ def _resolve_config_fields( config.__section__, explicit_sections, embedded_sections, - accept_partial + accept_partial, ) break except ConfigFieldMissingException as cfm_ex: @@ -205,7 +248,7 @@ def _resolve_config_fields( config.__section__, explicit_sections, embedded_sections, - accept_partial + accept_partial, ) # check if hint optional @@ -233,17 +276,16 @@ def _resolve_config_fields( def _resolve_config_field( - key: str, - hint: Type[Any], - default_value: Any, - explicit_value: Any, - config: BaseConfiguration, - config_sections: str, - explicit_sections: Tuple[str, ...], - embedded_sections: Tuple[str, ...], - accept_partial: bool - ) -> Tuple[Any, List[LookupTrace]]: - + key: str, + hint: Type[Any], + default_value: Any, + explicit_value: Any, + config: BaseConfiguration, + config_sections: str, + explicit_sections: Tuple[str, ...], + embedded_sections: Tuple[str, ...], + 
accept_partial: bool, +) -> Tuple[Any, List[LookupTrace]]: inner_hint = extract_inner_hint(hint) if explicit_value is not None: @@ -251,7 +293,9 @@ def _resolve_config_field( traces: List[LookupTrace] = [] else: # resolve key value via active providers passing the original hint ie. to preserve TSecretValue - value, traces = _resolve_single_value(key, hint, inner_hint, config_sections, explicit_sections, embedded_sections) + value, traces = _resolve_single_value( + key, hint, inner_hint, config_sections, explicit_sections, embedded_sections + ) log_traces(config, key, hint, value, default_value, traces) # contexts must be resolved as a whole if is_context_inner_hint(inner_hint): @@ -280,23 +324,44 @@ def _resolve_config_field( # only config with sections may look for initial values if embedded_config.__section__ and value is None: # config section becomes the key if the key does not start with, otherwise it keeps its original value - initial_key, initial_embedded = _apply_embedded_sections_to_config_sections(embedded_config.__section__, embedded_sections + (key,)) + initial_key, initial_embedded = _apply_embedded_sections_to_config_sections( + embedded_config.__section__, embedded_sections + (key,) + ) # it must be a secret value is config is credentials - initial_hint = TSecretValue if isinstance(embedded_config, CredentialsConfiguration) else AnyType - value, initial_traces = _resolve_single_value(initial_key, initial_hint, AnyType, None, explicit_sections, initial_embedded) + initial_hint = ( + TSecretValue + if isinstance(embedded_config, CredentialsConfiguration) + else AnyType + ) + value, initial_traces = _resolve_single_value( + initial_key, initial_hint, AnyType, None, explicit_sections, initial_embedded + ) if isinstance(value, C_Mapping): # mappings are not passed as initials value = None else: traces.extend(initial_traces) - log_traces(config, initial_key, type(embedded_config), value, default_value, initial_traces) + log_traces( + config, + initial_key, + type(embedded_config), + value, + default_value, + initial_traces, + ) # check if hint optional is_optional = is_optional_type(hint) # accept partial becomes True if type if optional so we do not fail on optional configs that do not resolve fully accept_partial = accept_partial or is_optional # create new instance and pass value from the provider as initial, add key to sections - value = _resolve_configuration(embedded_config, explicit_sections, embedded_sections + (key,), default_value if value is None else value, accept_partial) + value = _resolve_configuration( + embedded_config, + explicit_sections, + embedded_sections + (key,), + default_value if value is None else value, + accept_partial, + ) if value.is_partial() and is_optional: # do not return partially resolved optional embeds value = None @@ -311,14 +376,13 @@ def _resolve_config_field( def _resolve_single_value( - key: str, - hint: Type[Any], - inner_hint: Type[Any], - config_section: str, - explicit_sections: Tuple[str, ...], - embedded_sections: Tuple[str, ...] 
- ) -> Tuple[Optional[Any], List[LookupTrace]]: - + key: str, + hint: Type[Any], + inner_hint: Type[Any], + config_section: str, + explicit_sections: Tuple[str, ...], + embedded_sections: Tuple[str, ...], +) -> Tuple[Optional[Any], List[LookupTrace]]: traces: List[LookupTrace] = [] value = None @@ -335,7 +399,9 @@ def _resolve_single_value( return value, traces # resolve a field of the config - config_section, embedded_sections = _apply_embedded_sections_to_config_sections(config_section, embedded_sections) + config_section, embedded_sections = _apply_embedded_sections_to_config_sections( + config_section, embedded_sections + ) providers = providers_context.providers # get additional sections to look in from container sections_context = container[ConfigSectionContext] @@ -356,7 +422,7 @@ def look_sections(pipeline_name: str = None) -> Any: config_section, # if explicit sections are provided, ignore the injected context explicit_sections or sections_context.sections, - embedded_sections + embedded_sections, ) traces.extend(provider_traces) if value is not None: @@ -382,7 +448,7 @@ def resolve_single_provider_value( pipeline_name: str = None, config_section: str = None, explicit_sections: Tuple[str, ...] = (), - embedded_sections: Tuple[str, ...] = () + embedded_sections: Tuple[str, ...] = (), ) -> Tuple[Optional[Any], List[LookupTrace]]: traces: List[LookupTrace] = [] @@ -429,7 +495,9 @@ def resolve_single_provider_value( return value, traces -def _apply_embedded_sections_to_config_sections(config_section: str, embedded_sections: Tuple[str, ...]) -> Tuple[str, Tuple[str, ...]]: +def _apply_embedded_sections_to_config_sections( + config_section: str, embedded_sections: Tuple[str, ...] +) -> Tuple[str, Tuple[str, ...]]: # for the configurations that have __section__ (config_section) defined and are embedded in other configurations, # the innermost embedded section replaces config_section if embedded_sections: diff --git a/dlt/common/configuration/specs/__init__.py b/dlt/common/configuration/specs/__init__.py index 2a033b6bbd..9acf14bde3 100644 --- a/dlt/common/configuration/specs/__init__.py +++ b/dlt/common/configuration/specs/__init__.py @@ -1,8 +1,22 @@ from .run_configuration import RunConfiguration -from .base_configuration import BaseConfiguration, CredentialsConfiguration, CredentialsWithDefault, ContainerInjectableContext, extract_inner_hint, is_base_configuration_inner_hint, configspec +from .base_configuration import ( + BaseConfiguration, + CredentialsConfiguration, + CredentialsWithDefault, + ContainerInjectableContext, + extract_inner_hint, + is_base_configuration_inner_hint, + configspec, +) from .config_section_context import ConfigSectionContext -from .gcp_credentials import GcpServiceAccountCredentialsWithoutDefaults, GcpServiceAccountCredentials, GcpOAuthCredentialsWithoutDefaults, GcpOAuthCredentials, GcpCredentials +from .gcp_credentials import ( + GcpServiceAccountCredentialsWithoutDefaults, + GcpServiceAccountCredentials, + GcpOAuthCredentialsWithoutDefaults, + GcpOAuthCredentials, + GcpCredentials, +) from .connection_string_credentials import ConnectionStringCredentials from .api_credentials import OAuth2Credentials from .aws_credentials import AwsCredentials, AwsCredentialsWithoutDefaults @@ -10,17 +24,33 @@ # backward compatibility for service account credentials -from .gcp_credentials import GcpServiceAccountCredentialsWithoutDefaults as GcpClientCredentials, GcpServiceAccountCredentials as GcpClientCredentialsWithDefault +from .gcp_credentials import ( + 
GcpServiceAccountCredentialsWithoutDefaults as GcpClientCredentials, + GcpServiceAccountCredentials as GcpClientCredentialsWithDefault, +) __all__ = [ "RunConfiguration", - "BaseConfiguration", "CredentialsConfiguration", "CredentialsWithDefault", "ContainerInjectableContext", "extract_inner_hint", "is_base_configuration_inner_hint", "configspec", + "BaseConfiguration", + "CredentialsConfiguration", + "CredentialsWithDefault", + "ContainerInjectableContext", + "extract_inner_hint", + "is_base_configuration_inner_hint", + "configspec", "ConfigSectionContext", - "GcpServiceAccountCredentialsWithoutDefaults", "GcpServiceAccountCredentials", "GcpOAuthCredentialsWithoutDefaults", "GcpOAuthCredentials", "GcpCredentials", + "GcpServiceAccountCredentialsWithoutDefaults", + "GcpServiceAccountCredentials", + "GcpOAuthCredentialsWithoutDefaults", + "GcpOAuthCredentials", + "GcpCredentials", "ConnectionStringCredentials", "OAuth2Credentials", - "AwsCredentials", "AwsCredentialsWithoutDefaults", - "AzureCredentials", "AzureCredentialsWithoutDefaults", - "GcpClientCredentials", "GcpClientCredentialsWithDefault", + "AwsCredentials", + "AwsCredentialsWithoutDefaults", + "AzureCredentials", + "AzureCredentialsWithoutDefaults", + "GcpClientCredentials", + "GcpClientCredentialsWithDefault", ] diff --git a/dlt/common/configuration/specs/api_credentials.py b/dlt/common/configuration/specs/api_credentials.py index 6a06a42713..fd7ae8cb09 100644 --- a/dlt/common/configuration/specs/api_credentials.py +++ b/dlt/common/configuration/specs/api_credentials.py @@ -17,7 +17,6 @@ class OAuth2Credentials(CredentialsConfiguration): # add refresh_token when generating config samples __config_gen_annotations__: ClassVar[List[str]] = ["refresh_token"] - def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) -> None: """Authorizes the client using the available credentials @@ -44,4 +43,3 @@ def add_scopes(self, scopes: Union[str, List[str]]) -> None: self.scopes += [scopes] elif scopes: self.scopes = list(set(self.scopes + scopes)) - diff --git a/dlt/common/configuration/specs/aws_credentials.py b/dlt/common/configuration/specs/aws_credentials.py index 8c4aabc4ee..f6df1d8cce 100644 --- a/dlt/common/configuration/specs/aws_credentials.py +++ b/dlt/common/configuration/specs/aws_credentials.py @@ -2,7 +2,11 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import TSecretStrValue, DictStrAny -from dlt.common.configuration.specs import CredentialsConfiguration, CredentialsWithDefault, configspec +from dlt.common.configuration.specs import ( + CredentialsConfiguration, + CredentialsWithDefault, + configspec, +) from dlt.common.configuration.specs.exceptions import InvalidBoto3Session from dlt import version @@ -37,7 +41,6 @@ def to_native_representation(self) -> Dict[str, Optional[str]]: @configspec class AwsCredentials(AwsCredentialsWithoutDefaults, CredentialsWithDefault): - def on_partial(self) -> None: # Try get default credentials session = self._to_botocore_session() @@ -48,31 +51,34 @@ def _to_botocore_session(self) -> Any: try: import botocore.session except ModuleNotFoundError: - raise MissingDependencyException(self.__class__.__name__, [f"{version.DLT_PKG_NAME}[s3]"]) + raise MissingDependencyException( + self.__class__.__name__, [f"{version.DLT_PKG_NAME}[s3]"] + ) # taken from boto3 Session session = botocore.session.get_session() if self.profile_name is not None: - session.set_config_variable('profile', self.profile_name) + 
session.set_config_variable("profile", self.profile_name) if self.aws_access_key_id or self.aws_secret_access_key or self.aws_session_token: session.set_credentials( self.aws_access_key_id, self.aws_secret_access_key, self.aws_session_token ) if self.region_name is not None: - session.set_config_variable('region', self.region_name) + session.set_config_variable("region", self.region_name) return session def _from_session(self, session: Any) -> Any: """Sets the credentials properties from botocore or boto3 `session` and return session's credentials if found""" import botocore.session + if not isinstance(session, botocore.session.Session): # assume this is boto3 session session = session._session # NOTE: we do not set profile name from boto3 session # we either pass it explicitly in `_to_session` so we know it is identical # this is what boto3 does: return self._session.profile or 'default' which is obviously wrong (returning default when there's no session) - self.region_name = session.get_config_variable('region') + self.region_name = session.get_config_variable("region") default = session.get_credentials() if not default: return None diff --git a/dlt/common/configuration/specs/azure_credentials.py b/dlt/common/configuration/specs/azure_credentials.py index 49393a6343..f7cac78dca 100644 --- a/dlt/common/configuration/specs/azure_credentials.py +++ b/dlt/common/configuration/specs/azure_credentials.py @@ -3,7 +3,11 @@ from dlt.common import pendulum from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import TSecretStrValue -from dlt.common.configuration.specs import CredentialsConfiguration, CredentialsWithDefault, configspec +from dlt.common.configuration.specs import ( + CredentialsConfiguration, + CredentialsWithDefault, + configspec, +) from dlt.common.configuration.specs.exceptions import InvalidBoto3Session from dlt import version @@ -30,12 +34,13 @@ def to_adlfs_credentials(self) -> Dict[str, Any]: def create_sas_token(self) -> None: from azure.storage.blob import generate_account_sas, ResourceTypes + self.azure_storage_sas_token = generate_account_sas( # type: ignore[assignment] account_name=self.azure_storage_account_name, account_key=self.azure_storage_account_key, resource_types=ResourceTypes(container=True, object=True), permission=self.azure_sas_token_permissions, - expiry=pendulum.now().add(days=1) + expiry=pendulum.now().add(days=1), ) def on_partial(self) -> None: @@ -50,6 +55,7 @@ def on_partial(self) -> None: class AzureCredentials(AzureCredentialsWithoutDefaults, CredentialsWithDefault): def on_partial(self) -> None: from azure.identity import DefaultAzureCredential + if not self.azure_storage_account_key and not self.azure_storage_sas_token: self._set_default_credentials(DefaultAzureCredential()) if self.azure_storage_account_name: @@ -60,5 +66,5 @@ def on_partial(self) -> None: def to_adlfs_credentials(self) -> Dict[str, Any]: base_kwargs = super().to_adlfs_credentials() if self.has_default_credentials(): - base_kwargs['anon'] = False + base_kwargs["anon"] = False return base_kwargs diff --git a/dlt/common/configuration/specs/base_configuration.py b/dlt/common/configuration/specs/base_configuration.py index 08940ffe31..33a91a2a30 100644 --- a/dlt/common/configuration/specs/base_configuration.py +++ b/dlt/common/configuration/specs/base_configuration.py @@ -3,7 +3,23 @@ import contextlib import dataclasses from collections.abc import Mapping as C_Mapping -from typing import Callable, List, Optional, Union, Any, Dict, Iterator, 
MutableMapping, Type, TYPE_CHECKING, get_args, get_origin, overload, ClassVar, TypeVar +from typing import ( + Callable, + List, + Optional, + Union, + Any, + Dict, + Iterator, + MutableMapping, + Type, + TYPE_CHECKING, + get_args, + get_origin, + overload, + ClassVar, + TypeVar, +) from functools import wraps if TYPE_CHECKING: @@ -11,9 +27,18 @@ else: TDtcField = dataclasses.Field -from dlt.common.typing import TAnyClass, TSecretValue, extract_inner_type, is_optional_type, is_union +from dlt.common.typing import ( + TAnyClass, + TSecretValue, + extract_inner_type, + is_optional_type, + is_union, +) from dlt.common.data_types import py_type_to_sc_type -from dlt.common.configuration.exceptions import ConfigFieldMissingTypeHintException, ConfigFieldTypeHintNotSupported +from dlt.common.configuration.exceptions import ( + ConfigFieldMissingTypeHintException, + ConfigFieldTypeHintNotSupported, +) # forward class declaration @@ -68,7 +93,7 @@ def extract_inner_hint(hint: Type[Any], preserve_new_types: bool = False) -> Typ def is_secret_hint(hint: Type[Any]) -> bool: - is_secret = False + is_secret = False if hasattr(hint, "__name__"): is_secret = hint.__name__ == "TSecretValue" if not is_secret: @@ -82,16 +107,16 @@ def is_secret_hint(hint: Type[Any]) -> bool: @overload -def configspec(cls: Type[TAnyClass]) -> Type[TAnyClass]: - ... +def configspec(cls: Type[TAnyClass]) -> Type[TAnyClass]: ... @overload -def configspec(cls: None = ...) -> Callable[[Type[TAnyClass]], Type[TAnyClass]]: - ... +def configspec(cls: None = ...) -> Callable[[Type[TAnyClass]], Type[TAnyClass]]: ... -def configspec(cls: Optional[Type[Any]] = None) -> Union[Type[TAnyClass], Callable[[Type[TAnyClass]], Type[TAnyClass]]]: +def configspec( + cls: Optional[Type[Any]] = None, +) -> Union[Type[TAnyClass], Callable[[Type[TAnyClass]], Type[TAnyClass]]]: """Converts (via derivation) any decorated class to a Python dataclass that may be used as a spec to resolve configurations In comparison the Python dataclass, a spec implements full dictionary interface for its attributes, allows instance creation from ie. strings @@ -99,6 +124,7 @@ def configspec(cls: Optional[Type[Any]] = None) -> Union[Type[TAnyClass], Callab more information. 
""" + def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: cls.__hint_resolvers__ = {} # type: ignore[attr-defined] is_context = issubclass(cls, _F_ContainerInjectableContext) @@ -106,8 +132,11 @@ def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: with contextlib.suppress(NameError): if not issubclass(cls, BaseConfiguration): # keep the original module and keep defaults for fields listed in annotations - fields = {"__module__": cls.__module__, "__annotations__": getattr(cls, "__annotations__", {})} - for key in fields['__annotations__'].keys(): # type: ignore[union-attr] + fields = { + "__module__": cls.__module__, + "__annotations__": getattr(cls, "__annotations__", {}), + } + for key in fields["__annotations__"].keys(): # type: ignore[union-attr] if key in cls.__dict__: fields[key] = cls.__dict__[key] cls = type(cls.__name__, (cls, _F_BaseConfiguration), fields) @@ -129,7 +158,9 @@ def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: except NameError: # Dealing with BaseConfiguration itself before it is defined continue - if not att_name.startswith(("__", "_abc_")) and not isinstance(att_value, (staticmethod, classmethod, property)): + if not att_name.startswith(("__", "_abc_")) and not isinstance( + att_value, (staticmethod, classmethod, property) + ): if att_name not in cls.__annotations__: raise ConfigFieldMissingTypeHintException(att_name, cls) hint = cls.__annotations__[att_name] @@ -142,8 +173,8 @@ def wrap(cls: Type[TAnyClass]) -> Type[TAnyClass]: # blocking mutable defaults def default_factory(att_value=att_value): # type: ignore[no-untyped-def] return att_value.copy() - setattr(cls, att_name, dataclasses.field(default_factory=default_factory)) + setattr(cls, att_name, dataclasses.field(default_factory=default_factory)) # We don't want to overwrite user's __init__ method # Create dataclass init only when not defined in the class @@ -168,12 +199,11 @@ def default_factory(att_value=att_value): # type: ignore[no-untyped-def] @configspec class BaseConfiguration(MutableMapping[str, Any]): - - __is_resolved__: bool = dataclasses.field(default = False, init=False, repr=False) + __is_resolved__: bool = dataclasses.field(default=False, init=False, repr=False) """True when all config fields were resolved and have a specified value type""" - __section__: str = dataclasses.field(default = None, init=False, repr=False) + __section__: str = dataclasses.field(default=None, init=False, repr=False) """Obligatory section used by config providers when searching for keys, always present in the search path""" - __exception__: Exception = dataclasses.field(default = None, init=False, repr=False) + __exception__: Exception = dataclasses.field(default=None, init=False, repr=False) """Holds the exception that prevented the full resolution""" __config_gen_annotations__: ClassVar[List[str]] = [] """Additional annotations for config generator, currently holds a list of fields of interest that have defaults""" @@ -181,7 +211,6 @@ class BaseConfiguration(MutableMapping[str, Any]): """Typing for dataclass fields""" __hint_resolvers__: ClassVar[Dict[str, Callable[["BaseConfiguration"], Type[Any]]]] = {} - def parse_native_representation(self, native_value: Any) -> None: """Initialize the configuration fields by parsing the `native_value` which should be a native representation of the configuration or credentials, for example database connection string or JSON serialized GCP service credentials file. 
@@ -212,7 +241,7 @@ def _get_resolvable_dataclass_fields(cls) -> Iterator[TDtcField]: # Sort dynamic type hint fields last because they depend on other values yield from sorted( (f for f in cls.__dataclass_fields__.values() if cls.__is_valid_field(f)), - key=lambda f: f.name in cls.__hint_resolvers__ + key=lambda f: f.name in cls.__hint_resolvers__, ) @classmethod @@ -229,7 +258,9 @@ def is_partial(self) -> bool: return False # check if all resolvable fields have value return any( - field for field, hint in self.get_resolvable_fields().items() if getattr(self, field) is None and not is_optional_type(hint) + field + for field, hint in self.get_resolvable_fields().items() + if getattr(self, field) is None and not is_optional_type(hint) ) def resolve(self) -> None: @@ -265,7 +296,10 @@ def __delitem__(self, __key: str) -> None: def __iter__(self) -> Iterator[str]: """Iterator or valid key names""" - return map(lambda field: field.name, filter(lambda val: self.__is_valid_field(val), self.__dataclass_fields__.values())) + return map( + lambda field: field.name, + filter(lambda val: self.__is_valid_field(val), self.__dataclass_fields__.values()), + ) def __len__(self) -> int: return sum(1 for _ in self.__iter__()) @@ -280,7 +314,9 @@ def update(self, other: Any = (), /, **kwds: Any) -> None: # helper functions def __has_attr(self, __key: str) -> bool: - return __key in self.__dataclass_fields__ and self.__is_valid_field(self.__dataclass_fields__[__key]) + return __key in self.__dataclass_fields__ and self.__is_valid_field( + self.__dataclass_fields__[__key] + ) @staticmethod def __is_valid_field(field: TDtcField) -> bool: @@ -335,7 +371,7 @@ def to_native_credentials(self) -> Any: return self.to_native_representation() def __str__(self) -> str: - """Get string representation of credentials to be displayed, with all secret parts removed """ + """Get string representation of credentials to be displayed, with all secret parts removed""" return super().__str__() @@ -372,11 +408,15 @@ def add_extras(self) -> None: TSpec = TypeVar("TSpec", bound=BaseConfiguration) THintResolver = Callable[[TSpec], Type[Any]] + def resolve_type(field_name: str) -> Callable[[THintResolver[TSpec]], THintResolver[TSpec]]: def decorator(func: THintResolver[TSpec]) -> THintResolver[TSpec]: func.__hint_for_field__ = field_name # type: ignore[attr-defined] + @wraps(func) def wrapper(self: TSpec) -> Type[Any]: return func(self) + return wrapper + return decorator diff --git a/dlt/common/configuration/specs/config_providers_context.py b/dlt/common/configuration/specs/config_providers_context.py index 062714245b..0c852edfa5 100644 --- a/dlt/common/configuration/specs/config_providers_context.py +++ b/dlt/common/configuration/specs/config_providers_context.py @@ -2,9 +2,21 @@ import io from typing import List from dlt.common.configuration.exceptions import DuplicateConfigProviderException -from dlt.common.configuration.providers import ConfigProvider, EnvironProvider, ContextProvider, SecretsTomlProvider, ConfigTomlProvider, GoogleSecretsProvider +from dlt.common.configuration.providers import ( + ConfigProvider, + EnvironProvider, + ContextProvider, + SecretsTomlProvider, + ConfigTomlProvider, + GoogleSecretsProvider, +) from dlt.common.configuration.specs.base_configuration import ContainerInjectableContext -from dlt.common.configuration.specs import GcpServiceAccountCredentials, BaseConfiguration, configspec, known_sections +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentials, + BaseConfiguration, + 
configspec, + known_sections, +) from dlt.common.runtime.exec_info import is_airflow_installed @@ -21,6 +33,7 @@ class ConfigProvidersConfiguration(BaseConfiguration): @configspec class ConfigProvidersContext(ContainerInjectableContext): """Injectable list of providers used by the configuration `resolve` module""" + providers: List[ConfigProvider] context_provider: ConfigProvider @@ -70,27 +83,36 @@ def _initial_providers() -> List[ConfigProvider]: providers = [ EnvironProvider(), SecretsTomlProvider(add_global_config=True), - ConfigTomlProvider(add_global_config=True) + ConfigTomlProvider(add_global_config=True), ] return providers def _extra_providers() -> List[ConfigProvider]: from dlt.common.configuration.resolve import resolve_configuration + providers_config = resolve_configuration(ConfigProvidersConfiguration()) extra_providers = [] if providers_config.enable_airflow_secrets: extra_providers.extend(_airflow_providers()) if providers_config.enable_google_secrets: - extra_providers.append(_google_secrets_provider(only_toml_fragments=providers_config.only_toml_fragments)) + extra_providers.append( + _google_secrets_provider(only_toml_fragments=providers_config.only_toml_fragments) + ) return extra_providers -def _google_secrets_provider(only_secrets: bool = True, only_toml_fragments: bool = True) -> ConfigProvider: +def _google_secrets_provider( + only_secrets: bool = True, only_toml_fragments: bool = True +) -> ConfigProvider: from dlt.common.configuration.resolve import resolve_configuration - c = resolve_configuration(GcpServiceAccountCredentials(), sections=(known_sections.PROVIDERS, "google_secrets")) - return GoogleSecretsProvider(c, only_secrets=only_secrets, only_toml_fragments=only_toml_fragments) + c = resolve_configuration( + GcpServiceAccountCredentials(), sections=(known_sections.PROVIDERS, "google_secrets") + ) + return GoogleSecretsProvider( + c, only_secrets=only_secrets, only_toml_fragments=only_toml_fragments + ) def _airflow_providers() -> List[ConfigProvider]: @@ -112,10 +134,12 @@ def _airflow_providers() -> List[ConfigProvider]: # hide stdio. airflow typically dumps tons of warnings and deprecations to stdout and stderr with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()): # try to get dlt secrets variable. many broken Airflow installations break here. in that case do not create - from airflow.models import Variable, TaskInstance # noqa + from airflow.models import Variable, TaskInstance # noqa from dlt.common.configuration.providers.airflow import AirflowSecretsTomlProvider + # probe if Airflow variable containing all secrets is present from dlt.common.configuration.providers.toml import SECRETS_TOML_KEY + secrets_toml_var = Variable.get(SECRETS_TOML_KEY, default_var=None) # providers can be returned - mind that AirflowSecretsTomlProvider() requests the variable above immediately @@ -123,13 +147,18 @@ def _airflow_providers() -> List[ConfigProvider]: # check if we are in task context and provide more info from airflow.operators.python import get_current_context # noqa + ti: TaskInstance = get_current_context()["ti"] # type: ignore # log outside of stderr/out redirect if secrets_toml_var is None: - message = f"Airflow variable '{SECRETS_TOML_KEY}' was not found. " + \ - "This Airflow variable is a recommended place to hold the content of secrets.toml." + \ - "If you do not use Airflow variables to hold dlt configuration or use variables with other names you can ignore this warning." 
+ message = ( + f"Airflow variable '{SECRETS_TOML_KEY}' was not found. " + + "This Airflow variable is a recommended place to hold the content of" + " secrets.toml." + + "If you do not use Airflow variables to hold dlt configuration or use variables" + " with other names you can ignore this warning." + ) ti.log.warning(message) except Exception: diff --git a/dlt/common/configuration/specs/config_section_context.py b/dlt/common/configuration/specs/config_section_context.py index 753eb3b439..a656a2b0fe 100644 --- a/dlt/common/configuration/specs/config_section_context.py +++ b/dlt/common/configuration/specs/config_section_context.py @@ -3,9 +3,9 @@ from dlt.common.configuration.specs.base_configuration import ContainerInjectableContext, configspec + @configspec class ConfigSectionContext(ContainerInjectableContext): - TMergeFunc = Callable[["ConfigSectionContext", "ConfigSectionContext"], None] pipeline_name: Optional[str] @@ -13,7 +13,6 @@ class ConfigSectionContext(ContainerInjectableContext): merge_style: TMergeFunc = None source_state_key: str = None - def merge(self, existing: "ConfigSectionContext") -> None: """Merges existing context into incoming using a merge style function""" merge_style_f = self.merge_style or self.prefer_incoming @@ -40,27 +39,44 @@ def prefer_incoming(incoming: "ConfigSectionContext", existing: "ConfigSectionCo @staticmethod def prefer_existing(incoming: "ConfigSectionContext", existing: "ConfigSectionContext") -> None: """Prefer existing section context when merging this context before injecting""" - incoming.pipeline_name = existing.pipeline_name or incoming.pipeline_name - incoming.sections = existing.sections or incoming.sections - incoming.source_state_key = existing.source_state_key or incoming.source_state_key + incoming.pipeline_name = existing.pipeline_name or incoming.pipeline_name + incoming.sections = existing.sections or incoming.sections + incoming.source_state_key = existing.source_state_key or incoming.source_state_key @staticmethod - def resource_merge_style(incoming: "ConfigSectionContext", existing: "ConfigSectionContext") -> None: + def resource_merge_style( + incoming: "ConfigSectionContext", existing: "ConfigSectionContext" + ) -> None: """If top level section is same and there are 3 sections it replaces second element (source module) from existing and keeps the 3rd element (name)""" incoming.pipeline_name = incoming.pipeline_name or existing.pipeline_name - if len(incoming.sections) == 3 == len(existing.sections) and incoming.sections[0] == existing.sections[0]: + if ( + len(incoming.sections) == 3 == len(existing.sections) + and incoming.sections[0] == existing.sections[0] + ): # existing does not have middle section then keep incoming # standalone resources do not emit existing to not overwrite each other - incoming.sections = (incoming.sections[0], existing.sections[1] or incoming.sections[1], incoming.sections[2]) + incoming.sections = ( + incoming.sections[0], + existing.sections[1] or incoming.sections[1], + incoming.sections[2], + ) incoming.source_state_key = existing.source_state_key or incoming.source_state_key else: incoming.sections = incoming.sections or existing.sections incoming.source_state_key = incoming.source_state_key or existing.source_state_key def __str__(self) -> str: - return super().__str__() + f": {self.pipeline_name} {self.sections}@{self.merge_style} state['{self.source_state_key}']" + return ( + super().__str__() + + f": {self.pipeline_name} {self.sections}@{self.merge_style} state['{self.source_state_key}']" 
+ ) if TYPE_CHECKING: # provide __init__ signature when type checking - def __init__(self, pipeline_name:str = None, sections: Tuple[str, ...] = (), merge_style: TMergeFunc = None, source_state_key: str = None) -> None: - ... + def __init__( + self, + pipeline_name: str = None, + sections: Tuple[str, ...] = (), + merge_style: TMergeFunc = None, + source_state_key: str = None, + ) -> None: ... diff --git a/dlt/common/configuration/specs/connection_string_credentials.py b/dlt/common/configuration/specs/connection_string_credentials.py index 386535122b..e7b0e5f900 100644 --- a/dlt/common/configuration/specs/connection_string_credentials.py +++ b/dlt/common/configuration/specs/connection_string_credentials.py @@ -24,9 +24,7 @@ def parse_native_representation(self, native_value: Any) -> None: try: url = make_url(native_value) # update only values that are not None - self.update( - {k: v for k,v in url._asdict().items() if v is not None} - ) + self.update({k: v for k, v in url._asdict().items() if v is not None}) if self.query is not None: self.query = dict(self.query) except Exception: @@ -40,7 +38,15 @@ def to_native_representation(self) -> str: return self.to_url().render_as_string(hide_password=False) def to_url(self) -> URL: - return URL.create(self.drivername, self.username, self.password, self.host, self.port, self.database, self.query) + return URL.create( + self.drivername, + self.username, + self.password, + self.host, + self.port, + self.database, + self.query, + ) def __str__(self) -> str: return self.to_url().render_as_string(hide_password=True) diff --git a/dlt/common/configuration/specs/exceptions.py b/dlt/common/configuration/specs/exceptions.py index 054d21c78c..7a0b283630 100644 --- a/dlt/common/configuration/specs/exceptions.py +++ b/dlt/common/configuration/specs/exceptions.py @@ -9,7 +9,10 @@ class SpecException(ConfigurationException): class OAuth2ScopesRequired(SpecException): def __init__(self, spec: type) -> None: self.spec = spec - super().__init__("Scopes are required to retrieve refresh_token. Use 'openid' scope for a token without any permissions to resources.") + super().__init__( + "Scopes are required to retrieve refresh_token. Use 'openid' scope for a token without" + " any permissions to resources." + ) class NativeValueError(SpecException, ValueError): @@ -22,29 +25,46 @@ def __init__(self, spec: Type[Any], native_value: str, msg: str) -> None: class InvalidConnectionString(NativeValueError): def __init__(self, spec: Type[Any], native_value: str, driver: str): driver = driver or "driver" - msg = f"The expected representation for {spec.__name__} is a standard database connection string with the following format: {driver}://username:password@host:port/database." + msg = ( + f"The expected representation for {spec.__name__} is a standard database connection" + f" string with the following format: {driver}://username:password@host:port/database." + ) super().__init__(spec, native_value, msg) class InvalidGoogleNativeCredentialsType(NativeValueError): def __init__(self, spec: Type[Any], native_value: Any): - msg = f"Credentials {spec.__name__} accept a string with serialized credentials json file or an instance of Credentials object from google.* namespace. The value passed is of type {type(native_value)}" + msg = ( + f"Credentials {spec.__name__} accept a string with serialized credentials json file or" + " an instance of Credentials object from google.* namespace. 
The value passed is of" + f" type {type(native_value)}" + ) super().__init__(spec, native_value, msg) class InvalidGoogleServicesJson(NativeValueError): def __init__(self, spec: Type[Any], native_value: Any): - msg = f"The expected representation for {spec.__name__} is a string with serialized service account credentials, where at least 'project_id', 'private_key' and 'client_email` keys are present" + msg = ( + f"The expected representation for {spec.__name__} is a string with serialized service" + " account credentials, where at least 'project_id', 'private_key' and 'client_email`" + " keys are present" + ) super().__init__(spec, native_value, msg) class InvalidGoogleOauth2Json(NativeValueError): def __init__(self, spec: Type[Any], native_value: Any): - msg = f"The expected representation for {spec.__name__} is a string with serialized oauth2 user info and may be wrapped in 'install'/'web' node - depending of oauth2 app type." + msg = ( + f"The expected representation for {spec.__name__} is a string with serialized oauth2" + " user info and may be wrapped in 'install'/'web' node - depending of oauth2 app type." + ) super().__init__(spec, native_value, msg) class InvalidBoto3Session(NativeValueError): def __init__(self, spec: Type[Any], native_value: Any): - msg = f"The expected representation for {spec.__name__} is and instance of boto3.Session containing credentials" + msg = ( + f"The expected representation for {spec.__name__} is and instance of boto3.Session" + " containing credentials" + ) super().__init__(spec, native_value, msg) diff --git a/dlt/common/configuration/specs/gcp_credentials.py b/dlt/common/configuration/specs/gcp_credentials.py index f96c1d44f5..fc42fe93d5 100644 --- a/dlt/common/configuration/specs/gcp_credentials.py +++ b/dlt/common/configuration/specs/gcp_credentials.py @@ -4,10 +4,20 @@ from dlt.common import json, pendulum from dlt.common.configuration.specs.api_credentials import OAuth2Credentials -from dlt.common.configuration.specs.exceptions import InvalidGoogleNativeCredentialsType, InvalidGoogleOauth2Json, InvalidGoogleServicesJson, NativeValueError, OAuth2ScopesRequired +from dlt.common.configuration.specs.exceptions import ( + InvalidGoogleNativeCredentialsType, + InvalidGoogleOauth2Json, + InvalidGoogleServicesJson, + NativeValueError, + OAuth2ScopesRequired, +) from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import DictStrAny, TSecretValue, StrAny -from dlt.common.configuration.specs.base_configuration import CredentialsConfiguration, CredentialsWithDefault, configspec +from dlt.common.configuration.specs.base_configuration import ( + CredentialsConfiguration, + CredentialsWithDefault, + configspec, +) from dlt.common.utils import is_interactive @@ -18,7 +28,9 @@ class GcpCredentials(CredentialsConfiguration): project_id: str = None - location: str = "US" # DEPRECATED! and present only for backward compatibility. please set bigquery location in BigQuery configuration + location: str = ( # DEPRECATED! and present only for backward compatibility. 
please set bigquery location in BigQuery configuration + "US" + ) def parse_native_representation(self, native_value: Any) -> None: if not isinstance(native_value, str): @@ -49,12 +61,13 @@ def parse_native_representation(self, native_value: Any) -> None: service_dict: DictStrAny = None try: from google.oauth2.service_account import Credentials as ServiceAccountCredentials + if isinstance(native_value, ServiceAccountCredentials): # extract credentials service_dict = { "project_id": native_value.project_id, "client_email": native_value.service_account_email, - "private_key": native_value # keep native credentials in private key + "private_key": native_value, # keep native credentials in private key } self.__is_resolved__ = True except ImportError: @@ -84,6 +97,7 @@ def to_native_credentials(self) -> Any: """Returns google.oauth2.service_account.Credentials""" from google.oauth2.service_account import Credentials as ServiceAccountCredentials + if isinstance(self.private_key, ServiceAccountCredentials): # private key holds the native instance if it was passed to parse_native_representation return self.private_key @@ -105,6 +119,7 @@ def parse_native_representation(self, native_value: Any) -> None: oauth_dict: DictStrAny = None try: from google.oauth2.credentials import Credentials as GoogleOAuth2Credentials + if isinstance(native_value, GoogleOAuth2Credentials): # extract credentials, project id may not be present oauth_dict = { @@ -113,7 +128,7 @@ def parse_native_representation(self, native_value: Any) -> None: "client_secret": native_value.client_secret, "refresh_token": native_value.refresh_token, "scopes": native_value.scopes, - "token": native_value.token + "token": native_value.token, } # if token is present, we are logged in self.__is_resolved__ = native_value.token is not None @@ -141,8 +156,12 @@ def auth(self, scopes: Union[str, List[str]] = None, redirect_url: str = None) - self.add_scopes(scopes) if not self.scopes: raise OAuth2ScopesRequired(self.__class__) - assert sys.stdin.isatty() or is_interactive(), "Must have a tty or interactive mode for web flow" - self.refresh_token, self.token = self._get_refresh_token(redirect_url or "http://localhost") + assert ( + sys.stdin.isatty() or is_interactive() + ), "Must have a tty or interactive mode for web flow" + self.refresh_token, self.token = self._get_refresh_token( + redirect_url or "http://localhost" + ) else: # if scopes or redirect_url: # logger.warning("Please note that scopes and redirect_url are ignored when getting access token") @@ -164,11 +183,10 @@ def _get_access_token(self) -> TSecretValue: raise MissingDependencyException("GcpOAuthCredentials", ["requests_oauthlib"]) google = OAuth2Session(client_id=self.client_id, scope=self.scopes) - extra = { - "client_id": self.client_id, - "client_secret": self.client_secret - } - token = google.refresh_token(token_url=self.token_uri, refresh_token=self.refresh_token, **extra)["access_token"] + extra = {"client_id": self.client_id, "client_secret": self.client_secret} + token = google.refresh_token( + token_url=self.token_uri, refresh_token=self.refresh_token, **extra + )["access_token"] return TSecretValue(token) def _get_refresh_token(self, redirect_url: str) -> Tuple[TSecretValue, TSecretValue]: @@ -191,9 +209,7 @@ def to_native_credentials(self) -> Any: return credentials def _installed_dict(self, redirect_url: str = "http://localhost") -> StrAny: - installed_dict = { - self.client_type: self._info_dict() - } + installed_dict = {self.client_type: self._info_dict()} if 
redirect_url: installed_dict[self.client_type]["redirect_uris"] = [redirect_url] @@ -211,13 +227,13 @@ def __str__(self) -> str: @configspec class GcpDefaultCredentials(CredentialsWithDefault, GcpCredentials): - _LAST_FAILED_DEFAULT: float = 0.0 def parse_native_representation(self, native_value: Any) -> None: """Accepts google credentials as native value""" try: from google.auth.credentials import Credentials as GoogleCredentials + if isinstance(native_value, GoogleCredentials): self.project_id = self.project_id or native_value.quota_project_id self._set_default_credentials(native_value) @@ -226,11 +242,12 @@ def parse_native_representation(self, native_value: Any) -> None: return except ImportError: pass - raise NativeValueError(self.__class__, native_value, "Default Google Credentials not present") + raise NativeValueError( + self.__class__, native_value, "Default Google Credentials not present" + ) @staticmethod def _get_default_credentials(retry_timeout_s: float = 600.0) -> Tuple[Any, str]: - now = pendulum.now().timestamp() if now - GcpDefaultCredentials._LAST_FAILED_DEFAULT < retry_timeout_s: return None, None @@ -268,7 +285,9 @@ def to_native_credentials(self) -> Any: @configspec -class GcpServiceAccountCredentials(GcpDefaultCredentials, GcpServiceAccountCredentialsWithoutDefaults): +class GcpServiceAccountCredentials( + GcpDefaultCredentials, GcpServiceAccountCredentialsWithoutDefaults +): def parse_native_representation(self, native_value: Any) -> None: try: GcpDefaultCredentials.parse_native_representation(self, native_value) diff --git a/dlt/common/configuration/specs/known_sections.py b/dlt/common/configuration/specs/known_sections.py index 31ca0ff7ff..97ba85ffd6 100644 --- a/dlt/common/configuration/specs/known_sections.py +++ b/dlt/common/configuration/specs/known_sections.py @@ -19,8 +19,8 @@ DATA_WRITER = "data_writer" """default section holding BufferedDataWriter settings""" -DBT_PACKAGE_RUNNER = "dbt_package_runner" +DBT_PACKAGE_RUNNER = "dbt_package_runner" """dbt package runner configuration (DBTRunnerConfiguration)""" -DBT_CLOUD = "dbt_cloud" -"""dbt cloud helpers configuration (DBTCloudConfiguration)""" \ No newline at end of file +DBT_CLOUD = "dbt_cloud" +"""dbt cloud helpers configuration (DBTCloudConfiguration)""" diff --git a/dlt/common/configuration/specs/run_configuration.py b/dlt/common/configuration/specs/run_configuration.py index 2ec3648dbe..4ca58c20db 100644 --- a/dlt/common/configuration/specs/run_configuration.py +++ b/dlt/common/configuration/specs/run_configuration.py @@ -16,7 +16,9 @@ class RunConfiguration(BaseConfiguration): slack_incoming_hook: Optional[TSecretStrValue] = None dlthub_telemetry: bool = True # enable or disable dlthub telemetry dlthub_telemetry_segment_write_key: str = "a1F2gc6cNYw2plyAt02sZouZcsRjG7TD" - log_format: str = '{asctime}|[{levelname:<21}]|{process}|{name}|{filename}|{funcName}:{lineno}|{message}' + log_format: str = ( + "{asctime}|[{levelname:<21}]|{process}|{name}|{filename}|{funcName}:{lineno}|{message}" + ) log_level: str = "WARNING" request_timeout: float = 60 """Timeout for http requests""" @@ -38,7 +40,9 @@ def on_resolved(self) -> None: # it may be obfuscated base64 value # TODO: that needs to be removed ASAP try: - self.slack_incoming_hook = TSecretStrValue(reveal_pseudo_secret(self.slack_incoming_hook, b"dlt-runtime-2022")) + self.slack_incoming_hook = TSecretStrValue( + reveal_pseudo_secret(self.slack_incoming_hook, b"dlt-runtime-2022") + ) except binascii.Error: # just keep the original value pass diff --git 
a/dlt/common/configuration/utils.py b/dlt/common/configuration/utils.py index 4841c8e3fa..5a7330447b 100644 --- a/dlt/common/configuration/utils.py +++ b/dlt/common/configuration/utils.py @@ -10,7 +10,10 @@ from dlt.common.data_types import coerce_value, py_type_to_sc_type from dlt.common.configuration.providers import EnvironProvider from dlt.common.configuration.exceptions import ConfigValueCannotBeCoercedException, LookupTrace -from dlt.common.configuration.specs.base_configuration import BaseConfiguration, is_base_configuration_inner_hint +from dlt.common.configuration.specs.base_configuration import ( + BaseConfiguration, + is_base_configuration_inner_hint, +) class ResolvedValueTrace(NamedTuple): @@ -111,40 +114,56 @@ def auto_cast(value: str) -> Any: return value - -def log_traces(config: Optional[BaseConfiguration], key: str, hint: Type[Any], value: Any, default_value: Any, traces: Sequence[LookupTrace]) -> None: +def log_traces( + config: Optional[BaseConfiguration], + key: str, + hint: Type[Any], + value: Any, + default_value: Any, + traces: Sequence[LookupTrace], +) -> None: from dlt.common import logger # if logger.is_logging() and logger.log_level() == "DEBUG" and config: # logger.debug(f"Field {key} with type {hint} in {type(config).__name__} {'NOT RESOLVED' if value is None else 'RESOLVED'}") - # print(f"Field {key} with type {hint} in {type(config).__name__} {'NOT RESOLVED' if value is None else 'RESOLVED'}") - # for tr in traces: - # # print(str(tr)) - # logger.debug(str(tr)) + # print(f"Field {key} with type {hint} in {type(config).__name__} {'NOT RESOLVED' if value is None else 'RESOLVED'}") + # for tr in traces: + # # print(str(tr)) + # logger.debug(str(tr)) # store all traces with resolved values resolved_trace = next((trace for trace in traces if trace.value is not None), None) if resolved_trace is not None: path = f'{".".join(resolved_trace.sections)}.{key}' - _RESOLVED_TRACES[path] = ResolvedValueTrace(key, resolved_trace.value, default_value, hint, resolved_trace.sections, resolved_trace.provider, config) + _RESOLVED_TRACES[path] = ResolvedValueTrace( + key, + resolved_trace.value, + default_value, + hint, + resolved_trace.sections, + resolved_trace.provider, + config, + ) def get_resolved_traces() -> Dict[str, ResolvedValueTrace]: return _RESOLVED_TRACES -def add_config_to_env(config: BaseConfiguration, sections: Tuple[str, ...] = ()) -> None: +def add_config_to_env(config: BaseConfiguration, sections: Tuple[str, ...] = ()) -> None: """Writes values in configuration back into environment using the naming convention of EnvironProvider. Will descend recursively if embedded BaseConfiguration instances are found""" if config.__section__: - sections += (config.__section__, ) + sections += (config.__section__,) return add_config_dict_to_env(dict(config), sections, overwrite_keys=True) -def add_config_dict_to_env(dict_: Mapping[str, Any], sections: Tuple[str, ...] = (), overwrite_keys: bool = False) -> None: +def add_config_dict_to_env( + dict_: Mapping[str, Any], sections: Tuple[str, ...] = (), overwrite_keys: bool = False +) -> None: """Writes values in dict_ back into environment using the naming convention of EnvironProvider. Applies `sections` if specified. 
Does not overwrite existing keys by default""" for k, v in dict_.items(): if isinstance(v, BaseConfiguration): if not v.__section__: - embedded_sections = sections + (k, ) + embedded_sections = sections + (k,) else: embedded_sections = sections add_config_to_env(v, embedded_sections) diff --git a/dlt/common/data_types/__init__.py b/dlt/common/data_types/__init__.py index 83e123f124..672aeddea4 100644 --- a/dlt/common/data_types/__init__.py +++ b/dlt/common/data_types/__init__.py @@ -1,6 +1,4 @@ from dlt.common.data_types.type_helpers import coerce_value, py_type_to_sc_type from dlt.common.data_types.typing import TDataType, DATA_TYPES -__all__ = [ - "coerce_value", "py_type_to_sc_type", "TDataType", "DATA_TYPES" -] +__all__ = ["coerce_value", "py_type_to_sc_type", "TDataType", "DATA_TYPES"] diff --git a/dlt/common/data_types/type_helpers.py b/dlt/common/data_types/type_helpers.py index f42f81b06f..9e1cd2278d 100644 --- a/dlt/common/data_types/type_helpers.py +++ b/dlt/common/data_types/type_helpers.py @@ -10,7 +10,12 @@ from dlt.common.json._simplejson import custom_encode as json_custom_encode from dlt.common.arithmetics import InvalidOperation from dlt.common.data_types.typing import TDataType -from dlt.common.time import ensure_pendulum_datetime, parse_iso_like_datetime, ensure_pendulum_date, ensure_pendulum_time +from dlt.common.time import ( + ensure_pendulum_datetime, + parse_iso_like_datetime, + ensure_pendulum_date, + ensure_pendulum_time, +) from dlt.common.utils import map_nested_in_place, str2bool @@ -93,7 +98,7 @@ def coerce_value(to_type: TDataType, from_type: TDataType, value: Any) -> Any: return map_nested_in_place(custom_pua_remove, value) # Make sure we use enum value instead of the object itself # This check is faster than `isinstance(value, Enum)` for non-enum types - if hasattr(value, 'value'): + if hasattr(value, "value"): if to_type == "text": return str(value.value) elif to_type == "bigint": @@ -120,7 +125,7 @@ def coerce_value(to_type: TDataType, from_type: TDataType, value: Any) -> Any: except binascii.Error: raise ValueError(value) if from_type == "bigint": - return value.to_bytes((value.bit_length() + 7) // 8, 'little') + return value.to_bytes((value.bit_length() + 7) // 8, "little") if to_type == "bigint": if from_type in ["wei", "decimal", "double"]: diff --git a/dlt/common/data_types/typing.py b/dlt/common/data_types/typing.py index c32e6a38c9..d061b28df0 100644 --- a/dlt/common/data_types/typing.py +++ b/dlt/common/data_types/typing.py @@ -1,5 +1,17 @@ from typing import Literal, Set, get_args -TDataType = Literal["text", "double", "bool", "timestamp", "bigint", "binary", "complex", "decimal", "wei", "date", "time"] +TDataType = Literal[ + "text", + "double", + "bool", + "timestamp", + "bigint", + "binary", + "complex", + "decimal", + "wei", + "date", + "time", +] DATA_TYPES: Set[TDataType] = set(get_args(TDataType)) diff --git a/dlt/common/data_writers/__init__.py b/dlt/common/data_writers/__init__.py index 5865466b8f..fefe2d6486 100644 --- a/dlt/common/data_writers/__init__.py +++ b/dlt/common/data_writers/__init__.py @@ -1,8 +1,16 @@ from dlt.common.data_writers.writers import DataWriter, TLoaderFileFormat from dlt.common.data_writers.buffered import BufferedDataWriter -from dlt.common.data_writers.escape import escape_redshift_literal, escape_redshift_identifier, escape_bigquery_identifier +from dlt.common.data_writers.escape import ( + escape_redshift_literal, + escape_redshift_identifier, + escape_bigquery_identifier, +) __all__ = [ - "DataWriter", 
"TLoaderFileFormat", "BufferedDataWriter", - "escape_redshift_literal", "escape_redshift_identifier", "escape_bigquery_identifier" + "DataWriter", + "TLoaderFileFormat", + "BufferedDataWriter", + "escape_redshift_literal", + "escape_redshift_identifier", + "escape_bigquery_identifier", ] diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 783a3501d2..0c0f0a62f5 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -5,7 +5,11 @@ from dlt.common.utils import uniq_id from dlt.common.typing import TDataItem, TDataItems from dlt.common.data_writers import TLoaderFileFormat -from dlt.common.data_writers.exceptions import BufferedDataWriterClosed, DestinationCapabilitiesRequired, InvalidFileNameTemplateException +from dlt.common.data_writers.exceptions import ( + BufferedDataWriterClosed, + DestinationCapabilitiesRequired, + InvalidFileNameTemplateException, +) from dlt.common.data_writers.writers import DataWriter from dlt.common.schema.typing import TTableSchemaColumns from dlt.common.configuration import with_config, known_sections, configspec @@ -17,7 +21,6 @@ class BufferedDataWriter(Generic[TWriter]): - @configspec class BufferedDataWriterConfiguration(BaseConfiguration): buffer_max_items: int = 5000 @@ -28,7 +31,6 @@ class BufferedDataWriterConfiguration(BaseConfiguration): __section__ = known_sections.DATA_WRITER - @with_config(spec=BufferedDataWriterConfiguration) def __init__( self, @@ -54,7 +56,11 @@ def __init__( self.file_max_bytes = file_max_bytes self.file_max_items = file_max_items # the open function is either gzip.open or open - self.open = gzip.open if self._file_format_spec.supports_compression and not disable_compression else open + self.open = ( + gzip.open + if self._file_format_spec.supports_compression and not disable_compression + else open + ) self._current_columns: TTableSchemaColumns = None self._file_name: str = None @@ -72,7 +78,11 @@ def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> int self._ensure_open() # rotate file if columns changed and writer does not allow for that # as the only allowed change is to add new column (no updates/deletes), we detect the change by comparing lengths - if self._writer and not self._writer.data_format().supports_schema_changes and len(columns) != len(self._current_columns): + if ( + self._writer + and not self._writer.data_format().supports_schema_changes + and len(columns) != len(self._current_columns) + ): assert len(columns) > len(self._current_columns) self._rotate_file() # until the first chunk is written we can change the columns schema freely @@ -131,7 +141,9 @@ def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb def _rotate_file(self) -> None: self._flush_and_close_file() - self._file_name = self.file_name_template % uniq_id(5) + "." + self._file_format_spec.file_extension + self._file_name = ( + self.file_name_template % uniq_id(5) + "." 
+ self._file_format_spec.file_extension + ) def _flush_items(self, allow_empty_file: bool = False) -> None: if self._buffered_items_count > 0 or allow_empty_file: @@ -139,9 +151,9 @@ def _flush_items(self, allow_empty_file: bool = False) -> None: if not self._writer: # create new writer and write header if self._file_format_spec.is_binary_format: - self._file = self.open(self._file_name, "wb") # type: ignore + self._file = self.open(self._file_name, "wb") # type: ignore else: - self._file = self.open(self._file_name, "wt", encoding="utf-8") # type: ignore + self._file = self.open(self._file_name, "wt", encoding="utf-8") # type: ignore self._writer = DataWriter.from_file_format(self.file_format, self._file, caps=self._caps) # type: ignore[assignment] self._writer.write_header(self._current_columns) # write buffer diff --git a/dlt/common/data_writers/escape.py b/dlt/common/data_writers/escape.py index 0656a69634..5bf8f29ccb 100644 --- a/dlt/common/data_writers/escape.py +++ b/dlt/common/data_writers/escape.py @@ -8,14 +8,19 @@ # use regex to escape characters in single pass SQL_ESCAPE_DICT = {"'": "''", "\\": "\\\\", "\n": "\\n", "\r": "\\r"} + def _make_sql_escape_re(escape_dict: Dict[str, str]) -> re.Pattern: # type: ignore[type-arg] - return re.compile("|".join([re.escape(k) for k in sorted(escape_dict, key=len, reverse=True)]), flags=re.DOTALL) + return re.compile( + "|".join([re.escape(k) for k in sorted(escape_dict, key=len, reverse=True)]), + flags=re.DOTALL, + ) SQL_ESCAPE_RE = _make_sql_escape_re(SQL_ESCAPE_DICT) + def _escape_extended( - v: str, prefix:str = "E'", escape_dict: Dict[str, str] = None, escape_re: re.Pattern = None # type: ignore[type-arg] + v: str, prefix: str = "E'", escape_dict: Dict[str, str] = None, escape_re: re.Pattern = None # type: ignore[type-arg] ) -> str: escape_dict = escape_dict or SQL_ESCAPE_DICT escape_re = escape_re or SQL_ESCAPE_RE @@ -33,7 +38,7 @@ def escape_redshift_literal(v: Any) -> Any: if isinstance(v, (datetime, date, time)): return f"'{v.isoformat()}'" if isinstance(v, (list, dict)): - return "json_parse(%s)" % _escape_extended(json.dumps(v), prefix='\'') + return "json_parse(%s)" % _escape_extended(json.dumps(v), prefix="'") if v is None: return "NULL" @@ -74,21 +79,26 @@ def escape_duckdb_literal(v: Any) -> Any: MS_SQL_ESCAPE_DICT = { "'": "''", - '\n': "' + CHAR(10) + N'", - '\r': "' + CHAR(13) + N'", - '\t': "' + CHAR(9) + N'", + "\n": "' + CHAR(10) + N'", + "\r": "' + CHAR(13) + N'", + "\t": "' + CHAR(9) + N'", } MS_SQL_ESCAPE_RE = _make_sql_escape_re(MS_SQL_ESCAPE_DICT) + def escape_mssql_literal(v: Any) -> Any: if isinstance(v, str): - return _escape_extended(v, prefix="N'", escape_dict=MS_SQL_ESCAPE_DICT, escape_re=MS_SQL_ESCAPE_RE) + return _escape_extended( + v, prefix="N'", escape_dict=MS_SQL_ESCAPE_DICT, escape_re=MS_SQL_ESCAPE_RE + ) if isinstance(v, (datetime, date, time)): return f"'{v.isoformat()}'" if isinstance(v, (list, dict)): - return _escape_extended(json.dumps(v), prefix="N'", escape_dict=MS_SQL_ESCAPE_DICT, escape_re=MS_SQL_ESCAPE_RE) + return _escape_extended( + json.dumps(v), prefix="N'", escape_dict=MS_SQL_ESCAPE_DICT, escape_re=MS_SQL_ESCAPE_RE + ) if isinstance(v, bytes): - base_64_string = base64.b64encode(v).decode('ascii') + base_64_string = base64.b64encode(v).decode("ascii") return f"""CAST('' AS XML).value('xs:base64Binary("{base_64_string}")', 'VARBINARY(MAX)')""" if isinstance(v, bool): return str(int(v)) @@ -107,7 +117,7 @@ def escape_redshift_identifier(v: str) -> str: def escape_bigquery_identifier(v: 
str) -> str: # https://cloud.google.com/bigquery/docs/reference/standard-sql/lexical - return "`" + v.replace("\\", "\\\\").replace("`","\\`") + "`" + return "`" + v.replace("\\", "\\\\").replace("`", "\\`") + "`" def escape_snowflake_identifier(v: str) -> str: diff --git a/dlt/common/data_writers/exceptions.py b/dlt/common/data_writers/exceptions.py index a86bd9440e..d3a073cf4e 100644 --- a/dlt/common/data_writers/exceptions.py +++ b/dlt/common/data_writers/exceptions.py @@ -9,7 +9,10 @@ class DataWriterException(DltException): class InvalidFileNameTemplateException(DataWriterException, ValueError): def __init__(self, file_name_template: str): self.file_name_template = file_name_template - super().__init__(f"Wrong file name template {file_name_template}. File name template must contain exactly one %s formatter") + super().__init__( + f"Wrong file name template {file_name_template}. File name template must contain" + " exactly one %s formatter" + ) class BufferedDataWriterClosed(DataWriterException): @@ -21,4 +24,6 @@ def __init__(self, file_name: str): class DestinationCapabilitiesRequired(DataWriterException, ValueError): def __init__(self, file_format: TLoaderFileFormat): self.file_format = file_format - super().__init__(f"Writer for {file_format} requires destination capabilities which were not provided.") + super().__init__( + f"Writer for {file_format} requires destination capabilities which were not provided." + ) diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 412e732e97..2801656dc3 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -45,18 +45,21 @@ def write_all(self, columns_schema: TTableSchemaColumns, rows: Sequence[Any]) -> self.write_data(rows) self.write_footer() - @classmethod @abc.abstractmethod def data_format(cls) -> TFileFormatSpec: pass @classmethod - def from_file_format(cls, file_format: TLoaderFileFormat, f: IO[Any], caps: DestinationCapabilitiesContext = None) -> "DataWriter": + def from_file_format( + cls, file_format: TLoaderFileFormat, f: IO[Any], caps: DestinationCapabilitiesContext = None + ) -> "DataWriter": return cls.class_factory(file_format)(f, caps) @classmethod - def from_destination_capabilities(cls, caps: DestinationCapabilitiesContext, f: IO[Any]) -> "DataWriter": + def from_destination_capabilities( + cls, caps: DestinationCapabilitiesContext, f: IO[Any] + ) -> "DataWriter": return cls.class_factory(caps.preferred_loader_file_format)(f, caps) @classmethod @@ -74,13 +77,12 @@ def class_factory(file_format: TLoaderFileFormat) -> Type["DataWriter"]: elif file_format == "parquet": return ParquetDataWriter # type: ignore elif file_format == "arrow": - return ArrowWriter # type: ignore + return ArrowWriter # type: ignore else: raise ValueError(file_format) class JsonlWriter(DataWriter): - def write_header(self, columns_schema: TTableSchemaColumns) -> None: pass @@ -105,7 +107,6 @@ def data_format(cls) -> TFileFormatSpec: class JsonlListPUAEncodeWriter(JsonlWriter): - def write_data(self, rows: Sequence[Any]) -> None: # skip JsonlWriter when calling super super(JsonlWriter, self).write_data(rows) @@ -126,7 +127,6 @@ def data_format(cls) -> TFileFormatSpec: class InsertValuesWriter(DataWriter): - def __init__(self, f: IO[Any], caps: DestinationCapabilitiesContext = None) -> None: super().__init__(f, caps) self._chunks_written = 0 @@ -148,7 +148,7 @@ def write_data(self, rows: Sequence[Any]) -> None: def write_row(row: StrAny) -> None: output = ["NULL"] * 
len(self._headers_lookup) - for n,v in row.items(): + for n, v in row.items(): output[self._headers_lookup[n]] = self._caps.escape_literal(v) self._f.write("(") self._f.write(",".join(output)) @@ -194,19 +194,20 @@ class ParquetDataWriterConfiguration(BaseConfiguration): __section__: str = known_sections.DATA_WRITER -class ParquetDataWriter(DataWriter): +class ParquetDataWriter(DataWriter): @with_config(spec=ParquetDataWriterConfiguration) - def __init__(self, - f: IO[Any], - caps: DestinationCapabilitiesContext = None, - *, - flavor: str = "spark", - version: str = "2.4", - data_page_size: int = 1024 * 1024, - timestamp_timezone: str = "UTC", - row_group_size: Optional[int] = None - ) -> None: + def __init__( + self, + f: IO[Any], + caps: DestinationCapabilitiesContext = None, + *, + flavor: str = "spark", + version: str = "2.4", + data_page_size: int = 1024 * 1024, + timestamp_timezone: str = "UTC", + row_group_size: Optional[int] = None, + ) -> None: super().__init__(f, caps) from dlt.common.libs.pyarrow import pyarrow @@ -221,24 +222,35 @@ def __init__(self, def _create_writer(self, schema: "pa.Schema") -> "pa.parquet.ParquetWriter": from dlt.common.libs.pyarrow import pyarrow - return pyarrow.parquet.ParquetWriter(self._f, schema, flavor=self.parquet_flavor, version=self.parquet_version, data_page_size=self.parquet_data_page_size) + + return pyarrow.parquet.ParquetWriter( + self._f, + schema, + flavor=self.parquet_flavor, + version=self.parquet_version, + data_page_size=self.parquet_data_page_size, + ) def write_header(self, columns_schema: TTableSchemaColumns) -> None: from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype # build schema self.schema = pyarrow.schema( - [pyarrow.field( - name, - get_py_arrow_datatype(schema_item, self._caps, self.timestamp_timezone), - nullable=schema_item.get("nullable", True) - ) for name, schema_item in columns_schema.items()] + [ + pyarrow.field( + name, + get_py_arrow_datatype(schema_item, self._caps, self.timestamp_timezone), + nullable=schema_item.get("nullable", True), + ) + for name, schema_item in columns_schema.items() + ] ) # find row items that are of the complex type (could be abstracted out for use in other writers?) 
- self.complex_indices = [i for i, field in columns_schema.items() if field["data_type"] == "complex"] + self.complex_indices = [ + i for i, field in columns_schema.items() if field["data_type"] == "complex" + ] self.writer = self._create_writer(self.schema) - def write_data(self, rows: Sequence[Any]) -> None: super().write_data(rows) from dlt.common.libs.pyarrow import pyarrow @@ -257,10 +269,16 @@ def write_footer(self) -> None: self.writer.close() self.writer = None - @classmethod def data_format(cls) -> TFileFormatSpec: - return TFileFormatSpec("parquet", "parquet", True, False, requires_destination_capabilities=True, supports_compression=False) + return TFileFormatSpec( + "parquet", + "parquet", + True, + False, + requires_destination_capabilities=True, + supports_compression=False, + ) class ArrowWriter(ParquetDataWriter): @@ -270,6 +288,7 @@ def write_header(self, columns_schema: TTableSchemaColumns) -> None: def write_data(self, rows: Sequence[Any]) -> None: from dlt.common.libs.pyarrow import pyarrow + rows = list(rows) if not rows: return diff --git a/dlt/common/destination/__init__.py b/dlt/common/destination/__init__.py index 4857851fa9..00f129c69c 100644 --- a/dlt/common/destination/__init__.py +++ b/dlt/common/destination/__init__.py @@ -1,4 +1,8 @@ -from dlt.common.destination.capabilities import DestinationCapabilitiesContext, TLoaderFileFormat, ALL_SUPPORTED_FILE_FORMATS +from dlt.common.destination.capabilities import ( + DestinationCapabilitiesContext, + TLoaderFileFormat, + ALL_SUPPORTED_FILE_FORMATS, +) from dlt.common.destination.reference import TDestinationReferenceArg, Destination, TDestination __all__ = [ diff --git a/dlt/common/destination/capabilities.py b/dlt/common/destination/capabilities.py index 06504ee590..2596b2bf99 100644 --- a/dlt/common/destination/capabilities.py +++ b/dlt/common/destination/capabilities.py @@ -14,17 +14,22 @@ # puae-jsonl - internal extract -> normalize format bases on jsonl # insert_values - insert SQL statements # sql - any sql statement -TLoaderFileFormat = Literal["jsonl", "puae-jsonl", "insert_values", "sql", "parquet", "reference", "arrow"] +TLoaderFileFormat = Literal[ + "jsonl", "puae-jsonl", "insert_values", "sql", "parquet", "reference", "arrow" +] ALL_SUPPORTED_FILE_FORMATS: Set[TLoaderFileFormat] = set(get_args(TLoaderFileFormat)) # file formats used internally by dlt INTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = {"puae-jsonl", "sql", "reference", "arrow"} # file formats that may be chosen by the user -EXTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = set(get_args(TLoaderFileFormat)) - INTERNAL_LOADER_FILE_FORMATS +EXTERNAL_LOADER_FILE_FORMATS: Set[TLoaderFileFormat] = ( + set(get_args(TLoaderFileFormat)) - INTERNAL_LOADER_FILE_FORMATS +) @configspec class DestinationCapabilitiesContext(ContainerInjectableContext): """Injectable destination capabilities required for many Pipeline stages ie. 
normalize""" + preferred_loader_file_format: TLoaderFileFormat supported_loader_file_formats: List[TLoaderFileFormat] preferred_staging_file_format: Optional[TLoaderFileFormat] @@ -52,7 +57,9 @@ class DestinationCapabilitiesContext(ContainerInjectableContext): can_create_default: ClassVar[bool] = False @staticmethod - def generic_capabilities(preferred_loader_file_format: TLoaderFileFormat = None) -> "DestinationCapabilitiesContext": + def generic_capabilities( + preferred_loader_file_format: TLoaderFileFormat = None, + ) -> "DestinationCapabilitiesContext": caps = DestinationCapabilitiesContext() caps.preferred_loader_file_format = preferred_loader_file_format caps.supported_loader_file_formats = ["jsonl", "insert_values", "parquet"] diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 1c3560cbbd..953769ec39 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -1,14 +1,38 @@ from abc import ABC, abstractmethod, abstractproperty from importlib import import_module from types import TracebackType, ModuleType -from typing import ClassVar, Final, Optional, NamedTuple, Literal, Sequence, Iterable, Type, Protocol, Union, TYPE_CHECKING, cast, List, ContextManager, Dict, Any, Callable, TypeVar, Generic +from typing import ( + ClassVar, + Final, + Optional, + NamedTuple, + Literal, + Sequence, + Iterable, + Type, + Protocol, + Union, + TYPE_CHECKING, + cast, + List, + ContextManager, + Dict, + Any, + Callable, + TypeVar, + Generic, +) from contextlib import contextmanager import datetime # noqa: 251 from copy import deepcopy import inspect from dlt.common import logger -from dlt.common.exceptions import IdentifierTooLongException, InvalidDestinationReference, UnknownDestinationModule +from dlt.common.exceptions import ( + IdentifierTooLongException, + InvalidDestinationReference, + UnknownDestinationModule, +) from dlt.common.schema import Schema, TTableSchema, TSchemaTables from dlt.common.schema.typing import TWriteDisposition from dlt.common.schema.exceptions import InvalidDatasetName @@ -38,6 +62,7 @@ class StorageSchemaInfo(NamedTuple): inserted_at: datetime.datetime schema: str + class StateInfo(NamedTuple): version: int engine_version: int @@ -46,6 +71,7 @@ class StateInfo(NamedTuple): created_at: datetime.datetime dlt_load_id: str = None + @configspec class DestinationClientConfiguration(BaseConfiguration): destination_name: str = None # which destination to load data to @@ -60,9 +86,12 @@ def __str__(self) -> str: return str(self.credentials) if TYPE_CHECKING: - def __init__(self, destination_name: str = None, credentials: Optional[CredentialsConfiguration] = None -) -> None: - ... + + def __init__( + self, + destination_name: str = None, + credentials: Optional[CredentialsConfiguration] = None, + ) -> None: ... @configspec @@ -79,7 +108,7 @@ class DestinationClientDwhConfiguration(DestinationClientConfiguration): def normalize_dataset_name(self, schema: Schema) -> str: """Builds full db dataset (schema) name out of configured dataset name and schema name: {dataset_name}_{schema.name}. The resulting name is normalized. - If default schema name is None or equals schema.name, the schema suffix is skipped. + If default schema name is None or equals schema.name, the schema suffix is skipped. 
""" if not schema.name: raise ValueError("schema_name is None or empty") @@ -87,32 +116,41 @@ def normalize_dataset_name(self, schema: Schema) -> str: # if default schema is None then suffix is not added if self.default_schema_name is not None and schema.name != self.default_schema_name: # also normalize schema name. schema name is Python identifier and here convention may be different - return schema.naming.normalize_table_identifier((self.dataset_name or "") + "_" + schema.name) - - return self.dataset_name if not self.dataset_name else schema.naming.normalize_table_identifier(self.dataset_name) + return schema.naming.normalize_table_identifier( + (self.dataset_name or "") + "_" + schema.name + ) + + return ( + self.dataset_name + if not self.dataset_name + else schema.naming.normalize_table_identifier(self.dataset_name) + ) if TYPE_CHECKING: + def __init__( self, destination_name: str = None, credentials: Optional[CredentialsConfiguration] = None, dataset_name: str = None, default_schema_name: Optional[str] = None, - ) -> None: - ... + ) -> None: ... + @configspec class DestinationClientStagingConfiguration(DestinationClientDwhConfiguration): """Configuration of a staging destination, able to store files with desired `layout` at `bucket_url`. - Also supports datasets and can act as standalone destination. + Also supports datasets and can act as standalone destination. """ + as_staging: bool = False bucket_url: str = None # layout of the destination files layout: str = "{table_name}/{load_id}.{file_id}.{ext}" if TYPE_CHECKING: + def __init__( self, destination_name: str = None, @@ -121,25 +159,26 @@ def __init__( default_schema_name: Optional[str] = None, as_staging: bool = False, bucket_url: str = None, - layout: str = None - ) -> None: - ... + layout: str = None, + ) -> None: ... + @configspec class DestinationClientDwhWithStagingConfiguration(DestinationClientDwhConfiguration): """Configuration of a destination that can take data from staging destination""" + staging_config: Optional[DestinationClientStagingConfiguration] = None """configuration of the staging, if present, injected at runtime""" if TYPE_CHECKING: + def __init__( self, destination_name: str = None, credentials: Optional[CredentialsConfiguration] = None, dataset_name: str = None, default_schema_name: Optional[str] = None, - staging_config: Optional[DestinationClientStagingConfiguration] = None - ) -> None: - ... + staging_config: Optional[DestinationClientStagingConfiguration] = None, + ) -> None: ... TLoadJobState = Literal["running", "failed", "retry", "completed"] @@ -148,14 +187,15 @@ def __init__( class LoadJob: """Represents a job that loads a single file - Each job starts in "running" state and ends in one of terminal states: "retry", "failed" or "completed". - Each job is uniquely identified by a file name. The file is guaranteed to exist in "running" state. In terminal state, the file may not be present. - In "running" state, the loader component periodically gets the state via `status()` method. When terminal state is reached, load job is discarded and not called again. - `exception` method is called to get error information in "failed" and "retry" states. + Each job starts in "running" state and ends in one of terminal states: "retry", "failed" or "completed". + Each job is uniquely identified by a file name. The file is guaranteed to exist in "running" state. In terminal state, the file may not be present. + In "running" state, the loader component periodically gets the state via `status()` method. 
When terminal state is reached, load job is discarded and not called again. + `exception` method is called to get error information in "failed" and "retry" states. - The `__init__` method is responsible to put the Job in "running" state. It may raise `LoadClientTerminalException` and `LoadClientTransientException` to - immediately transition job into "failed" or "retry" state respectively. + The `__init__` method is responsible to put the Job in "running" state. It may raise `LoadClientTerminalException` and `LoadClientTransientException` to + immediately transition job into "failed" or "retry" state respectively. """ + def __init__(self, file_name: str) -> None: """ File name is also a job id (or job id is deterministically derived) so it must be globally unique @@ -198,12 +238,12 @@ def new_file_path(self) -> str: class FollowupJob: """Adds a trait that allows to create a followup job""" + def create_followup_jobs(self, next_state: str) -> List[NewLoadJob]: return [] class JobClientBase(ABC): - capabilities: ClassVar[DestinationCapabilitiesContext] = None def __init__(self, schema: Schema, config: DestinationClientConfiguration) -> None: @@ -212,8 +252,7 @@ def __init__(self, schema: Schema, config: DestinationClientConfiguration) -> No @abstractmethod def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: - """Prepares storage to be used ie. creates database schema or file system folder. Truncates requested tables. - """ + """Prepares storage to be used ie. creates database schema or file system folder. Truncates requested tables.""" pass @abstractmethod @@ -226,7 +265,9 @@ def drop_storage(self) -> None: """Brings storage back into not initialized state. Typically data in storage is destroyed.""" pass - def update_stored_schema(self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None) -> Optional[TSchemaTables]: + def update_stored_schema( + self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None + ) -> Optional[TSchemaTables]: """Updates storage to the current schema. Implementations should not assume that `expected_update` is the exact difference between destination state and the self.schema. 
This is only the case if @@ -254,7 +295,9 @@ def restore_file_load(self, file_path: str) -> LoadJob: def should_truncate_table_before_load(self, table: TTableSchema) -> bool: return table["write_disposition"] == "replace" - def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def create_table_chain_completed_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] @@ -268,7 +311,9 @@ def __enter__(self) -> "JobClientBase": pass @abstractmethod - def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType + ) -> None: pass def _verify_schema(self) -> None: @@ -281,17 +326,27 @@ def _verify_schema(self) -> None: for table in self.schema.data_tables(): table_name = table["name"] if len(table_name) > self.capabilities.max_identifier_length: - raise IdentifierTooLongException(self.config.destination_name, "table", table_name, self.capabilities.max_identifier_length) + raise IdentifierTooLongException( + self.config.destination_name, + "table", + table_name, + self.capabilities.max_identifier_length, + ) for column_name, column in dict(table["columns"]).items(): if len(column_name) > self.capabilities.max_column_identifier_length: raise IdentifierTooLongException( self.config.destination_name, "column", f"{table_name}.{column_name}", - self.capabilities.max_column_identifier_length + self.capabilities.max_column_identifier_length, ) if not is_complete_column(column): - logger.warning(f"A column {column_name} in table {table_name} in schema {self.schema.name} is incomplete. It was not bound to the data during normalizations stage and its data type is unknown. Did you add this column manually in code ie. as a merge key?") + logger.warning( + f"A column {column_name} in table {table_name} in schema" + f" {self.schema.name} is incomplete. It was not bound to the data during" + " normalizations stage and its data type is unknown. Did you add this" + " column manually in code ie. as a merge key?" + ) def get_load_table(self, table_name: str, prepare_for_staging: bool = False) -> TTableSchema: if table_name not in self.schema.tables: @@ -310,7 +365,6 @@ def get_load_table(self, table_name: str, prepare_for_staging: bool = False) -> class WithStateSync(ABC): - @abstractmethod def get_stored_schema(self) -> Optional[StorageSchemaInfo]: """Retrieves newest schema from destination storage""" @@ -334,20 +388,24 @@ def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: return False @abstractmethod - def with_staging_dataset(self)-> ContextManager["JobClientBase"]: + def with_staging_dataset(self) -> ContextManager["JobClientBase"]: """Executes job client methods on staging dataset""" return self # type: ignore -class SupportsStagingDestination(): + +class SupportsStagingDestination: """Adds capability to support a staging destination for the load""" - def should_load_data_to_staging_dataset_on_staging_destination(self, table: TTableSchema) -> bool: + def should_load_data_to_staging_dataset_on_staging_destination( + self, table: TTableSchema + ) -> bool: return False def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: # the default is to truncate the tables on the staging destination... 
return True + TDestinationReferenceArg = Union[str, "Destination", None] @@ -355,6 +413,7 @@ class Destination(ABC, Generic[TDestinationConfig, TDestinationClient]): """A destination factory that can be partially pre-configured with credentials and other config params. """ + config_params: Optional[Dict[str, Any]] = None def __init__(self, **kwargs: Any) -> None: @@ -364,8 +423,7 @@ def __init__(self, **kwargs: Any) -> None: sig = inspect.signature(self.__class__) params = sig.parameters self.config_params = { - k: v for k, v in kwargs.items() - if k not in params or v != params[k].default + k: v for k, v in kwargs.items() if k not in params or v != params[k].default } @property @@ -390,13 +448,12 @@ def client_class(self) -> Type[TDestinationClient]: ... def configuration(self, initial_config: TDestinationConfig) -> TDestinationConfig: - """Get a fully resolved destination config from the initial config - """ + """Get a fully resolved destination config from the initial config""" return resolve_configuration( initial_config, sections=(known_sections.DESTINATION, self.name), # Already populated values will supersede resolved env config - explicit_value=self.config_params + explicit_value=self.config_params, ) @staticmethod @@ -408,7 +465,11 @@ def to_name(ref: TDestinationReferenceArg) -> str: return ref.name @staticmethod - def from_reference(ref: TDestinationReferenceArg, credentials: Optional[CredentialsConfiguration] = None, **kwargs: Any) -> Optional["Destination[DestinationClientConfiguration, JobClientBase]"]: + def from_reference( + ref: TDestinationReferenceArg, + credentials: Optional[CredentialsConfiguration] = None, + **kwargs: Any, + ) -> Optional["Destination[DestinationClientConfiguration, JobClientBase]"]: """Instantiate destination from str reference. The ref can be a destination name or import path pointing to a destination class (e.g. 
`dlt.destinations.postgres`) """ @@ -424,12 +485,15 @@ def from_reference(ref: TDestinationReferenceArg, credentials: Optional[Credenti dest_module = import_module(module_path) else: from dlt import destinations as dest_module + attr_name = ref except ModuleNotFoundError as e: raise UnknownDestinationModule(ref) from e try: - factory: Type[Destination[DestinationClientConfiguration, JobClientBase]] = getattr(dest_module, attr_name) + factory: Type[Destination[DestinationClientConfiguration, JobClientBase]] = getattr( + dest_module, attr_name + ) except AttributeError as e: raise UnknownDestinationModule(ref) from e if credentials: @@ -441,7 +505,9 @@ def from_reference(ref: TDestinationReferenceArg, credentials: Optional[Credenti raise InvalidDestinationReference(ref) from e return dest - def client(self, schema: Schema, initial_config: TDestinationConfig = config.value) -> TDestinationClient: + def client( + self, schema: Schema, initial_config: TDestinationConfig = config.value + ) -> TDestinationClient: """Returns a configured instance of the destination's job client""" return self.client_class(schema, self.configuration(initial_config)) diff --git a/dlt/common/exceptions.py b/dlt/common/exceptions.py index aa987f6766..36e98d8c00 100644 --- a/dlt/common/exceptions.py +++ b/dlt/common/exceptions.py @@ -6,10 +6,14 @@ def __reduce__(self) -> Any: """Enables exceptions with parametrized constructor to be pickled""" return type(self).__new__, (type(self), *self.args), self.__dict__ + class UnsupportedProcessStartMethodException(DltException): def __init__(self, method: str) -> None: self.method = method - super().__init__(f"Process pool supports only fork start method, {method} not supported. Switch the pool type to threading") + super().__init__( + f"Process pool supports only fork start method, {method} not supported. Switch the pool" + " type to threading" + ) class CannotInstallDependencies(DltException): @@ -20,7 +24,10 @@ def __init__(self, dependencies: Sequence[str], interpreter: str, output: AnyStr str_output = output.decode("utf-8") else: str_output = output - super().__init__(f"Cannot install dependencies {', '.join(dependencies)} with {interpreter} and pip:\n{str_output}\n") + super().__init__( + f"Cannot install dependencies {', '.join(dependencies)} with {interpreter} and" + f" pip:\n{str_output}\n" + ) class VenvNotFound(DltException): @@ -49,6 +56,7 @@ class TerminalValueError(ValueError, TerminalException): class SignalReceivedException(KeyboardInterrupt, TerminalException): """Raises when signal comes. 
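
A usage sketch for the `Destination` factory reformatted in the hunk above (illustrative only; the import path is the one used elsewhere in this patch, the DSN is a placeholder, and the printed name is an expectation rather than a guarantee):

```python
from dlt.common.destination import Destination  # import path used elsewhere in this patch

# Resolve a destination factory from a string reference. Per the docstring above,
# the ref may be a known short name or a full import path such as
# "dlt.destinations.postgres"; both resolve to the same factory.
dest = Destination.from_reference("dlt.destinations.postgres")
print(dest.name)  # expected: "postgres"

# credentials and other factory kwargs may be attached at resolution time
# (the DSN below is a placeholder)
dest = Destination.from_reference(
    "postgres", credentials="postgresql://loader:loader@localhost/dlt_data"
)
```
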
Derives from `BaseException` to not be caught in regular exception handlers.""" + def __init__(self, signal_code: int) -> None: self.signal_code = signal_code super().__init__(f"Signal {signal_code} received") @@ -87,7 +95,7 @@ def _get_msg(self, appendix: str) -> str: return msg def _to_pip_install(self) -> str: - return "\n".join([f"pip install \"{d}\"" for d in self.dependencies]) + return "\n".join([f'pip install "{d}"' for d in self.dependencies]) class SystemConfigurationException(DltException): @@ -132,11 +140,13 @@ def __init__(self, destination: str) -> None: self.destination = destination super().__init__(f"Destination {destination} does not support loading via staging.") + class DestinationLoadingWithoutStagingNotSupported(DestinationTerminalException): def __init__(self, destination: str) -> None: self.destination = destination super().__init__(f"Destination {destination} does not support loading without staging.") + class DestinationNoStagingMode(DestinationTerminalException): def __init__(self, destination: str) -> None: self.destination = destination @@ -144,7 +154,9 @@ def __init__(self, destination: str) -> None: class DestinationIncompatibleLoaderFileFormatException(DestinationTerminalException): - def __init__(self, destination: str, staging: str, file_format: str, supported_formats: Iterable[str]) -> None: + def __init__( + self, destination: str, staging: str, file_format: str, supported_formats: Iterable[str] + ) -> None: self.destination = destination self.staging = staging self.file_format = file_format @@ -152,21 +164,41 @@ def __init__(self, destination: str, staging: str, file_format: str, supported_f supported_formats_str = ", ".join(supported_formats) if self.staging: if not supported_formats: - msg = f"Staging {staging} cannot be used with destination {destination} because they have no file formats in common." + msg = ( + f"Staging {staging} cannot be used with destination {destination} because they" + " have no file formats in common." + ) else: - msg = f"Unsupported file format {file_format} for destination {destination} in combination with staging destination {staging}. Supported formats: {supported_formats_str}" + msg = ( + f"Unsupported file format {file_format} for destination {destination} in" + f" combination with staging destination {staging}. Supported formats:" + f" {supported_formats_str}" + ) else: - msg = f"Unsupported file format {file_format} destination {destination}. Supported formats: {supported_formats_str}. Check the staging option in the dlt.pipeline for additional formats." + msg = ( + f"Unsupported file format {file_format} destination {destination}. Supported" + f" formats: {supported_formats_str}. Check the staging option in the dlt.pipeline" + " for additional formats." 
+ ) super().__init__(msg) class IdentifierTooLongException(DestinationTerminalException): - def __init__(self, destination_name: str, identifier_type: str, identifier_name: str, max_identifier_length: int) -> None: + def __init__( + self, + destination_name: str, + identifier_type: str, + identifier_name: str, + max_identifier_length: int, + ) -> None: self.destination_name = destination_name self.identifier_type = identifier_type self.identifier_name = identifier_name self.max_identifier_length = max_identifier_length - super().__init__(f"The length of {identifier_type} {identifier_name} exceeds {max_identifier_length} allowed for {destination_name}") + super().__init__( + f"The length of {identifier_type} {identifier_name} exceeds" + f" {max_identifier_length} allowed for {destination_name}" + ) class DestinationHasFailedJobs(DestinationTerminalException): @@ -174,7 +206,9 @@ def __init__(self, destination_name: str, load_id: str, failed_jobs: List[Any]) self.destination_name = destination_name self.load_id = load_id self.failed_jobs = failed_jobs - super().__init__(f"Destination {destination_name} has failed jobs in load package {load_id}") + super().__init__( + f"Destination {destination_name} has failed jobs in load package {load_id}" + ) class PipelineException(DltException): @@ -187,21 +221,37 @@ def __init__(self, pipeline_name: str, msg: str) -> None: class PipelineStateNotAvailable(PipelineException): def __init__(self, source_state_key: Optional[str] = None) -> None: if source_state_key: - msg = f"The source {source_state_key} requested the access to pipeline state but no pipeline is active right now." + msg = ( + f"The source {source_state_key} requested the access to pipeline state but no" + " pipeline is active right now." + ) else: - msg = "The resource you called requested the access to pipeline state but no pipeline is active right now." - msg += " Call dlt.pipeline(...) before you call the @dlt.source or @dlt.resource decorated function." + msg = ( + "The resource you called requested the access to pipeline state but no pipeline is" + " active right now." + ) + msg += ( + " Call dlt.pipeline(...) before you call the @dlt.source or @dlt.resource decorated" + " function." + ) self.source_state_key = source_state_key super().__init__(None, msg) class ResourceNameNotAvailable(PipelineException): def __init__(self) -> None: - super().__init__(None, - "A resource state was requested but no active extract pipe context was found. Resource state may be only requested from @dlt.resource decorated function or with explicit resource name.") + super().__init__( + None, + "A resource state was requested but no active extract pipe context was found. Resource" + " state may be only requested from @dlt.resource decorated function or with explicit" + " resource name.", + ) class SourceSectionNotAvailable(PipelineException): def __init__(self) -> None: - msg = "Access to state was requested without source section active. State should be requested from within the @dlt.source and @dlt.resource decorated function." + msg = ( + "Access to state was requested without source section active. State should be requested" + " from within the @dlt.source and @dlt.resource decorated function." 
+ ) super().__init__(None, msg) diff --git a/dlt/common/git.py b/dlt/common/git.py index 602e889a36..c4f83a7398 100644 --- a/dlt/common/git.py +++ b/dlt/common/git.py @@ -15,6 +15,7 @@ else: Repo = Any + @contextmanager def git_custom_key_command(private_key: Optional[str]) -> Iterator[str]: if private_key: @@ -24,7 +25,9 @@ def git_custom_key_command(private_key: Optional[str]) -> Iterator[str]: try: # permissions so SSH does not complain os.chmod(key_file, 0o600) - yield 'ssh -o "StrictHostKeyChecking accept-new" -i "%s"' % key_file.replace("\\", "\\\\") + yield 'ssh -o "StrictHostKeyChecking accept-new" -i "%s"' % key_file.replace( + "\\", "\\\\" + ) finally: os.remove(key_file) else: @@ -46,6 +49,7 @@ def is_dirty(repo: Repo) -> bool: status: str = repo.git.status("--short") return len(status.strip()) > 0 + # def is_dirty(repo: Repo) -> bool: # # get branch status # status: str = repo.git.status("--short", "--branch") @@ -53,7 +57,9 @@ def is_dirty(repo: Repo) -> bool: # return len(status.splitlines()) > 1 -def ensure_remote_head(repo_path: str, branch: Optional[str] = None, with_git_command: Optional[str] = None) -> None: +def ensure_remote_head( + repo_path: str, branch: Optional[str] = None, with_git_command: Optional[str] = None +) -> None: from git import Repo, RepositoryDirtyError # update remotes and check if heads are same. ignores locally modified files @@ -70,7 +76,12 @@ def ensure_remote_head(repo_path: str, branch: Optional[str] = None, with_git_co raise RepositoryDirtyError(repo, status) -def clone_repo(repository_url: str, clone_path: str, branch: Optional[str] = None, with_git_command: Optional[str] = None) -> Repo: +def clone_repo( + repository_url: str, + clone_path: str, + branch: Optional[str] = None, + with_git_command: Optional[str] = None, +) -> Repo: from git import Repo repo = Repo.clone_from(repository_url, clone_path, env=dict(GIT_SSH_COMMAND=with_git_command)) @@ -79,7 +90,13 @@ def clone_repo(repository_url: str, clone_path: str, branch: Optional[str] = Non return repo -def force_clone_repo(repo_url: str, repo_storage: FileStorage, repo_name: str, branch: Optional[str] = None, with_git_command: Optional[str] = None) -> None: +def force_clone_repo( + repo_url: str, + repo_storage: FileStorage, + repo_name: str, + branch: Optional[str] = None, + with_git_command: Optional[str] = None, +) -> None: """Deletes the working directory repo_storage.root/repo_name and clones the `repo_url` into it. Will checkout `branch` if provided""" try: # delete repo folder @@ -89,7 +106,7 @@ def force_clone_repo(repo_url: str, repo_storage: FileStorage, repo_name: str, b repo_url, repo_storage.make_full_path(repo_name), branch=branch, - with_git_command=with_git_command + with_git_command=with_git_command, ).close() except Exception: # delete folder so we start clean next time @@ -98,7 +115,12 @@ def force_clone_repo(repo_url: str, repo_storage: FileStorage, repo_name: str, b raise -def get_fresh_repo_files(repo_location: str, working_dir: str = None, branch: Optional[str] = None, with_git_command: Optional[str] = None) -> FileStorage: +def get_fresh_repo_files( + repo_location: str, + working_dir: str = None, + branch: Optional[str] = None, + with_git_command: Optional[str] = None, +) -> FileStorage: """Returns a file storage leading to the newest repository files. 
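
A usage sketch for the git helpers in this hunk (the private key, repository URL and branch are placeholders; `git_custom_key_command` and `clone_repo` are used with the signatures shown above):

```python
import tempfile

from dlt.common.git import clone_repo, git_custom_key_command

# Placeholder deploy key; the context manager writes it to a temporary file with
# 0o600 permissions, yields a GIT_SSH_COMMAND string, and removes the file on exit.
private_key = "-----BEGIN OPENSSH PRIVATE KEY-----\n...\n-----END OPENSSH PRIVATE KEY-----"

with git_custom_key_command(private_key) as git_cmd:
    clone_path = tempfile.mkdtemp()
    repo = clone_repo(
        "git@github.com:example/sources.git",  # placeholder URL
        clone_path,
        branch="main",
        with_git_command=git_cmd,
    )
    repo.close()
```
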
If `repo_location` is url, file will be checked out into `working_dir/repo_name`""" from git import GitError @@ -113,7 +135,13 @@ def get_fresh_repo_files(repo_location: str, working_dir: str = None, branch: Op try: ensure_remote_head(repo_path, branch=branch, with_git_command=with_git_command) except GitError: - force_clone_repo(repo_location, FileStorage(working_dir, makedirs=True), repo_name, branch=branch, with_git_command=with_git_command) + force_clone_repo( + repo_location, + FileStorage(working_dir, makedirs=True), + repo_name, + branch=branch, + with_git_command=with_git_command, + ) return FileStorage(repo_path) diff --git a/dlt/common/json/__init__.py b/dlt/common/json/__init__.py index c4acf66c72..e9b52cc382 100644 --- a/dlt/common/json/__init__.py +++ b/dlt/common/json/__init__.py @@ -24,38 +24,29 @@ class SupportsJson(Protocol): _impl_name: str """Implementation name""" - def dump(self, obj: Any, fp: IO[bytes], sort_keys: bool = False, pretty:bool = False) -> None: - ... + def dump( + self, obj: Any, fp: IO[bytes], sort_keys: bool = False, pretty: bool = False + ) -> None: ... - def typed_dump(self, obj: Any, fp: IO[bytes], pretty:bool = False) -> None: - ... + def typed_dump(self, obj: Any, fp: IO[bytes], pretty: bool = False) -> None: ... - def typed_dumps(self, obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: - ... + def typed_dumps(self, obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: ... - def typed_loads(self, s: str) -> Any: - ... + def typed_loads(self, s: str) -> Any: ... - def typed_dumpb(self, obj: Any, sort_keys: bool = False, pretty: bool = False) -> bytes: - ... + def typed_dumpb(self, obj: Any, sort_keys: bool = False, pretty: bool = False) -> bytes: ... - def typed_loadb(self, s: Union[bytes, bytearray, memoryview]) -> Any: - ... + def typed_loadb(self, s: Union[bytes, bytearray, memoryview]) -> Any: ... - def dumps(self, obj: Any, sort_keys: bool = False, pretty:bool = False) -> str: - ... + def dumps(self, obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: ... - def dumpb(self, obj: Any, sort_keys: bool = False, pretty:bool = False) -> bytes: - ... + def dumpb(self, obj: Any, sort_keys: bool = False, pretty: bool = False) -> bytes: ... - def load(self, fp: Union[IO[bytes], IO[str]]) -> Any: - ... + def load(self, fp: Union[IO[bytes], IO[str]]) -> Any: ... - def loads(self, s: str) -> Any: - ... + def loads(self, s: str) -> Any: ... - def loadb(self, s: Union[bytes, bytearray, memoryview]) -> Any: - ... + def loadb(self, s: Union[bytes, bytearray, memoryview]) -> Any: ... 
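
To illustrate the `SupportsJson` protocol above and the PUA-based typed encoding defined in the rest of this module, a small round-trip sketch (assumes either the default `orjson` implementation or the `simplejson` fallback is installed):

```python
from decimal import Decimal

from dlt.common.json import json

doc = {"amount": Decimal("10.01"), "raw": b"\x01\x02"}

# typed_dumps tags non-JSON types with the PUA marker characters defined below;
# typed_loads decodes them back into the original Python types
s = json.typed_dumps(doc)
restored = json.typed_loads(s)
assert restored["amount"] == Decimal("10.01")

# plain dumps uses custom_encode instead, so Decimal and bytes become plain strings
print(json.dumps(doc))
print(json._impl_name)  # "orjson" or "simplejson", depending on what is installed
```
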
def custom_encode(obj: Any) -> str: @@ -74,10 +65,10 @@ def custom_encode(obj: Any) -> str: elif isinstance(obj, HexBytes): return obj.hex() elif isinstance(obj, bytes): - return base64.b64encode(obj).decode('ascii') - elif hasattr(obj, 'asdict'): + return base64.b64encode(obj).decode("ascii") + elif hasattr(obj, "asdict"): return obj.asdict() # type: ignore - elif hasattr(obj, '_asdict'): + elif hasattr(obj, "_asdict"): return obj._asdict() # type: ignore elif PydanticBaseModel and isinstance(obj, PydanticBaseModel): return obj.dict() # type: ignore[return-value] @@ -89,23 +80,24 @@ def custom_encode(obj: Any) -> str: # use PUA range to encode additional types -_DECIMAL = '\uF026' -_DATETIME = '\uF027' -_DATE = '\uF028' -_UUIDT = '\uF029' -_HEXBYTES = '\uF02A' -_B64BYTES = '\uF02B' -_WEI = '\uF02C' -_TIME = '\uF02D' +_DECIMAL = "\uf026" +_DATETIME = "\uf027" +_DATE = "\uf028" +_UUIDT = "\uf029" +_HEXBYTES = "\uf02a" +_B64BYTES = "\uf02b" +_WEI = "\uf02c" +_TIME = "\uf02d" def _datetime_decoder(obj: str) -> datetime: - if obj.endswith('Z'): + if obj.endswith("Z"): # Backwards compatibility for data encoded with previous dlt version # fromisoformat does not support Z suffix (until py3.11) - obj = obj[:-1] + '+00:00' + obj = obj[:-1] + "+00:00" return pendulum.DateTime.fromisoformat(obj) # type: ignore[attr-defined, no-any-return] + # define decoder for each prefix DECODERS: List[Callable[[Any], Any]] = [ Decimal, @@ -139,10 +131,10 @@ def custom_pua_encode(obj: Any) -> str: elif isinstance(obj, HexBytes): return _HEXBYTES + obj.hex() elif isinstance(obj, bytes): - return _B64BYTES + base64.b64encode(obj).decode('ascii') - elif hasattr(obj, 'asdict'): + return _B64BYTES + base64.b64encode(obj).decode("ascii") + elif hasattr(obj, "asdict"): return obj.asdict() # type: ignore - elif hasattr(obj, '_asdict'): + elif hasattr(obj, "_asdict"): return obj._asdict() # type: ignore elif dataclasses.is_dataclass(obj): return dataclasses.asdict(obj) # type: ignore @@ -158,7 +150,7 @@ def custom_pua_decode(obj: Any) -> Any: if isinstance(obj, str) and len(obj) > 1: c = ord(obj[0]) - 0xF026 # decode only the PUA space defined in DECODERS - if c >=0 and c <= PUA_CHARACTER_MAX: + if c >= 0 and c <= PUA_CHARACTER_MAX: return DECODERS[c](obj[1:]) return obj @@ -176,27 +168,30 @@ def custom_pua_remove(obj: Any) -> Any: if isinstance(obj, str) and len(obj) > 1: c = ord(obj[0]) - 0xF026 # decode only the PUA space defined in DECODERS - if c >=0 and c <= PUA_CHARACTER_MAX: + if c >= 0 and c <= PUA_CHARACTER_MAX: return obj[1:] return obj def may_have_pua(line: bytes) -> bool: """Checks if bytes string contains pua marker""" - return b'\xef\x80' in line + return b"\xef\x80" in line # pick the right impl json: SupportsJson = None if os.environ.get("DLT_USE_JSON") == "simplejson": from dlt.common.json import _simplejson as _json_d + json = _json_d # type: ignore[assignment] else: try: from dlt.common.json import _orjson as _json_or + json = _json_or # type: ignore[assignment] except ImportError: from dlt.common.json import _simplejson as _json_simple + json = _json_simple # type: ignore[assignment] @@ -207,5 +202,5 @@ def may_have_pua(line: bytes) -> bool: "custom_pua_decode", "custom_pua_decode_nested", "custom_pua_remove", - "SupportsJson" + "SupportsJson", ] diff --git a/dlt/common/json/_orjson.py b/dlt/common/json/_orjson.py index ada91cbb1b..d2d960e6ce 100644 --- a/dlt/common/json/_orjson.py +++ b/dlt/common/json/_orjson.py @@ -7,7 +7,9 @@ _impl_name = "orjson" -def _dumps(obj: Any, sort_keys: bool, 
pretty:bool, default:AnyFun = custom_encode, options: int = 0) -> bytes: +def _dumps( + obj: Any, sort_keys: bool, pretty: bool, default: AnyFun = custom_encode, options: int = 0 +) -> bytes: options = options | orjson.OPT_UTC_Z | orjson.OPT_NON_STR_KEYS if pretty: options |= orjson.OPT_INDENT_2 @@ -16,11 +18,11 @@ def _dumps(obj: Any, sort_keys: bool, pretty:bool, default:AnyFun = custom_encod return orjson.dumps(obj, default=default, option=options) -def dump(obj: Any, fp: IO[bytes], sort_keys: bool = False, pretty:bool = False) -> None: +def dump(obj: Any, fp: IO[bytes], sort_keys: bool = False, pretty: bool = False) -> None: fp.write(_dumps(obj, sort_keys, pretty)) -def typed_dump(obj: Any, fp: IO[bytes], pretty:bool = False) -> None: +def typed_dump(obj: Any, fp: IO[bytes], pretty: bool = False) -> None: fp.write(typed_dumpb(obj, pretty=pretty)) @@ -29,7 +31,7 @@ def typed_dumpb(obj: Any, sort_keys: bool = False, pretty: bool = False) -> byte def typed_dumps(obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: - return typed_dumpb(obj, sort_keys, pretty).decode('utf-8') + return typed_dumpb(obj, sort_keys, pretty).decode("utf-8") def typed_loads(s: str) -> Any: @@ -40,11 +42,11 @@ def typed_loadb(s: Union[bytes, bytearray, memoryview]) -> Any: return custom_pua_decode_nested(loadb(s)) -def dumps(obj: Any, sort_keys: bool = False, pretty:bool = False) -> str: +def dumps(obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: return _dumps(obj, sort_keys, pretty).decode("utf-8") -def dumpb(obj: Any, sort_keys: bool = False, pretty:bool = False) -> bytes: +def dumpb(obj: Any, sort_keys: bool = False, pretty: bool = False) -> bytes: return _dumps(obj, sort_keys, pretty) diff --git a/dlt/common/json/_simplejson.py b/dlt/common/json/_simplejson.py index c670717527..10ee17e2f6 100644 --- a/dlt/common/json/_simplejson.py +++ b/dlt/common/json/_simplejson.py @@ -15,7 +15,7 @@ _impl_name = "simplejson" -def dump(obj: Any, fp: IO[bytes], sort_keys: bool = False, pretty:bool = False) -> None: +def dump(obj: Any, fp: IO[bytes], sort_keys: bool = False, pretty: bool = False) -> None: if pretty: indent = 2 else: @@ -28,13 +28,13 @@ def dump(obj: Any, fp: IO[bytes], sort_keys: bool = False, pretty:bool = False) default=custom_encode, encoding=None, ensure_ascii=False, - separators=(',', ':'), + separators=(",", ":"), sort_keys=sort_keys, - indent=indent + indent=indent, ) -def typed_dump(obj: Any, fp: IO[bytes], pretty:bool = False) -> None: +def typed_dump(obj: Any, fp: IO[bytes], pretty: bool = False) -> None: if pretty: indent = 2 else: @@ -47,10 +47,11 @@ def typed_dump(obj: Any, fp: IO[bytes], pretty:bool = False) -> None: default=custom_pua_encode, encoding=None, ensure_ascii=False, - separators=(',', ':'), - indent=indent + separators=(",", ":"), + indent=indent, ) + def typed_dumps(obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: indent = 2 if pretty else None return simplejson.dumps( @@ -59,8 +60,8 @@ def typed_dumps(obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: default=custom_pua_encode, encoding=None, ensure_ascii=False, - separators=(',', ':'), - indent=indent + separators=(",", ":"), + indent=indent, ) @@ -69,14 +70,14 @@ def typed_loads(s: str) -> Any: def typed_dumpb(obj: Any, sort_keys: bool = False, pretty: bool = False) -> bytes: - return typed_dumps(obj, sort_keys, pretty).encode('utf-8') + return typed_dumps(obj, sort_keys, pretty).encode("utf-8") def typed_loadb(s: Union[bytes, bytearray, memoryview]) -> Any: return 
custom_pua_decode_nested(loadb(s)) -def dumps(obj: Any, sort_keys: bool = False, pretty:bool = False) -> str: +def dumps(obj: Any, sort_keys: bool = False, pretty: bool = False) -> str: if pretty: indent = 2 else: @@ -87,13 +88,13 @@ def dumps(obj: Any, sort_keys: bool = False, pretty:bool = False) -> str: default=custom_encode, encoding=None, ensure_ascii=False, - separators=(',', ':'), + separators=(",", ":"), sort_keys=sort_keys, - indent=indent + indent=indent, ) -def dumpb(obj: Any, sort_keys: bool = False, pretty:bool = False) -> bytes: +def dumpb(obj: Any, sort_keys: bool = False, pretty: bool = False) -> bytes: return dumps(obj, sort_keys, pretty).encode("utf-8") diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index 585bee0d2f..cb6d8b371b 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -14,13 +14,17 @@ import pyarrow import pyarrow.parquet except ModuleNotFoundError: - raise MissingDependencyException("DLT parquet Helpers", [f"{version.DLT_PKG_NAME}[parquet]"], "DLT Helpers for for parquet.") + raise MissingDependencyException( + "DLT parquet Helpers", [f"{version.DLT_PKG_NAME}[parquet]"], "DLT Helpers for for parquet." + ) TAnyArrowItem = Union[pyarrow.Table, pyarrow.RecordBatch] -def get_py_arrow_datatype(column: TColumnType, caps: DestinationCapabilitiesContext, tz: str) -> Any: +def get_py_arrow_datatype( + column: TColumnType, caps: DestinationCapabilitiesContext, tz: str +) -> Any: column_type = column["data_type"] if column_type == "text": return pyarrow.string() @@ -39,7 +43,11 @@ def get_py_arrow_datatype(column: TColumnType, caps: DestinationCapabilitiesCont return pyarrow.string() elif column_type == "decimal": precision, scale = column.get("precision"), column.get("scale") - precision_tuple = (precision, scale) if precision is not None and scale is not None else caps.decimal_precision + precision_tuple = ( + (precision, scale) + if precision is not None and scale is not None + else caps.decimal_precision + ) return get_py_arrow_numeric(precision_tuple) elif column_type == "wei": return get_py_arrow_numeric(caps.wei_precision) @@ -93,8 +101,7 @@ def get_pyarrow_int(precision: Optional[int]) -> Any: def _get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: - """Returns (data_type, precision, scale) tuple from pyarrow.DataType - """ + """Returns (data_type, precision, scale) tuple from pyarrow.DataType""" if pyarrow.types.is_string(dtype) or pyarrow.types.is_large_string(dtype): return dict(data_type="text") elif pyarrow.types.is_floating(dtype): @@ -126,7 +133,7 @@ def _get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: return dict(data_type="time", precision=precision) elif pyarrow.types.is_integer(dtype): result: TColumnType = dict(data_type="bigint") - if dtype.bit_width != 64: # 64bit is a default bigint + if dtype.bit_width != 64: # 64bit is a default bigint result["precision"] = dtype.bit_width return result elif pyarrow.types.is_fixed_size_binary(dtype): @@ -143,7 +150,9 @@ def _get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: def remove_null_columns(item: TAnyArrowItem) -> TAnyArrowItem: """Remove all columns of datatype pyarrow.null() from the table or record batch""" - return remove_columns(item, [field.name for field in item.schema if pyarrow.types.is_null(field.type)]) + return remove_columns( + item, [field.name for field in item.schema if pyarrow.types.is_null(field.type)] + ) def remove_columns(item: TAnyArrowItem, columns: Sequence[str]) -> 
TAnyArrowItem: @@ -166,7 +175,9 @@ def append_column(item: TAnyArrowItem, name: str, data: Any) -> TAnyArrowItem: return item.append_column(name, data) elif isinstance(item, pyarrow.RecordBatch): new_field = pyarrow.field(name, data.type) - return pyarrow.RecordBatch.from_arrays(item.columns + [data], schema=item.schema.append(new_field)) + return pyarrow.RecordBatch.from_arrays( + item.columns + [data], schema=item.schema.append(new_field) + ) else: raise ValueError(item) @@ -181,7 +192,9 @@ def rename_columns(item: TAnyArrowItem, new_column_names: Sequence[str]) -> TAny if isinstance(item, pyarrow.Table): return item.rename_columns(new_column_names) elif isinstance(item, pyarrow.RecordBatch): - new_fields = [field.with_name(new_name) for new_name, field in zip(new_column_names, item.schema)] + new_fields = [ + field.with_name(new_name) for new_name, field in zip(new_column_names, item.schema) + ] return pyarrow.RecordBatch.from_arrays(item.columns, schema=pyarrow.schema(new_fields)) else: raise TypeError(f"Unsupported data item type {type(item)}") @@ -191,13 +204,13 @@ def normalize_py_arrow_schema( item: TAnyArrowItem, columns: TTableSchemaColumns, naming: NamingConvention, - caps: DestinationCapabilitiesContext + caps: DestinationCapabilitiesContext, ) -> TAnyArrowItem: """Normalize arrow `item` schema according to the `columns`. - 1. arrow schema field names will be normalized according to `naming` - 2. arrows columns will be reordered according to `columns` - 3. empty columns will be inserted if they are missing, types will be generated using `caps` + 1. arrow schema field names will be normalized according to `naming` + 2. arrows columns will be reordered according to `columns` + 3. empty columns will be inserted if they are missing, types will be generated using `caps` """ rename_mapping = get_normalized_arrow_fields_mapping(item, naming) rev_mapping = {v: k for k, v in rename_mapping.items()} @@ -205,12 +218,16 @@ def normalize_py_arrow_schema( # remove all columns that are dlt columns but are not present in arrow schema. we do not want to add such columns # that should happen in the normalizer - columns = {name:column for name, column in columns.items() if not name.startswith(dlt_table_prefix) or name in rev_mapping} + columns = { + name: column + for name, column in columns.items() + if not name.startswith(dlt_table_prefix) or name in rev_mapping + } # check if nothing to rename if list(rename_mapping.keys()) == list(rename_mapping.values()): # check if nothing to reorder - if list(rename_mapping.keys())[:len(columns)]== list(columns.keys()): + if list(rename_mapping.keys())[: len(columns)] == list(columns.keys()): return item schema = item.schema @@ -228,10 +245,10 @@ def normalize_py_arrow_schema( else: # column does not exist in pyarrow. 
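
A small usage sketch for the pyarrow helpers in this hunk (illustrative; requires the `pyarrow` extra, and the table contents are made up):

```python
import pyarrow as pa

from dlt.common.libs.pyarrow import remove_null_columns, rename_columns

tbl = pa.table({"Name": ["a", "b"], "Empty": pa.nulls(2)})

tbl = remove_null_columns(tbl)       # drops the pyarrow.null() typed column
tbl = rename_columns(tbl, ["name"])  # renames the remaining columns positionally
print(tbl.column_names)              # -> ['name']
```
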
create empty field and column new_field = pyarrow.field( - column_name, - get_py_arrow_datatype(column, caps, "UTC"), - nullable=column.get("nullable", True) - ) + column_name, + get_py_arrow_datatype(column, caps, "UTC"), + nullable=column.get("nullable", True), + ) new_fields.append(new_field) new_columns.append(pyarrow.nulls(item.num_rows, type=new_field.type)) @@ -253,7 +270,10 @@ def get_normalized_arrow_fields_mapping(item: TAnyArrowItem, naming: NamingConve # verify if names uniquely normalize normalized_names = set(name_mapping.values()) if len(name_mapping) != len(normalized_names): - raise NameNormalizationClash(f"Arrow schema fields normalized from {list(name_mapping.keys())} to {list(normalized_names)}") + raise NameNormalizationClash( + f"Arrow schema fields normalized from {list(name_mapping.keys())} to" + f" {list(normalized_names)}" + ) return name_mapping @@ -292,9 +312,11 @@ def get_row_count(parquet_file: TFileOrPath) -> int: def is_arrow_item(item: Any) -> bool: return isinstance(item, (pyarrow.Table, pyarrow.RecordBatch)) + TNewColumns = Sequence[Tuple[int, pyarrow.Field, Callable[[pyarrow.Table], Iterable[Any]]]] """Sequence of tuples: (field index, field, generating function)""" + def pq_stream_with_new_columns( parquet_file: TFileOrPath, columns: TNewColumns, row_groups_per_read: int = 1 ) -> Iterator[pyarrow.Table]: @@ -315,7 +337,9 @@ def pq_stream_with_new_columns( n_groups = reader.num_row_groups # Iterate through n row groups at a time for i in range(0, n_groups, row_groups_per_read): - tbl: pyarrow.Table = reader.read_row_groups(range(i, min(i + row_groups_per_read, n_groups))) + tbl: pyarrow.Table = reader.read_row_groups( + range(i, min(i + row_groups_per_read, n_groups)) + ) for idx, field, gen_ in columns: if idx == -1: tbl = tbl.append_column(field, gen_(tbl)) diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index 1b65fa3a7e..e77d4a3c9f 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -1,25 +1,50 @@ from __future__ import annotations import inspect from copy import copy -from typing import Dict, Generic, Set, TypedDict, List, Type, Union, TypeVar, get_origin, get_args, Any +from typing import ( + Dict, + Generic, + Set, + TypedDict, + List, + Type, + Union, + TypeVar, + get_origin, + get_args, + Any, +) from dlt.common.exceptions import MissingDependencyException from dlt.common.schema import DataValidationError from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns from dlt.common.data_types import py_type_to_sc_type -from dlt.common.typing import TDataItem, TDataItems, extract_union_types, is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union +from dlt.common.typing import ( + TDataItem, + TDataItems, + extract_union_types, + is_optional_type, + extract_inner_type, + is_list_generic_type, + is_dict_generic_type, + is_union, +) try: from pydantic import BaseModel, ValidationError, Json, create_model except ImportError: - raise MissingDependencyException("dlt Pydantic helpers", ["pydantic"], "Both Pydantic 1.x and 2.x are supported") + raise MissingDependencyException( + "dlt Pydantic helpers", ["pydantic"], "Both Pydantic 1.x and 2.x are supported" + ) _PYDANTIC_2 = False try: from pydantic import PydanticDeprecatedSince20 + _PYDANTIC_2 = True # hide deprecation warning import warnings + warnings.simplefilter("ignore", category=PydanticDeprecatedSince20) except ImportError: pass @@ -40,11 +65,14 @@ class DltConfig(TypedDict, total=False): >>> 
nested: Dict[str, Any] >>> dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} """ + skip_complex_types: bool """If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from dlt schema generated from the model""" -def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]]) -> TTableSchemaColumns: +def pydantic_to_table_schema_columns( + model: Union[BaseModel, Type[BaseModel]] +) -> TTableSchemaColumns: """Convert a pydantic model to a table schema columns dict See also DltConfig for more control over how the schema is created @@ -64,7 +92,7 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]]) - for field_name, field in model.__fields__.items(): # type: ignore[union-attr] annotation = field.annotation - if inner_annotation := getattr(annotation, 'inner_type', None): + if inner_annotation := getattr(annotation, "inner_type", None): # This applies to pydantic.Json fields, the inner type is the type after json parsing # (In pydantic 2 the outer annotation is the final type) annotation = inner_annotation @@ -93,7 +121,7 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]]) - # try to coerce unknown type to text data_type = "text" - if data_type == 'complex' and skip_complex_types: + if data_type == "complex" and skip_complex_types: continue result[name] = { @@ -134,20 +162,23 @@ def get_extra_from_model(model: Type[BaseModel]) -> str: def apply_schema_contract_to_model( model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMode, - data_mode: TSchemaEvolutionMode = "freeze" + data_mode: TSchemaEvolutionMode = "freeze", ) -> Type[_TPydanticModel]: """Configures or re-creates `model` so it behaves according to `column_mode` and `data_mode` settings. - `column_mode` sets the model behavior when unknown field is found. - `data_mode` sets model behavior when known field does not validate. currently `evolve` and `freeze` are supported here. + `column_mode` sets the model behavior when unknown field is found. + `data_mode` sets model behavior when known field does not validate. currently `evolve` and `freeze` are supported here. - `discard_row` is implemented in `validate_item`. + `discard_row` is implemented in `validate_item`. """ if data_mode == "evolve": # create a lenient model that accepts any data - model = create_model(model.__name__ + "Any", **{n:(Any, None) for n in model.__fields__}) # type: ignore[call-overload, attr-defined] + model = create_model(model.__name__ + "Any", **{n: (Any, None) for n in model.__fields__}) # type: ignore[call-overload, attr-defined] elif data_mode == "discard_value": - raise NotImplementedError("data_mode is discard_value. Cannot discard defined fields with validation errors using Pydantic models.") + raise NotImplementedError( + "data_mode is discard_value. Cannot discard defined fields with validation errors using" + " Pydantic models." 
+ ) extra = column_mode_to_extra(column_mode) @@ -165,7 +196,7 @@ def apply_schema_contract_to_model( _child_models: Dict[int, Type[BaseModel]] = {} def _process_annotation(t_: Type[Any]) -> Type[Any]: - """Recursively recreates models with applied schema contract """ + """Recursively recreates models with applied schema contract""" if is_list_generic_type(t_): l_t: Type[Any] = get_args(t_)[0] try: @@ -190,14 +221,16 @@ def _process_annotation(t_: Type[Any]) -> Type[Any]: if id(t_) in _child_models: return _child_models[id(t_)] else: - _child_models[id(t_)] = child_model = apply_schema_contract_to_model(t_, column_mode, data_mode) + _child_models[id(t_)] = child_model = apply_schema_contract_to_model( + t_, column_mode, data_mode + ) return child_model return t_ new_model: Type[_TPydanticModel] = create_model( # type: ignore[call-overload] model.__name__ + "Extra" + extra.title(), - __config__ = config, - **{n:(_process_annotation(f.annotation), f) for n, f in model.__fields__.items()} # type: ignore[attr-defined] + __config__=config, + **{n: (_process_annotation(f.annotation), f) for n, f in model.__fields__.items()}, # type: ignore[attr-defined] ) # pass dlt config along dlt_config = getattr(model, "dlt_config", None) @@ -206,16 +239,17 @@ def _process_annotation(t_: Type[Any]) -> Type[Any]: return new_model -def create_list_model(model: Type[_TPydanticModel], data_mode: TSchemaEvolutionMode = "freeze") -> Type[ListModel[_TPydanticModel]]: +def create_list_model( + model: Type[_TPydanticModel], data_mode: TSchemaEvolutionMode = "freeze" +) -> Type[ListModel[_TPydanticModel]]: """Creates a model from `model` for validating list of items in batch according to `data_mode` - Currently only freeze is supported. See comments in the code + Currently only freeze is supported. See comments in the code """ # TODO: use LenientList to create list model that automatically discards invalid items # https://github.com/pydantic/pydantic/issues/2274 and https://gist.github.com/dmontagu/7f0cef76e5e0e04198dd608ad7219573 return create_model( - "List" + __name__, - items=(List[model], ...) # type: ignore[return-value,valid-type] + "List" + __name__, items=(List[model], ...) # type: ignore[return-value,valid-type] ) @@ -224,11 +258,11 @@ def validate_items( list_model: Type[ListModel[_TPydanticModel]], items: List[TDataItem], column_mode: TSchemaEvolutionMode, - data_mode: TSchemaEvolutionMode + data_mode: TSchemaEvolutionMode, ) -> List[_TPydanticModel]: """Validates list of `item` with `list_model` and returns parsed Pydantic models - `list_model` should be created with `create_list_model` and have `items` field which this function returns. + `list_model` should be created with `create_list_model` and have `items` field which this function returns. 
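
A usage sketch for the list validation helpers above (the `User` model and the items are made up; it relies on the `discard_row` behavior implemented in the code that follows):

```python
from typing import Optional

from pydantic import BaseModel

from dlt.common.libs.pydantic import create_list_model, validate_items

class User(BaseModel):
    id: int
    name: Optional[str] = None

UserList = create_list_model(User)
items = [{"id": 1, "name": "ok"}, {"id": "not-an-int", "name": "bad"}]

# with both modes set to "discard_row" the invalid item is dropped and
# validation is retried on the remaining items
valid = validate_items("users", UserList, items, "discard_row", "discard_row")
print([u.id for u in valid])  # -> [1]
```
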
""" try: return list_model(items=items).items @@ -241,29 +275,60 @@ def validate_items( if err_idx in deleted: # already dropped continue - err_item = items[err_idx - len(deleted)] + err_item = items[err_idx - len(deleted)] else: # top level error which means misalignment of list model and items - raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, items) from e + raise DataValidationError( + None, + table_name, + str(err["loc"]), + "columns", + "freeze", + list_model, + {"columns": "freeze"}, + items, + ) from e # raise on freeze - if err["type"] == 'extra_forbidden': + if err["type"] == "extra_forbidden": if column_mode == "freeze": - raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, err_item) from e + raise DataValidationError( + None, + table_name, + str(err["loc"]), + "columns", + "freeze", + list_model, + {"columns": "freeze"}, + err_item, + ) from e elif column_mode == "discard_row": # pop at the right index items.pop(err_idx - len(deleted)) # store original index so we do not pop again deleted.add(err_idx) else: - raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + raise NotImplementedError( + f"{column_mode} column mode not implemented for Pydantic validation" + ) else: if data_mode == "freeze": - raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", list_model, {"data_type": "freeze"}, err_item) from e + raise DataValidationError( + None, + table_name, + str(err["loc"]), + "data_type", + "freeze", + list_model, + {"data_type": "freeze"}, + err_item, + ) from e elif data_mode == "discard_row": items.pop(err_idx - len(deleted)) deleted.add(err_idx) else: - raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + raise NotImplementedError( + f"{column_mode} column mode not implemented for Pydantic validation" + ) # validate again with error items removed return validate_items(table_name, list_model, items, column_mode, data_mode) @@ -274,7 +339,7 @@ def validate_item( model: Type[_TPydanticModel], item: TDataItems, column_mode: TSchemaEvolutionMode, - data_mode: TSchemaEvolutionMode + data_mode: TSchemaEvolutionMode, ) -> _TPydanticModel: """Validates `item` against model `model` and returns an instance of it""" try: @@ -282,16 +347,38 @@ def validate_item( except ValidationError as e: for err in e.errors(): # raise on freeze - if err["type"] == 'extra_forbidden': + if err["type"] == "extra_forbidden": if column_mode == "freeze": - raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", model, {"columns": "freeze"}, item) from e + raise DataValidationError( + None, + table_name, + str(err["loc"]), + "columns", + "freeze", + model, + {"columns": "freeze"}, + item, + ) from e elif column_mode == "discard_row": return None - raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + raise NotImplementedError( + f"{column_mode} column mode not implemented for Pydantic validation" + ) else: if data_mode == "freeze": - raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", model, {"data_type": "freeze"}, item) from e + raise DataValidationError( + None, + table_name, + str(err["loc"]), + "data_type", + "freeze", + model, + {"data_type": "freeze"}, + item, + ) from e elif data_mode == "discard_row": return None - raise NotImplementedError(f"{data_mode} 
data mode not implemented for Pydantic validation") - raise AssertionError("unreachable") \ No newline at end of file + raise NotImplementedError( + f"{data_mode} data mode not implemented for Pydantic validation" + ) + raise AssertionError("unreachable") diff --git a/dlt/common/normalizers/__init__.py b/dlt/common/normalizers/__init__.py index e106419df9..2ff41d4c12 100644 --- a/dlt/common/normalizers/__init__.py +++ b/dlt/common/normalizers/__init__.py @@ -4,6 +4,8 @@ __all__ = [ "NormalizersConfiguration", - "TJSONNormalizer", "TNormalizersConfig", - "explicit_normalizers", "import_normalizers" + "TJSONNormalizer", + "TNormalizersConfig", + "explicit_normalizers", + "import_normalizers", ] diff --git a/dlt/common/normalizers/configuration.py b/dlt/common/normalizers/configuration.py index 2c13367abd..6957417f9d 100644 --- a/dlt/common/normalizers/configuration.py +++ b/dlt/common/normalizers/configuration.py @@ -24,5 +24,5 @@ def on_resolved(self) -> None: self.naming = self.destination_capabilities.naming_convention if TYPE_CHECKING: - def __init__(self, naming: str = None, json_normalizer: TJSONNormalizer = None) -> None: - ... + + def __init__(self, naming: str = None, json_normalizer: TJSONNormalizer = None) -> None: ... diff --git a/dlt/common/normalizers/exceptions.py b/dlt/common/normalizers/exceptions.py index b8ad4baed3..248aecc7fe 100644 --- a/dlt/common/normalizers/exceptions.py +++ b/dlt/common/normalizers/exceptions.py @@ -9,4 +9,7 @@ class InvalidJsonNormalizer(NormalizerException): def __init__(self, required_normalizer: str, present_normalizer: str) -> None: self.required_normalizer = required_normalizer self.present_normalizer = present_normalizer - super().__init__(f"Operation requires {required_normalizer} normalizer while {present_normalizer} normalizer is present") + super().__init__( + f"Operation requires {required_normalizer} normalizer while" + f" {present_normalizer} normalizer is present" + ) diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index ab133b36c9..a13bab15f4 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -2,6 +2,7 @@ from typing import Any, Generic, Type, Generator, Tuple, Protocol, TYPE_CHECKING, TypeVar from dlt.common.typing import DictStrAny, TDataItem, StrAny + if TYPE_CHECKING: from dlt.common.schema import Schema else: @@ -15,14 +16,16 @@ # type var for data item normalizer config TNormalizerConfig = TypeVar("TNormalizerConfig", bound=Any) -class DataItemNormalizer(abc.ABC, Generic[TNormalizerConfig]): +class DataItemNormalizer(abc.ABC, Generic[TNormalizerConfig]): @abc.abstractmethod def __init__(self, schema: Schema) -> None: pass @abc.abstractmethod - def normalize_data_item(self, item: TDataItem, load_id: str, table_name: str) -> TNormalizedRowIterator: + def normalize_data_item( + self, item: TDataItem, load_id: str, table_name: str + ) -> TNormalizedRowIterator: pass @abc.abstractmethod diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index c9ce5a9d25..e33bf2ab35 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -9,11 +9,16 @@ from dlt.common.schema.typing import TColumnSchema, TColumnName, TSimpleRegex from dlt.common.schema.utils import column_name_validator from dlt.common.utils import digest128, update_dict_nested -from dlt.common.normalizers.json import TNormalizedRowIterator, wrap_in_dict, DataItemNormalizer as 
DataItemNormalizerBase +from dlt.common.normalizers.json import ( + TNormalizedRowIterator, + wrap_in_dict, + DataItemNormalizer as DataItemNormalizerBase, +) from dlt.common.validation import validate_dict EMPTY_KEY_IDENTIFIER = "_empty" # replace empty keys with this + class TDataItemRow(TypedDict, total=False): _dlt_id: str # unique id of current row @@ -64,7 +69,7 @@ def _reset(self) -> None: # for those paths the complex nested objects should be left in place def _is_complex_type(self, table_name: str, field_name: str, _r_lvl: int) -> bool: # turn everything at the recursion level into complex type - max_nesting = self.max_nesting + max_nesting = self.max_nesting schema = self.schema assert _r_lvl <= max_nesting @@ -83,14 +88,9 @@ def _is_complex_type(self, table_name: str, field_name: str, _r_lvl: int) -> boo data_type = column["data_type"] return data_type == "complex" - def _flatten( - self, - table: str, - dict_row: TDataItemRow, - _r_lvl: int + self, table: str, dict_row: TDataItemRow, _r_lvl: int ) -> Tuple[TDataItemRow, Dict[Tuple[str, ...], Sequence[Any]]]: - out_rec_row: DictStrAny = {} out_rec_list: Dict[Tuple[str, ...], Sequence[Any]] = {} schema_naming = self.schema.naming @@ -104,7 +104,9 @@ def norm_row_dicts(dict_row: StrAny, __r_lvl: int, path: Tuple[str, ...] = ()) - norm_k = EMPTY_KEY_IDENTIFIER # if norm_k != k: # print(f"{k} -> {norm_k}") - child_name = norm_k if path == () else schema_naming.shorten_fragments(*path, norm_k) + child_name = ( + norm_k if path == () else schema_naming.shorten_fragments(*path, norm_k) + ) # for lists and dicts we must check if type is possibly complex if isinstance(v, (dict, list)): if not self._is_complex_type(table, child_name, __r_lvl): @@ -131,7 +133,6 @@ def _get_child_row_hash(parent_row_id: str, child_table: str, list_idx: int) -> # and all child tables must be lists return digest128(f"{parent_row_id}_{child_table}_{list_idx}", DLT_ID_LENGTH_BYTES) - @staticmethod def _link_row(row: TDataItemRowChild, parent_row_id: str, list_idx: int) -> TDataItemRowChild: assert parent_row_id @@ -144,7 +145,9 @@ def _link_row(row: TDataItemRowChild, parent_row_id: str, list_idx: int) -> TDat def _extend_row(extend: DictStrAny, row: TDataItemRow) -> None: row.update(extend) # type: ignore - def _add_row_id(self, table: str, row: TDataItemRow, parent_row_id: str, pos: int, _r_lvl: int) -> str: + def _add_row_id( + self, table: str, row: TDataItemRow, parent_row_id: str, pos: int, _r_lvl: int + ) -> str: # row_id is always random, no matter if primary_key is present or not row_id = generate_dlt_id() if _r_lvl > 0: @@ -183,19 +186,22 @@ def _normalize_list( ident_path: Tuple[str, ...], parent_path: Tuple[str, ...], parent_row_id: Optional[str] = None, - _r_lvl: int = 0 + _r_lvl: int = 0, ) -> TNormalizedRowIterator: - v: TDataItemRowChild = None table = self.schema.naming.shorten_fragments(*parent_path, *ident_path) for idx, v in enumerate(seq): # yield child table row if isinstance(v, dict): - yield from self._normalize_row(v, extend, ident_path, parent_path, parent_row_id, idx, _r_lvl) + yield from self._normalize_row( + v, extend, ident_path, parent_path, parent_row_id, idx, _r_lvl + ) elif isinstance(v, list): # to normalize lists of lists, we must create a tracking intermediary table by creating a mock row - yield from self._normalize_row({"list": v}, extend, ident_path, parent_path, parent_row_id, idx, _r_lvl + 1) + yield from self._normalize_row( + {"list": v}, extend, ident_path, parent_path, parent_row_id, idx, _r_lvl + 1 + ) else: # 
list of simple types child_row_hash = DataItemNormalizer._get_child_row_hash(parent_row_id, table, idx) @@ -213,9 +219,8 @@ def _normalize_row( parent_path: Tuple[str, ...] = (), parent_row_id: Optional[str] = None, pos: Optional[int] = None, - _r_lvl: int = 0 + _r_lvl: int = 0, ) -> TNormalizedRowIterator: - schema = self.schema table = schema.naming.shorten_fragments(*parent_path, *ident_path) @@ -229,7 +234,7 @@ def _normalize_row( row_id = self._add_row_id(table, flattened_row, parent_row_id, pos, _r_lvl) # find fields to propagate to child tables in config - extend.update(self._get_propagated_values(table, flattened_row, _r_lvl )) + extend.update(self._get_propagated_values(table, flattened_row, _r_lvl)) # yield parent table first should_descend = yield (table, schema.naming.shorten_fragments(*parent_path)), flattened_row @@ -238,11 +243,15 @@ def _normalize_row( # normalize and yield lists for list_path, list_content in lists.items(): - yield from self._normalize_list(list_content, extend, list_path, parent_path + ident_path, row_id, _r_lvl + 1) + yield from self._normalize_list( + list_content, extend, list_path, parent_path + ident_path, row_id, _r_lvl + 1 + ) def extend_schema(self) -> None: # validate config - config = cast(RelationalNormalizerConfig, self.schema._normalizers_config["json"].get("config") or {}) + config = cast( + RelationalNormalizerConfig, self.schema._normalizers_config["json"].get("config") or {} + ) DataItemNormalizer._validate_normalizer_config(self.schema, config) # quick check to see if hints are applied @@ -253,12 +262,15 @@ def extend_schema(self) -> None: self.schema.merge_hints( { "not_null": [ - TSimpleRegex("_dlt_id"), TSimpleRegex("_dlt_root_id"), TSimpleRegex("_dlt_parent_id"), - TSimpleRegex("_dlt_list_idx"), TSimpleRegex("_dlt_load_id") - ], + TSimpleRegex("_dlt_id"), + TSimpleRegex("_dlt_root_id"), + TSimpleRegex("_dlt_parent_id"), + TSimpleRegex("_dlt_list_idx"), + TSimpleRegex("_dlt_load_id"), + ], "foreign_key": [TSimpleRegex("_dlt_parent_id")], "root_key": [TSimpleRegex("_dlt_root_id")], - "unique": [TSimpleRegex("_dlt_id")] + "unique": [TSimpleRegex("_dlt_id")], } ) @@ -269,14 +281,14 @@ def extend_table(self, table_name: str) -> None: # if the table has a merge w_d, add propagation info to normalizer table = self.schema.tables.get(table_name) if not table.get("parent") and table.get("write_disposition") == "merge": - DataItemNormalizer.update_normalizer_config(self.schema, {"propagation": { - "tables": { - table_name: { - "_dlt_id": TColumnName("_dlt_root_id") - } - }}}) - - def normalize_data_item(self, item: TDataItem, load_id: str, table_name: str) -> TNormalizedRowIterator: + DataItemNormalizer.update_normalizer_config( + self.schema, + {"propagation": {"tables": {table_name: {"_dlt_id": TColumnName("_dlt_root_id")}}}}, + ) + + def normalize_data_item( + self, item: TDataItem, load_id: str, table_name: str + ) -> TNormalizedRowIterator: # wrap items that are not dictionaries in dictionary, otherwise they cannot be processed by the JSON normalizer if not isinstance(item, dict): item = wrap_in_dict(item) @@ -284,7 +296,11 @@ def normalize_data_item(self, item: TDataItem, load_id: str, table_name: str) -> row = cast(TDataItemRowRoot, item) # identify load id if loaded data must be processed after loading incrementally row["_dlt_load_id"] = load_id - yield from self._normalize_row(cast(TDataItemRowChild, row), {}, (self.schema.naming.normalize_table_identifier(table_name),)) + yield from self._normalize_row( + cast(TDataItemRowChild, row), 
+ {}, + (self.schema.naming.normalize_table_identifier(table_name),), + ) @classmethod def ensure_this_normalizer(cls, norm_config: TJSONNormalizer) -> None: @@ -311,4 +327,9 @@ def get_normalizer_config(cls, schema: Schema) -> RelationalNormalizerConfig: @staticmethod def _validate_normalizer_config(schema: Schema, config: RelationalNormalizerConfig) -> None: - validate_dict(RelationalNormalizerConfig, config, "./normalizers/json/config", validator_f=column_name_validator(schema.naming)) + validate_dict( + RelationalNormalizerConfig, + config, + "./normalizers/json/config", + validator_f=column_name_validator(schema.naming), + ) diff --git a/dlt/common/normalizers/naming/__init__.py b/dlt/common/normalizers/naming/__init__.py index c8c08ddd63..967fb9643e 100644 --- a/dlt/common/normalizers/naming/__init__.py +++ b/dlt/common/normalizers/naming/__init__.py @@ -1,6 +1,3 @@ from .naming import SupportsNamingConvention, NamingConvention -__all__ = [ - 'SupportsNamingConvention', "NamingConvention" -] - +__all__ = ["SupportsNamingConvention", "NamingConvention"] diff --git a/dlt/common/normalizers/naming/direct.py b/dlt/common/normalizers/naming/direct.py index 3a973106fe..09403d9e53 100644 --- a/dlt/common/normalizers/naming/direct.py +++ b/dlt/common/normalizers/naming/direct.py @@ -17,4 +17,4 @@ def make_path(self, *identifiers: Any) -> str: return self.PATH_SEPARATOR.join(filter(lambda x: x.strip(), identifiers)) def break_path(self, path: str) -> Sequence[str]: - return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] \ No newline at end of file + return [ident for ident in path.split(self.PATH_SEPARATOR) if ident.strip()] diff --git a/dlt/common/normalizers/naming/duck_case.py b/dlt/common/normalizers/naming/duck_case.py index 200c0bbdad..063482a799 100644 --- a/dlt/common/normalizers/naming/duck_case.py +++ b/dlt/common/normalizers/naming/duck_case.py @@ -5,8 +5,7 @@ class NamingConvention(SnakeCaseNamingConvention): - - _CLEANUP_TABLE = str.maketrans("\n\r\"", "___") + _CLEANUP_TABLE = str.maketrans('\n\r"', "___") _RE_LEADING_DIGITS = None # do not remove leading digits @staticmethod @@ -18,7 +17,5 @@ def _normalize_identifier(identifier: str, max_length: int) -> str: # shorten identifier return NamingConvention.shorten_identifier( - NamingConvention._RE_UNDERSCORES.sub("_", normalized_ident), - identifier, - max_length + NamingConvention._RE_UNDERSCORES.sub("_", normalized_ident), identifier, max_length ) diff --git a/dlt/common/normalizers/naming/exceptions.py b/dlt/common/normalizers/naming/exceptions.py index b76362962e..572fc7e0d0 100644 --- a/dlt/common/normalizers/naming/exceptions.py +++ b/dlt/common/normalizers/naming/exceptions.py @@ -1,4 +1,3 @@ - from dlt.common.exceptions import DltException @@ -19,5 +18,8 @@ def __init__(self, naming_module: str) -> None: class InvalidNamingModule(NormalizersException): def __init__(self, naming_module: str) -> None: self.naming_module = naming_module - msg = f"Naming module {naming_module} does not implement required SupportsNamingConvention protocol" + msg = ( + f"Naming module {naming_module} does not implement required SupportsNamingConvention" + " protocol" + ) super().__init__(msg) diff --git a/dlt/common/normalizers/naming/naming.py b/dlt/common/normalizers/naming/naming.py index 80130bace6..fccb147981 100644 --- a/dlt/common/normalizers/naming/naming.py +++ b/dlt/common/normalizers/naming/naming.py @@ -7,7 +7,6 @@ class NamingConvention(ABC): - _TR_TABLE = bytes.maketrans(b"/+", b"ab") _DEFAULT_COLLISION_PROB 
= 0.001 @@ -46,7 +45,9 @@ def normalize_path(self, path: str) -> str: def normalize_tables_path(self, path: str) -> str: """Breaks path of table identifiers, normalizes components, reconstitutes and shortens the path""" - normalized_idents = [self.normalize_table_identifier(ident) for ident in self.break_path(path)] + normalized_idents = [ + self.normalize_table_identifier(ident) for ident in self.break_path(path) + ] # shorten the whole path return self.shorten_identifier(self.make_path(*normalized_idents), path, self.max_length) @@ -59,7 +60,12 @@ def shorten_fragments(self, *normalized_idents: str) -> str: @staticmethod @lru_cache(maxsize=None) - def shorten_identifier(normalized_ident: str, identifier: str, max_length: int, collision_prob: float = _DEFAULT_COLLISION_PROB) -> str: + def shorten_identifier( + normalized_ident: str, + identifier: str, + max_length: int, + collision_prob: float = _DEFAULT_COLLISION_PROB, + ) -> str: """Shortens the `name` to `max_length` and adds a tag to it to make it unique. Tag may be placed in the middle or at the end""" if max_length and len(normalized_ident) > max_length: # use original identifier to compute tag @@ -72,9 +78,14 @@ def shorten_identifier(normalized_ident: str, identifier: str, max_length: int, def _compute_tag(identifier: str, collision_prob: float) -> str: # assume that shake_128 has perfect collision resistance 2^N/2 then collision prob is 1/resistance: prob = 1/2^N/2, solving for prob # take into account that we are case insensitive in base64 so we need ~1.5x more bits (2+1) - tl_bytes = int(((2+1)*math.log2(1/(collision_prob)) // 8) + 1) - tag = base64.b64encode(hashlib.shake_128(identifier.encode("utf-8")).digest(tl_bytes) - ).rstrip(b"=").translate(NamingConvention._TR_TABLE).lower().decode("ascii") + tl_bytes = int(((2 + 1) * math.log2(1 / (collision_prob)) // 8) + 1) + tag = ( + base64.b64encode(hashlib.shake_128(identifier.encode("utf-8")).digest(tl_bytes)) + .rstrip(b"=") + .translate(NamingConvention._TR_TABLE) + .lower() + .decode("ascii") + ) return tag @staticmethod @@ -82,7 +93,11 @@ def _trim_and_tag(identifier: str, tag: str, max_length: int) -> str: assert len(tag) <= max_length remaining_length = max_length - len(tag) remaining_overflow = remaining_length % 2 - identifier = identifier[:remaining_length // 2 + remaining_overflow] + tag + identifier[len(identifier) - remaining_length // 2:] + identifier = ( + identifier[: remaining_length // 2 + remaining_overflow] + + tag + + identifier[len(identifier) - remaining_length // 2 :] + ) assert len(identifier) == max_length return identifier diff --git a/dlt/common/normalizers/naming/snake_case.py b/dlt/common/normalizers/naming/snake_case.py index 12aa887d6e..b3c65e9b8d 100644 --- a/dlt/common/normalizers/naming/snake_case.py +++ b/dlt/common/normalizers/naming/snake_case.py @@ -6,7 +6,6 @@ class NamingConvention(BaseNamingConvention): - _RE_UNDERSCORES = re.compile("__+") _RE_LEADING_DIGITS = re.compile(r"^\d+") # _RE_ENDING_UNDERSCORES = re.compile(r"_+$") @@ -41,16 +40,14 @@ def _normalize_identifier(identifier: str, max_length: int) -> str: # shorten identifier return NamingConvention.shorten_identifier( - NamingConvention._to_snake_case(normalized_ident), - identifier, - max_length + NamingConvention._to_snake_case(normalized_ident), identifier, max_length ) @classmethod def _to_snake_case(cls, identifier: str) -> str: # then convert to snake case - identifier = cls._SNAKE_CASE_BREAK_1.sub(r'\1_\2', identifier) - identifier = cls._SNAKE_CASE_BREAK_2.sub(r'\1_\2', 
identifier).lower() + identifier = cls._SNAKE_CASE_BREAK_1.sub(r"\1_\2", identifier) + identifier = cls._SNAKE_CASE_BREAK_2.sub(r"\1_\2", identifier).lower() # leading digits will be prefixed (if regex is defined) if cls._RE_LEADING_DIGITS and cls._RE_LEADING_DIGITS.match(identifier): @@ -63,4 +60,4 @@ def _to_snake_case(cls, identifier: str) -> str: # identifier = cls._RE_ENDING_UNDERSCORES.sub("x", identifier) # replace consecutive underscores with single one to prevent name clashes with PATH_SEPARATOR - return cls._RE_UNDERSCORES.sub("_", stripped_ident) \ No newline at end of file + return cls._RE_UNDERSCORES.sub("_", stripped_ident) diff --git a/dlt/common/normalizers/typing.py b/dlt/common/normalizers/typing.py index 93920fda1b..599426259f 100644 --- a/dlt/common/normalizers/typing.py +++ b/dlt/common/normalizers/typing.py @@ -11,4 +11,4 @@ class TJSONNormalizer(TypedDict, total=False): class TNormalizersConfig(TypedDict, total=False): names: str detections: Optional[List[str]] - json: TJSONNormalizer \ No newline at end of file + json: TJSONNormalizer diff --git a/dlt/common/normalizers/utils.py b/dlt/common/normalizers/utils.py index aabaa39017..dde78edede 100644 --- a/dlt/common/normalizers/utils.py +++ b/dlt/common/normalizers/utils.py @@ -14,10 +14,10 @@ DEFAULT_NAMING_MODULE = "dlt.common.normalizers.naming.snake_case" DLT_ID_LENGTH_BYTES = 10 + @with_config(spec=NormalizersConfiguration) def explicit_normalizers( - naming: str = dlt.config.value , - json_normalizer: TJSONNormalizer = dlt.config.value + naming: str = dlt.config.value, json_normalizer: TJSONNormalizer = dlt.config.value ) -> TNormalizersConfig: """Gets explicitly configured normalizers - via config or destination caps. May return None as naming or normalizer""" return {"names": naming, "json": json_normalizer} @@ -26,15 +26,17 @@ def explicit_normalizers( @with_config def import_normalizers( normalizers_config: TNormalizersConfig, - destination_capabilities: DestinationCapabilitiesContext = None + destination_capabilities: DestinationCapabilitiesContext = None, ) -> Tuple[TNormalizersConfig, NamingConvention, Type[DataItemNormalizer[Any]]]: """Imports the normalizers specified in `normalizers_config` or taken from defaults. Returns the updated config and imported modules. - `destination_capabilities` are used to get max length of the identifier. + `destination_capabilities` are used to get max length of the identifier. """ # add defaults to normalizer_config normalizers_config["names"] = names = normalizers_config["names"] or "snake_case" - normalizers_config["json"] = item_normalizer = normalizers_config["json"] or {"module": "dlt.common.normalizers.json.relational"} + normalizers_config["json"] = item_normalizer = normalizers_config["json"] or { + "module": "dlt.common.normalizers.json.relational" + } try: if "." in names: # TODO: bump schema engine version and migrate schema. 
also change the name in TNormalizersConfig from names to naming @@ -44,19 +46,28 @@ def import_normalizers( naming_module = cast(SupportsNamingConvention, import_module(names)) else: # from known location - naming_module = cast(SupportsNamingConvention, import_module(f"dlt.common.normalizers.naming.{names}")) + naming_module = cast( + SupportsNamingConvention, import_module(f"dlt.common.normalizers.naming.{names}") + ) except ImportError: raise UnknownNamingModule(names) if not hasattr(naming_module, "NamingConvention"): raise InvalidNamingModule(names) # get max identifier length if destination_capabilities: - max_length = min(destination_capabilities.max_identifier_length, destination_capabilities.max_column_identifier_length) + max_length = min( + destination_capabilities.max_identifier_length, + destination_capabilities.max_column_identifier_length, + ) else: max_length = None json_module = cast(SupportsDataItemNormalizer, import_module(item_normalizer["module"])) - return normalizers_config, naming_module.NamingConvention(max_length), json_module.DataItemNormalizer + return ( + normalizers_config, + naming_module.NamingConvention(max_length), + json_module.DataItemNormalizer, + ) def generate_dlt_ids(n_ids: int) -> List[str]: diff --git a/dlt/common/pendulum.py b/dlt/common/pendulum.py index 3d1c784488..fdf003531b 100644 --- a/dlt/common/pendulum.py +++ b/dlt/common/pendulum.py @@ -2,7 +2,7 @@ import pendulum # noqa: I251 # force UTC as the local timezone to prevent local dates to be written to dbs -pendulum.set_local_timezone(pendulum.timezone('UTC')) +pendulum.set_local_timezone(pendulum.timezone("UTC")) # type: ignore def __utcnow() -> pendulum.DateTime: diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index 973abb2451..fc632003c1 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -2,7 +2,20 @@ import datetime # noqa: 251 import humanize import contextlib -from typing import Any, Callable, ClassVar, Dict, List, NamedTuple, Optional, Protocol, Sequence, TYPE_CHECKING, Tuple, TypedDict +from typing import ( + Any, + Callable, + ClassVar, + Dict, + List, + NamedTuple, + Optional, + Protocol, + Sequence, + TYPE_CHECKING, + Tuple, + TypedDict, +) from typing_extensions import NotRequired from dlt.common import pendulum, logger @@ -15,7 +28,12 @@ from dlt.common.configuration.paths import get_dlt_data_dir from dlt.common.configuration.specs import RunConfiguration from dlt.common.destination import Destination, TDestinationReferenceArg, TDestination -from dlt.common.exceptions import DestinationHasFailedJobs, PipelineStateNotAvailable, ResourceNameNotAvailable, SourceSectionNotAvailable +from dlt.common.exceptions import ( + DestinationHasFailedJobs, + PipelineStateNotAvailable, + ResourceNameNotAvailable, + SourceSectionNotAvailable, +) from dlt.common.schema import Schema from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name @@ -72,6 +90,7 @@ def __str__(self) -> str: class LoadInfo(NamedTuple): """A tuple holding the information on recently loaded packages. 
Returned by pipeline `run` and `load` methods""" + pipeline: "SupportsPipeline" destination_name: str destination_displayable_credentials: str @@ -89,9 +108,7 @@ class LoadInfo(NamedTuple): def asdict(self) -> DictStrAny: """A dictionary representation of LoadInfo that can be loaded with `dlt`""" d = self._asdict() - d["pipeline"] = { - "pipeline_name": self.pipeline.pipeline_name - } + d["pipeline"] = {"pipeline_name": self.pipeline.pipeline_name} d["load_packages"] = [package.asdict() for package in self.load_packages] return d @@ -102,11 +119,20 @@ def asstr(self, verbosity: int = 0) -> str: msg += humanize.precisedelta(elapsed) else: msg += "---" - msg += f"\n{len(self.loads_ids)} load package(s) were loaded to destination {self.destination_name} and into dataset {self.dataset_name}\n" + msg += ( + f"\n{len(self.loads_ids)} load package(s) were loaded to destination" + f" {self.destination_name} and into dataset {self.dataset_name}\n" + ) if self.staging_name: - msg += f"The {self.staging_name} staging destination used {self.staging_displayable_credentials} location to stage data\n" - - msg += f"The {self.destination_name} destination used {self.destination_displayable_credentials} location to store data" + msg += ( + f"The {self.staging_name} staging destination used" + f" {self.staging_displayable_credentials} location to stage data\n" + ) + + msg += ( + f"The {self.destination_name} destination used" + f" {self.destination_displayable_credentials} location to store data" + ) for load_package in self.load_packages: cstr = load_package.state.upper() if load_package.completed_at else "NOT COMPLETED" # now enumerate all complete loads if we have any failed packages @@ -116,7 +142,9 @@ def asstr(self, verbosity: int = 0) -> str: msg += f"\nLoad package {load_package.load_id} is {cstr} and contains {jobs_str}" if verbosity > 0: for failed_job in failed_jobs: - msg += f"\n\t[{failed_job.job_file_info.job_id()}]: {failed_job.failed_message}\n" + msg += ( + f"\n\t[{failed_job.job_file_info.job_id()}]: {failed_job.failed_message}\n" + ) if verbosity > 1: msg += "\nPackage details:\n" msg += load_package.asstr() + "\n" @@ -135,11 +163,14 @@ def raise_on_failed_jobs(self) -> None: for load_package in self.load_packages: failed_jobs = load_package.jobs["failed_jobs"] if len(failed_jobs): - raise DestinationHasFailedJobs(self.destination_name, load_package.load_id, failed_jobs) + raise DestinationHasFailedJobs( + self.destination_name, load_package.load_id, failed_jobs + ) def __str__(self) -> str: return self.asstr(verbosity=1) + class TPipelineLocalState(TypedDict, total=False): first_run: bool """Indicates a first run of the pipeline, where run ends with successful loading of data""" @@ -149,6 +180,7 @@ class TPipelineLocalState(TypedDict, total=False): class TPipelineState(TypedDict, total=False): """Schema for a pipeline state that is stored within the pipeline working directory""" + pipeline_name: str dataset_name: str default_schema_name: Optional[str] @@ -173,6 +205,7 @@ class TSourceState(TPipelineState): class SupportsPipeline(Protocol): """A protocol with core pipeline operations that lets high level abstractions ie. sources to access pipeline methods and properties""" + pipeline_name: str """Name of the pipeline""" default_schema_name: str @@ -214,8 +247,7 @@ def run( schema: Schema = None, loader_file_format: TLoaderFileFormat = None, schema_contract: TSchemaContract = None, - ) -> LoadInfo: - ... + ) -> LoadInfo: ... 
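For context on the LoadInfo helpers reformatted above (`asdict`, `asstr`, `raise_on_failed_jobs`), a minimal usage sketch follows; the pipeline name, the duckdb destination and the sample rows are illustrative assumptions, not values introduced by this patch:

    import dlt

    # assumes the optional duckdb destination extra is installed
    pipeline = dlt.pipeline(
        pipeline_name="demo", destination="duckdb", dataset_name="demo_data"
    )
    load_info = pipeline.run([{"id": 1}, {"id": 2}], table_name="items")

    print(load_info.asstr(verbosity=1))    # human-readable summary built by asstr()
    print(load_info.asdict()["pipeline"])  # {"pipeline_name": "demo"}
    load_info.raise_on_failed_jobs()       # raises DestinationHasFailedJobs if any job failed
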
def _set_context(self, is_active: bool) -> None: """Called when pipeline context activated or deactivate""" @@ -235,9 +267,8 @@ def __call__( write_disposition: TWriteDisposition = None, columns: Sequence[TColumnSchema] = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None - ) -> LoadInfo: - ... + loader_file_format: TLoaderFileFormat = None, + ) -> LoadInfo: ... @configspec @@ -282,17 +313,19 @@ class StateInjectableContext(ContainerInjectableContext): can_create_default: ClassVar[bool] = False if TYPE_CHECKING: - def __init__(self, state: TPipelineState = None) -> None: - ... + + def __init__(self, state: TPipelineState = None) -> None: ... -def pipeline_state(container: Container, initial_default: TPipelineState = None) -> Tuple[TPipelineState, bool]: +def pipeline_state( + container: Container, initial_default: TPipelineState = None +) -> Tuple[TPipelineState, bool]: """Gets value of the state from context or active pipeline, if none found returns `initial_default` - Injected state is called "writable": it is injected by the `Pipeline` class and all the changes will be persisted. - The state coming from pipeline context or `initial_default` is called "read only" and all the changes to it will be discarded + Injected state is called "writable": it is injected by the `Pipeline` class and all the changes will be persisted. + The state coming from pipeline context or `initial_default` is called "read only" and all the changes to it will be discarded - Returns tuple (state, writable) + Returns tuple (state, writable) """ try: # get injected state if present. injected state is typically "managed" so changes will be persisted @@ -364,7 +397,9 @@ def source_state() -> DictStrAny: _last_full_state: TPipelineState = None -def _delete_source_state_keys(key: TAnyJsonPath, source_state_: Optional[DictStrAny] = None, /) -> None: +def _delete_source_state_keys( + key: TAnyJsonPath, source_state_: Optional[DictStrAny] = None, / +) -> None: """Remove one or more key from the source state. The `key` can be any number of keys and/or json paths to be removed. """ @@ -372,7 +407,9 @@ def _delete_source_state_keys(key: TAnyJsonPath, source_state_: Optional[DictStr delete_matches(key, state_) -def resource_state(resource_name: str = None, source_state_: Optional[DictStrAny] = None, /) -> DictStrAny: +def resource_state( + resource_name: str = None, source_state_: Optional[DictStrAny] = None, / +) -> DictStrAny: """Returns a dictionary with the resource-scoped state. Resource-scoped state is visible only to resource requesting the access. Dlt state is preserved across pipeline runs and may be used to implement incremental loads. Note that this function accepts the resource name as optional argument. 
There are rare cases when `dlt` is not able to resolve resource name due to requesting function @@ -422,7 +459,7 @@ def resource_state(resource_name: str = None, source_state_: Optional[DictStrAny resource_name = get_current_pipe_name() if not resource_name: raise ResourceNameNotAvailable() - return state_.setdefault('resources', {}).setdefault(resource_name, {}) # type: ignore + return state_.setdefault("resources", {}).setdefault(resource_name, {}) # type: ignore def reset_resource_state(resource_name: str, source_state_: Optional[DictStrAny] = None, /) -> None: @@ -437,7 +474,9 @@ def reset_resource_state(resource_name: str, source_state_: Optional[DictStrAny] state_["resources"].pop(resource_name) -def _get_matching_resources(pattern: REPattern, source_state_: Optional[DictStrAny] = None, /) -> List[str]: +def _get_matching_resources( + pattern: REPattern, source_state_: Optional[DictStrAny] = None, / +) -> List[str]: """Get all resource names in state matching the regex pattern""" state_ = source_state() if source_state_ is None else source_state_ if "resources" not in state_: @@ -446,10 +485,10 @@ def _get_matching_resources(pattern: REPattern, source_state_: Optional[DictStrA def get_dlt_pipelines_dir() -> str: - """ Gets default directory where pipelines' data will be stored - 1. in user home directory ~/.dlt/pipelines/ - 2. if current user is root in /var/dlt/pipelines - 3. if current user does not have a home directory in /tmp/dlt/pipelines + """Gets default directory where pipelines' data will be stored + 1. in user home directory ~/.dlt/pipelines/ + 2. if current user is root in /var/dlt/pipelines + 3. if current user does not have a home directory in /tmp/dlt/pipelines """ return os.path.join(get_dlt_data_dir(), "pipelines") diff --git a/dlt/common/reflection/function_visitor.py b/dlt/common/reflection/function_visitor.py index 3b89403745..6cb6016a7f 100644 --- a/dlt/common/reflection/function_visitor.py +++ b/dlt/common/reflection/function_visitor.py @@ -2,6 +2,7 @@ from ast import NodeVisitor from typing import Any + class FunctionVisitor(NodeVisitor): def __init__(self, source: str): self.source = source diff --git a/dlt/common/reflection/spec.py b/dlt/common/reflection/spec.py index 58a75fb53e..0a486088c8 100644 --- a/dlt/common/reflection/spec.py +++ b/dlt/common/reflection/spec.py @@ -15,7 +15,9 @@ def _get_spec_name_from_f(f: AnyFun) -> str: - func_name = get_callable_name(f, "__qualname__").replace(".", "") # func qual name contains position in the module, separated by dots + func_name = get_callable_name(f, "__qualname__").replace( + ".", "" + ) # func qual name contains position in the module, separated by dots def _first_up(s: str) -> str: return s[0].upper() + s[1:] @@ -23,7 +25,9 @@ def _first_up(s: str) -> str: return "".join(map(_first_up, _SLEEPING_CAT_SPLIT.findall(func_name))) + "Configuration" -def spec_from_signature(f: AnyFun, sig: Signature, include_defaults: bool = True) -> Type[BaseConfiguration]: +def spec_from_signature( + f: AnyFun, sig: Signature, include_defaults: bool = True +) -> Type[BaseConfiguration]: name = _get_spec_name_from_f(f) module = inspect.getmodule(f) @@ -60,7 +64,10 @@ def dlt_config_literal_to_type(arg_name: str) -> AnyType: for p in sig.parameters.values(): # skip *args and **kwargs, skip typical method params - if p.kind not in (Parameter.VAR_KEYWORD, Parameter.VAR_POSITIONAL) and p.name not in ["self", "cls"]: + if p.kind not in (Parameter.VAR_KEYWORD, Parameter.VAR_POSITIONAL) and p.name not in [ + "self", + "cls", + ]: 
field_type = AnyType if p.annotation == Parameter.empty else p.annotation # only valid hints and parameters with defaults are eligible if is_valid_hint(field_type) and p.default != Parameter.empty: diff --git a/dlt/common/reflection/utils.py b/dlt/common/reflection/utils.py index c9c1ad92ed..9bd3cb6775 100644 --- a/dlt/common/reflection/utils.py +++ b/dlt/common/reflection/utils.py @@ -68,12 +68,16 @@ def creates_func_def_name_node(func_def: ast.FunctionDef, source_lines: Sequence """Recreate function name as a ast.Name with known source code location""" func_name = ast.Name(func_def.name) func_name.lineno = func_name.end_lineno = func_def.lineno - func_name.col_offset = source_lines[func_name.lineno - 1].index(func_def.name) # find where function name starts + func_name.col_offset = source_lines[func_name.lineno - 1].index( + func_def.name + ) # find where function name starts func_name.end_col_offset = func_name.col_offset + len(func_def.name) return func_name -def rewrite_python_script(source_script_lines: List[str], transformed_nodes: List[Tuple[ast.AST, ast.AST]]) -> List[str]: +def rewrite_python_script( + source_script_lines: List[str], transformed_nodes: List[Tuple[ast.AST, ast.AST]] +) -> List[str]: """Replaces all the nodes present in `transformed_nodes` in the `script_lines`. The `transformed_nodes` is a tuple where the first element is must be a node with full location information created out of `script_lines`""" script_lines: List[str] = [] @@ -87,12 +91,12 @@ def rewrite_python_script(source_script_lines: List[str], transformed_nodes: Lis if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) # add all new lines from previous line to current - script_lines.extend(source_script_lines[last_line+1:node.lineno-1]) + script_lines.extend(source_script_lines[last_line + 1 : node.lineno - 1]) # add trailing characters until node in current line starts - script_lines.append(source_script_lines[node.lineno-1][:node.col_offset]) + script_lines.append(source_script_lines[node.lineno - 1][: node.col_offset]) elif last_offset >= 0: # no line change, add the characters from the end of previous node to the current - script_lines.append(source_script_lines[last_line][last_offset:node.col_offset]) + script_lines.append(source_script_lines[last_line][last_offset : node.col_offset]) # replace node value script_lines.append(astunparse.unparse(t_value).strip()) @@ -102,7 +106,7 @@ def rewrite_python_script(source_script_lines: List[str], transformed_nodes: Lis # add all that was missing if last_offset >= 0: script_lines.append(source_script_lines[last_line][last_offset:]) - script_lines.extend(source_script_lines[last_line+1:]) + script_lines.extend(source_script_lines[last_line + 1 :]) return script_lines diff --git a/dlt/common/runners/__init__.py b/dlt/common/runners/__init__.py index 9af668ce87..2c5916eaec 100644 --- a/dlt/common/runners/__init__.py +++ b/dlt/common/runners/__init__.py @@ -5,8 +5,12 @@ __all__ = [ - "run_pool", "NullExecutor", - "Runnable", "workermethod", "TExecutor", + "run_pool", + "NullExecutor", + "Runnable", + "workermethod", + "TExecutor", "TRunMetrics", - "Venv", "VenvNotFound" + "Venv", + "VenvNotFound", ] diff --git a/dlt/common/runners/configuration.py b/dlt/common/runners/configuration.py index 3231f83807..6953c72cf1 100644 --- a/dlt/common/runners/configuration.py +++ b/dlt/common/runners/configuration.py @@ -13,9 +13,5 @@ class PoolRunnerConfiguration(BaseConfiguration): run_sleep: float = 0.1 # how long to sleep between runs with 
workload, seconds if TYPE_CHECKING: - def __init__( - self, - pool_type: TPoolType = None, - workers: int = None - ) -> None: - ... + + def __init__(self, pool_type: TPoolType = None, workers: int = None) -> None: ... diff --git a/dlt/common/runners/pool_runner.py b/dlt/common/runners/pool_runner.py index 7b83d68e0f..31a809dc9c 100644 --- a/dlt/common/runners/pool_runner.py +++ b/dlt/common/runners/pool_runner.py @@ -43,12 +43,11 @@ def create_pool(config: PoolRunnerConfiguration) -> Executor: max_workers=config.workers, initializer=init.initialize_runtime, initargs=(init._RUN_CONFIGURATION,), - mp_context=multiprocessing.get_context() - ) + mp_context=multiprocessing.get_context(), + ) else: return ProcessPoolExecutor( - max_workers=config.workers, - mp_context=multiprocessing.get_context() + max_workers=config.workers, mp_context=multiprocessing.get_context() ) elif config.pool_type == "thread": return ThreadPoolExecutor(max_workers=config.workers) @@ -56,10 +55,15 @@ def create_pool(config: PoolRunnerConfiguration) -> Executor: return NullExecutor() -def run_pool(config: PoolRunnerConfiguration, run_f: Union[Runnable[TExecutor], Callable[[TExecutor], TRunMetrics]]) -> int: +def run_pool( + config: PoolRunnerConfiguration, + run_f: Union[Runnable[TExecutor], Callable[[TExecutor], TRunMetrics]], +) -> int: # validate the run function if not isinstance(run_f, Runnable) and not callable(run_f): - raise ValueError(run_f, "Pool runner entry point must be a function f(pool: TPool) or Runnable") + raise ValueError( + run_f, "Pool runner entry point must be a function f(pool: TPool) or Runnable" + ) # start pool pool = create_pool(config) diff --git a/dlt/common/runners/runnable.py b/dlt/common/runners/runnable.py index c2d9989bb5..fe795fe73e 100644 --- a/dlt/common/runners/runnable.py +++ b/dlt/common/runners/runnable.py @@ -20,7 +20,9 @@ class Runnable(ABC, Generic[TExecutor]): # use weak reference container, once other references are dropped the referenced object is garbage collected RUNNING: TWeakValueDictionary = WeakValueDictionary({}) - def __new__(cls: Type["Runnable[TExecutor]"], *args: Any, **kwargs: Any) -> "Runnable[TExecutor]": + def __new__( + cls: Type["Runnable[TExecutor]"], *args: Any, **kwargs: Any + ) -> "Runnable[TExecutor]": """Registers Runnable instance as running for a time when context is active. Used with `~workermethod` decorator to pass a class instance to decorator function that must be static thus avoiding pickling such instance. 
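To make the executor selection in `create_pool` above concrete, a small sketch; the "thread" pool type and the worker count are illustrative choices, not part of this patch:

    from dlt.common.runners.configuration import PoolRunnerConfiguration
    from dlt.common.runners.pool_runner import create_pool

    # "thread" maps to ThreadPoolExecutor(max_workers=workers), "process" to a
    # ProcessPoolExecutor, and any other pool type falls back to NullExecutor
    # (see the branches in create_pool above)
    config = PoolRunnerConfiguration(pool_type="thread", workers=4)
    pool = create_pool(config)
    try:
        print(type(pool).__name__)  # ThreadPoolExecutor
    finally:
        pool.shutdown()
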
@@ -50,6 +52,7 @@ def workermethod(f: TFun) -> TFun: Returns: TFun: wrapped worker function """ + @wraps(f) def _wrap(rid: Union[int, Runnable[TExecutor]], *args: Any, **kwargs: Any) -> Any: if isinstance(rid, int): @@ -95,4 +98,3 @@ def _wrap(rid: Union[int, Runnable[TExecutor]], *args: Any, **kwargs: Any) -> An # return f(config, *args, **kwargs) # return _wrap # type: ignore - diff --git a/dlt/common/runners/stdout.py b/dlt/common/runners/stdout.py index a9f4ab1438..8ddfb45ee4 100644 --- a/dlt/common/runners/stdout.py +++ b/dlt/common/runners/stdout.py @@ -26,14 +26,16 @@ def exec_to_stdout(f: AnyFun) -> Iterator[Any]: def iter_stdout(venv: Venv, command: str, *script_args: Any) -> Iterator[str]: # start a process in virtual environment, assume that text comes from stdout - with venv.start_command(command, *script_args, stdout=PIPE, stderr=PIPE, bufsize=1, text=True) as process: + with venv.start_command( + command, *script_args, stdout=PIPE, stderr=PIPE, bufsize=1, text=True + ) as process: exit_code: int = None line = "" stderr: List[str] = [] def _r_stderr() -> None: nonlocal stderr - for line in iter(process.stderr.readline, ''): + for line in iter(process.stderr.readline, ""): stderr.append(line) # read stderr with a thread, selectors do not work on windows @@ -41,7 +43,7 @@ def _r_stderr() -> None: t.start() # read stdout with - for line in iter(process.stdout.readline, ''): + for line in iter(process.stdout.readline, ""): if line.endswith("\n"): yield line[:-1] else: @@ -57,9 +59,11 @@ def _r_stderr() -> None: raise CalledProcessError(exit_code, command, output=line, stderr="".join(stderr)) -def iter_stdout_with_result(venv: Venv, command: str, *script_args: Any) -> Generator[str, None, Any]: +def iter_stdout_with_result( + venv: Venv, command: str, *script_args: Any +) -> Generator[str, None, Any]: """Yields stdout lines coming from remote process and returns the last result decoded with decode_obj. 
In case of exit code != 0 if exception is decoded - it will be raised, otherwise CalledProcessError is raised""" + it will be raised, otherwise CalledProcessError is raised""" last_result: Any = None try: for line in iter_stdout(venv, command, *script_args): diff --git a/dlt/common/runners/synth_pickle.py b/dlt/common/runners/synth_pickle.py index 420e89a74a..ba0c87f28d 100644 --- a/dlt/common/runners/synth_pickle.py +++ b/dlt/common/runners/synth_pickle.py @@ -15,6 +15,7 @@ def __init__(*args: Any, **kwargs: Any) -> None: class SynthesizingUnpickler(pickle.Unpickler): """Unpickler that synthesizes missing types instead of raising""" + def find_class(self, module: str, name: str) -> Any: if module not in sys.modules: module_obj = sys.modules[__name__] @@ -24,7 +25,7 @@ def find_class(self, module: str, name: str) -> Any: return getattr(module_obj, name) except Exception: # synthesize type - t = type(name, (MissingUnpickledType, ), {"__module__": module}) + t = type(name, (MissingUnpickledType,), {"__module__": module}) setattr(module_obj, name, t) return t diff --git a/dlt/common/runners/venv.py b/dlt/common/runners/venv.py index d81e7384b4..9a92b30326 100644 --- a/dlt/common/runners/venv.py +++ b/dlt/common/runners/venv.py @@ -19,7 +19,7 @@ def post_setup(self, context: types.SimpleNamespace) -> None: self.context = context -class Venv(): +class Venv: """Creates and wraps the Python Virtual Environment to allow for code execution""" def __init__(self, context: types.SimpleNamespace, current: bool = False) -> None: @@ -59,6 +59,7 @@ def restore_current(cls) -> "Venv": venv = cls.restore(os.environ["VIRTUAL_ENV"], current=True) except KeyError: import sys + # do not set bin path because it is not known context = types.SimpleNamespace(bin_path="", env_exe=sys.executable) venv = cls(context, current=True) @@ -69,7 +70,9 @@ def __enter__(self) -> "Venv": raise NotImplementedError("Context manager does not work with current venv") return self - def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType) -> None: + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType + ) -> None: self.delete_environment() def delete_environment(self) -> None: @@ -80,7 +83,9 @@ def delete_environment(self) -> None: if self.context.env_dir and os.path.isdir(self.context.env_dir): shutil.rmtree(self.context.env_dir) - def start_command(self, entry_point: str, *script_args: Any, **popen_kwargs: Any) -> "subprocess.Popen[str]": + def start_command( + self, entry_point: str, *script_args: Any, **popen_kwargs: Any + ) -> "subprocess.Popen[str]": command = os.path.join(self.context.bin_path, entry_point) cmd = [command, *script_args] return subprocess.Popen(cmd, **popen_kwargs) @@ -120,7 +125,6 @@ def _install_deps(context: types.SimpleNamespace, dependencies: List[str]) -> No except subprocess.CalledProcessError as exc: raise CannotInstallDependencies(dependencies, context.env_exe, exc.output) - @staticmethod def is_virtual_env() -> bool: """Checks if we are running in virtual environment""" diff --git a/dlt/common/runtime/collector.py b/dlt/common/runtime/collector.py index 5e7143241e..eec379564c 100644 --- a/dlt/common/runtime/collector.py +++ b/dlt/common/runtime/collector.py @@ -4,7 +4,20 @@ import time from abc import ABC, abstractmethod from collections import defaultdict -from typing import Any, ContextManager, Dict, Type, TYPE_CHECKING, DefaultDict, NamedTuple, Optional, Union, TextIO, TypeVar +from typing import ( + 
Any, + ContextManager, + Dict, + Type, + TYPE_CHECKING, + DefaultDict, + NamedTuple, + Optional, + Union, + TextIO, + TypeVar, +) + if TYPE_CHECKING: from tqdm import tqdm import enlighten @@ -20,11 +33,12 @@ class Collector(ABC): - step: str @abstractmethod - def update(self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None) -> None: + def update( + self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None + ) -> None: """Creates or updates a counter This function updates a counter `name` with a value `inc`. If counter does not exist, it is created with optional total value of `total`. @@ -65,7 +79,9 @@ def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb class NullCollector(Collector): """A default counter that does not count anything.""" - def update(self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None) -> None: + def update( + self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None + ) -> None: pass def _start(self, step: str) -> None: @@ -81,7 +97,9 @@ class DictCollector(Collector): def __init__(self) -> None: self.counters: DefaultDict[str, int] = None - def update(self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None) -> None: + def update( + self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None + ) -> None: assert not label, "labels not supported in dict collector" self.counters[name] += inc @@ -103,7 +121,13 @@ class CounterInfo(NamedTuple): start_time: float total: Optional[int] - def __init__(self, log_period: float = 1.0, logger: Union[logging.Logger, TextIO] = sys.stdout, log_level: int = logging.INFO, dump_system_stats: bool = True) -> None: + def __init__( + self, + log_period: float = 1.0, + logger: Union[logging.Logger, TextIO] = sys.stdout, + log_level: int = logging.INFO, + dump_system_stats: bool = True, + ) -> None: """ Collector writing to a `logger` every `log_period` seconds. The logger can be a Python logger instance, text stream, or None that will attach `dlt` logger @@ -123,12 +147,19 @@ def __init__(self, log_period: float = 1.0, logger: Union[logging.Logger, TextIO try: import psutil except ImportError: - self._log(logging.WARNING, "psutil dependency is not installed and mem stats will not be available. add psutil to your environment or pass dump_system_stats argument as False to disable warning.") + self._log( + logging.WARNING, + "psutil dependency is not installed and mem stats will not be available. 
add" + " psutil to your environment or pass dump_system_stats argument as False to" + " disable warning.", + ) dump_system_stats = False self.dump_system_stats = dump_system_stats self.last_log_time: float = None - def update(self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None) -> None: + def update( + self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = None + ) -> None: counter_key = f"{name}_{label}" if label else name if counter_key not in self.counters: @@ -169,7 +200,10 @@ def dump_counters(self) -> None: items_per_second_str = f"{items_per_second:.2f}/s" message = f"[{self.messages[name]}]" if self.messages[name] is not None else "" - counter_line = f"{info.description}: {progress} {percentage} | Time: {elapsed_time_str} | Rate: {items_per_second_str} {message}" + counter_line = ( + f"{info.description}: {progress} {percentage} | Time: {elapsed_time_str} | Rate:" + f" {items_per_second_str} {message}" + ) log_lines.append(counter_line.strip()) if self.dump_system_stats: @@ -177,10 +211,13 @@ def dump_counters(self) -> None: process = psutil.Process(os.getpid()) mem_info = process.memory_info() - current_mem = mem_info.rss / (1024 ** 2) # Convert to MB + current_mem = mem_info.rss / (1024**2) # Convert to MB mem_percent = psutil.virtual_memory().percent cpu_percent = process.cpu_percent() - log_lines.append(f"Memory usage: {current_mem:.2f} MB ({mem_percent:.2f}%) | CPU usage: {cpu_percent:.2f}%") + log_lines.append( + f"Memory usage: {current_mem:.2f} MB ({mem_percent:.2f}%) | CPU usage:" + f" {cpu_percent:.2f}%" + ) log_lines.append("") log_message = "\n".join(log_lines) @@ -218,12 +255,16 @@ def __init__(self, single_bar: bool = False, **tqdm_kwargs: Any) -> None: global tqdm from tqdm import tqdm except ModuleNotFoundError: - raise MissingDependencyException("TqdmCollector", ["tqdm"], "We need tqdm to display progress bars.") + raise MissingDependencyException( + "TqdmCollector", ["tqdm"], "We need tqdm to display progress bars." 
+ ) self.single_bar = single_bar self._bars: Dict[str, tqdm[None]] = {} self.tqdm_kwargs = tqdm_kwargs or {} - def update(self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = "") -> None: + def update( + self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = "" + ) -> None: key = f"{name}_{label}" bar = self._bars.get(key) if bar is None: @@ -263,13 +304,19 @@ def __init__(self, single_bar: bool = True, **alive_kwargs: Any) -> None: from alive_progress import alive_bar except ModuleNotFoundError: - raise MissingDependencyException("AliveCollector", ["alive-progress"], "We need alive-progress to display progress bars.") + raise MissingDependencyException( + "AliveCollector", + ["alive-progress"], + "We need alive-progress to display progress bars.", + ) self.single_bar = single_bar self._bars: Dict[str, Any] = {} self._bars_contexts: Dict[str, ContextManager[Any]] = {} self.alive_kwargs = alive_kwargs or {} - def update(self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = "") -> None: + def update( + self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = "" + ) -> None: key = f"{name}_{label}" bar = self._bars.get(key) if bar is None: @@ -313,13 +360,23 @@ def __init__(self, single_bar: bool = False, **enlighten_kwargs: Any) -> None: global enlighten import enlighten - from enlighten import Counter as EnlCounter, StatusBar as EnlStatusBar, Manager as EnlManager + from enlighten import ( + Counter as EnlCounter, + StatusBar as EnlStatusBar, + Manager as EnlManager, + ) except ModuleNotFoundError: - raise MissingDependencyException("EnlightenCollector", ["enlighten"], "We need enlighten to display progress bars with a space for log messages.") + raise MissingDependencyException( + "EnlightenCollector", + ["enlighten"], + "We need enlighten to display progress bars with a space for log messages.", + ) self.single_bar = single_bar self.enlighten_kwargs = enlighten_kwargs - def update(self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = "") -> None: + def update( + self, name: str, inc: int = 1, total: int = None, message: str = None, label: str = "" + ) -> None: key = f"{name}_{label}" bar = self._bars.get(key) if bar is None: @@ -328,7 +385,9 @@ def update(self, name: str, inc: int = 1, total: int = None, message: str = Non if len(self._bars) > 0 and self.single_bar: # do not add any more counters return - bar = self._manager.counter(desc=name, total=total, leave=True, force=True, **self.enlighten_kwargs) + bar = self._manager.counter( + desc=name, total=total, leave=True, force=True, **self.enlighten_kwargs + ) bar.refresh() self._bars[key] = bar bar.update(inc) @@ -336,7 +395,9 @@ def update(self, name: str, inc: int = 1, total: int = None, message: str = Non def _start(self, step: str) -> None: self._bars = {} self._manager = enlighten.get_manager(enabled=True) - self._status = self._manager.status_bar(leave=True, justify=enlighten.Justify.CENTER, fill="=") + self._status = self._manager.status_bar( + leave=True, justify=enlighten.Justify.CENTER, fill="=" + ) self._status.update(step) def _stop(self) -> None: @@ -352,4 +413,4 @@ def _stop(self) -> None: self._status = None -NULL_COLLECTOR = NullCollector() +NULL_COLLECTOR = NullCollector() diff --git a/dlt/common/runtime/exec_info.py b/dlt/common/runtime/exec_info.py index ecb8376aa7..d365156ad2 100644 --- a/dlt/common/runtime/exec_info.py +++ b/dlt/common/runtime/exec_info.py @@ -7,7 +7,17 
@@ from dlt.version import __version__ -TExecInfoNames = Literal["kubernetes", "docker", "codespaces", "github_actions", "airflow", "notebook", "colab","aws_lambda","gcp_cloud_function"] +TExecInfoNames = Literal[ + "kubernetes", + "docker", + "codespaces", + "github_actions", + "airflow", + "notebook", + "colab", + "aws_lambda", + "gcp_cloud_function", +] # if one of these environment variables is set, we assume to be running in CI env CI_ENVIRONMENT_TELL = [ "bamboo.buildKey", @@ -100,7 +110,7 @@ def is_running_in_airflow_task() -> bool: from airflow.operators.python import get_current_context context = get_current_context() - return context is not None and 'ti' in context + return context is not None and "ti" in context except Exception: return False @@ -163,4 +173,4 @@ def is_aws_lambda() -> bool: def is_gcp_cloud_function() -> bool: "Return True if the process is running in the serverless platform GCP Cloud Functions" - return os.environ.get("FUNCTION_NAME") is not None \ No newline at end of file + return os.environ.get("FUNCTION_NAME") is not None diff --git a/dlt/common/runtime/json_logging.py b/dlt/common/runtime/json_logging.py index bfff063dab..042236a093 100644 --- a/dlt/common/runtime/json_logging.py +++ b/dlt/common/runtime/json_logging.py @@ -1,4 +1,3 @@ - import logging from datetime import datetime # noqa: I251 import traceback @@ -8,7 +7,7 @@ from dlt.common.json import json from dlt.common.typing import DictStrAny, StrAny -EMPTY_VALUE = '-' +EMPTY_VALUE = "-" JSON_SERIALIZER = lambda log: json.dumps(log) COMPONENT_ID = EMPTY_VALUE COMPONENT_NAME = EMPTY_VALUE @@ -17,15 +16,35 @@ # The list contains all the attributes listed in # http://docs.python.org/library/logging.html#logrecord-attributes RECORD_ATTR_SKIP_LIST = [ - 'asctime', 'created', 'exc_info', 'exc_text', 'filename', 'args', - 'funcName', 'id', 'levelname', 'levelno', 'lineno', 'module', 'msg', - 'msecs', 'msecs', 'message', 'name', 'pathname', 'process', - 'processName', 'relativeCreated', 'thread', 'threadName', 'extra', + "asctime", + "created", + "exc_info", + "exc_text", + "filename", + "args", + "funcName", + "id", + "levelname", + "levelno", + "lineno", + "module", + "msg", + "msecs", + "msecs", + "message", + "name", + "pathname", + "process", + "processName", + "relativeCreated", + "thread", + "threadName", + "extra", # Also exclude legacy 'props' - 'props', + "props", ] -RECORD_ATTR_SKIP_LIST.append('stack_info') +RECORD_ATTR_SKIP_LIST.append("stack_info") EASY_TYPES = (str, bool, dict, float, int, list, type(None)) _default_formatter: Type[logging.Formatter] = None @@ -34,10 +53,10 @@ def config_root_logger() -> None: """ - You must call this if you are using root logger. - Make all root logger' handlers produce JSON format - & remove duplicate handlers for request instrumentation logging. - Please made sure that you call this after you called "logging.basicConfig() or logging.getLogger() + You must call this if you are using root logger. + Make all root logger' handlers produce JSON format + & remove duplicate handlers for request instrumentation logging. 
+ Please made sure that you call this after you called "logging.basicConfig() or logging.getLogger() """ global _default_formatter update_formatter_for_loggers([logging.root], _default_formatter) @@ -54,7 +73,9 @@ def init(custom_formatter: Type[logging.Formatter] = None) -> None: if custom_formatter: if not issubclass(custom_formatter, logging.Formatter): - raise ValueError('custom_formatter is not subclass of logging.Formatter', custom_formatter) + raise ValueError( + "custom_formatter is not subclass of logging.Formatter", custom_formatter + ) _default_formatter = custom_formatter if custom_formatter else JSONLogFormatter logging._defaultFormatter = _default_formatter() # type: ignore @@ -66,8 +87,9 @@ def init(custom_formatter: Type[logging.Formatter] = None) -> None: class BaseJSONFormatter(logging.Formatter): """ - Base class for JSON formatters + Base class for JSON formatters """ + base_object_common: DictStrAny = {} def __init__(self, *args: Any, **kw: Any) -> None: @@ -98,7 +120,7 @@ def _get_extra_fields(self, record: logging.LogRecord) -> StrAny: fields: DictStrAny = {} if record.args: - fields['msg'] = record.msg + fields["msg"] = record.msg for key, value in record.__dict__.items(): if key not in RECORD_ATTR_SKIP_LIST: @@ -108,15 +130,14 @@ def _get_extra_fields(self, record: logging.LogRecord) -> StrAny: fields[key] = repr(value) # Always add 'props' to the root of the log, assumes props is a dict - if hasattr(record, 'props') and isinstance(record.props, dict): + if hasattr(record, "props") and isinstance(record.props, dict): fields.update(record.props) return fields - def _sanitize_log_msg(record: logging.LogRecord) -> str: - return record.getMessage().replace('\n', '_').replace('\r', '_').replace('\t', '_') + return record.getMessage().replace("\n", "_").replace("\r", "_").replace("\t", "_") class JSONLogFormatter(BaseJSONFormatter): @@ -130,25 +151,27 @@ def get_exc_fields(self, record: logging.LogRecord) -> StrAny: else: exc_info = record.exc_text return { - 'exc_info': exc_info, - 'filename': record.filename, + "exc_info": exc_info, + "filename": record.filename, } @classmethod def format_exception(cls, exc_info: Any) -> str: - return ''.join(traceback.format_exception(*exc_info)) if exc_info else '' + return "".join(traceback.format_exception(*exc_info)) if exc_info else "" def _format_log_object(self, record: logging.LogRecord) -> DictStrAny: json_log_object = super(JSONLogFormatter, self)._format_log_object(record) - json_log_object.update({ - "msg": _sanitize_log_msg(record), - "type": "log", - "logger": record.name, - "thread": record.threadName, - "level": record.levelname, - "module": record.module, - "line_no": record.lineno, - }) + json_log_object.update( + { + "msg": _sanitize_log_msg(record), + "type": "log", + "logger": record.name, + "thread": record.threadName, + "level": record.levelname, + "module": record.module, + "line_no": record.lineno, + } + ) if record.exc_info or record.exc_text: json_log_object.update(self.get_exc_fields(record)) @@ -156,7 +179,9 @@ def _format_log_object(self, record: logging.LogRecord) -> DictStrAny: return json_log_object -def update_formatter_for_loggers(loggers_iter: List[Logger], formatter: Type[logging.Formatter]) -> None: +def update_formatter_for_loggers( + loggers_iter: List[Logger], formatter: Type[logging.Formatter] +) -> None: """ :param formatter: :param loggers_iter: @@ -174,6 +199,12 @@ def epoch_nano_second(datetime_: datetime) -> int: def iso_time_format(datetime_: datetime) -> str: - return 
'%04d-%02d-%02dT%02d:%02d:%02d.%03dZ' % ( - datetime_.year, datetime_.month, datetime_.day, datetime_.hour, datetime_.minute, datetime_.second, - int(datetime_.microsecond / 1000)) + return "%04d-%02d-%02dT%02d:%02d:%02d.%03dZ" % ( + datetime_.year, + datetime_.month, + datetime_.day, + datetime_.hour, + datetime_.minute, + datetime_.second, + int(datetime_.microsecond / 1000), + ) diff --git a/dlt/common/runtime/logger.py b/dlt/common/runtime/logger.py index f833d36608..9dd8ce4e3a 100644 --- a/dlt/common/runtime/logger.py +++ b/dlt/common/runtime/logger.py @@ -14,12 +14,12 @@ class LogMethod(Protocol): - def __call__(self, msg: str, *args: Any, **kwds: Any) -> None: - ... + def __call__(self, msg: str, *args: Any, **kwds: Any) -> None: ... def __getattr__(name: str) -> LogMethod: """Forwards log method calls (debug, info, error etc.) to LOGGER""" + def wrapper(msg: str, *args: Any, **kwargs: Any) -> None: if LOGGER: # skip stack frames when displaying log so the original logging frame is displayed @@ -28,6 +28,7 @@ def wrapper(msg: str, *args: Any, **kwargs: Any) -> None: # exception has one more frame stacklevel = 3 getattr(LOGGER, name)(msg, *args, **kwargs, stacklevel=stacklevel) + return wrapper @@ -50,11 +51,8 @@ def init_logging(config: RunConfiguration) -> None: version = dlt_version_info(config.pipeline_name) LOGGER = _init_logging( - DLT_LOGGER_NAME, - config.log_level, - config.log_format, - config.pipeline_name, - version) + DLT_LOGGER_NAME, config.log_level, config.log_format, config.pipeline_name, version + ) def is_logging() -> bool: @@ -84,7 +82,9 @@ def format(self, record: LogRecord) -> str: # noqa: A003 return s -def _init_logging(logger_name: str, level: str, fmt: str, component: str, version: StrStr) -> Logger: +def _init_logging( + logger_name: str, level: str, fmt: str, component: str, version: StrStr +) -> Logger: if logger_name == "root": logging.basicConfig(level=level) handler = logging.getLogger().handlers[0] @@ -120,6 +120,6 @@ def _format_log_object(self, record: LogRecord) -> Any: if logger_name == "root": json_logging.config_root_logger() else: - handler.setFormatter(_MetricsFormatter(fmt=fmt, style='{')) + handler.setFormatter(_MetricsFormatter(fmt=fmt, style="{")) return logger diff --git a/dlt/common/runtime/prometheus.py b/dlt/common/runtime/prometheus.py index 0634670a5a..1b233ffa9b 100644 --- a/dlt/common/runtime/prometheus.py +++ b/dlt/common/runtime/prometheus.py @@ -23,7 +23,9 @@ def get_metrics_from_prometheus(gauges: Iterable[MetricWrapperBase]) -> StrAny: name = g._name if g._is_parent(): # for gauges containing many label values, enumerate all - metrics.update(get_metrics_from_prometheus([g.labels(*label) for label in g._metrics.keys()])) + metrics.update( + get_metrics_from_prometheus([g.labels(*label) for label in g._metrics.keys()]) + ) continue # for gauges with labels: add the label to the name and enumerate samples if g._labelvalues: diff --git a/dlt/common/runtime/segment.py b/dlt/common/runtime/segment.py index b8d533cccb..d06ef80607 100644 --- a/dlt/common/runtime/segment.py +++ b/dlt/common/runtime/segment.py @@ -31,7 +31,9 @@ def init_segment(config: RunConfiguration) -> None: - assert config.dlthub_telemetry_segment_write_key, "dlthub_telemetry_segment_write_key not present in RunConfiguration" + assert ( + config.dlthub_telemetry_segment_write_key + ), "dlthub_telemetry_segment_write_key not present in RunConfiguration" # create thread pool to send telemetry to segment global _THREAD_POOL, _WRITE_KEY, _SESSION @@ -51,11 +53,7 @@ def 
disable_segment() -> None: _at_exit_cleanup() -def track( - event_category: TEventCategory, - event_name: str, - properties: DictStrAny -) -> None: +def track(event_category: TEventCategory, event_name: str, properties: DictStrAny) -> None: """Tracks a telemetry event. The segment event name will be created as "{event_category}_{event_name} @@ -68,10 +66,7 @@ def track( if properties is None: properties = {} - properties.update({ - "event_category": event_category, - "event_name": event_name - }) + properties.update({"event_category": event_category, "event_name": event_name}) try: _send_event(f"{event_category}_{event_name}", properties, _default_context_fields()) @@ -127,11 +122,7 @@ def get_anonymous_id() -> str: return anonymous_id -def _segment_request_payload( - event_name: str, - properties: StrAny, - context: StrAny -) -> DictStrAny: +def _segment_request_payload(event_name: str, properties: StrAny, context: StrAny) -> DictStrAny: """Compose a valid payload for the segment API. Args: @@ -167,7 +158,7 @@ def _default_context_fields() -> DictStrAny: "python": sys.version.split(" ")[0], "library": {"name": DLT_PKG_NAME, "version": __version__}, "cpu": multiprocessing.cpu_count(), - "exec_info": exec_info_names() + "exec_info": exec_info_names(), } # avoid returning the cached dict --> caller could modify the dictionary... @@ -176,11 +167,7 @@ def _default_context_fields() -> DictStrAny: return _SEGMENT_CONTEXT.copy() -def _send_event( - event_name: str, - properties: StrAny, - context: StrAny -) -> None: +def _send_event(event_name: str, properties: StrAny, context: StrAny) -> None: """Report the contents segment of an event to the /track Segment endpoint. Args: @@ -205,7 +192,9 @@ def _send_event( def _future_send() -> None: # import time # start_ts = time.time() - resp = _SESSION.post(_SEGMENT_ENDPOINT, headers=headers, json=payload, timeout=_SEGMENT_REQUEST_TIMEOUT) + resp = _SESSION.post( + _SEGMENT_ENDPOINT, headers=headers, json=payload, timeout=_SEGMENT_REQUEST_TIMEOUT + ) # print(f"SENDING TO Segment done {resp.status_code} {time.time() - start_ts} {base64.b64decode(_WRITE_KEY)}") # handle different failure cases if resp.status_code != 200: @@ -216,8 +205,6 @@ def _future_send() -> None: else: data = resp.json() if not data.get("success"): - logger.debug( - f"Segment telemetry request returned a failure. Response: {data}" - ) + logger.debug(f"Segment telemetry request returned a failure. Response: {data}") _THREAD_POOL.submit(_future_send) diff --git a/dlt/common/runtime/sentry.py b/dlt/common/runtime/sentry.py index 8bc70e46cf..7ea45affc0 100644 --- a/dlt/common/runtime/sentry.py +++ b/dlt/common/runtime/sentry.py @@ -8,7 +8,11 @@ from sentry_sdk.transport import HttpTransport from sentry_sdk.integrations.logging import LoggingIntegration except ModuleNotFoundError: - raise MissingDependencyException("sentry telemetry", ["sentry-sdk"], "Please install sentry-sdk if you have `sentry_dsn` set in your RuntimeConfiguration") + raise MissingDependencyException( + "sentry telemetry", + ["sentry-sdk"], + "Please install sentry-sdk if you have `sentry_dsn` set in your RuntimeConfiguration", + ) from dlt.common.typing import DictStrAny, Any, StrAny from dlt.common.configuration.specs import RunConfiguration @@ -27,10 +31,10 @@ def init_sentry(config: RunConfiguration) -> None: before_send=before_send, traces_sample_rate=1.0, # disable tornado, boto3, sql alchemy etc. 
- auto_enabling_integrations = False, + auto_enabling_integrations=False, integrations=[_get_sentry_log_level(config)], release=release, - transport=_SentryHttpTransport + transport=_SentryHttpTransport, ) # add version tags for k, v in version.items(): @@ -58,12 +62,11 @@ def before_send(event: DictStrAny, _unused_hint: Optional[StrAny] = None) -> Opt class _SentryHttpTransport(HttpTransport): - timeout: float = 0 def _get_pool_options(self, *a: Any, **kw: Any) -> DictStrAny: rv = HttpTransport._get_pool_options(self, *a, **kw) - rv['timeout'] = self.timeout + rv["timeout"] = self.timeout return rv @@ -71,6 +74,6 @@ def _get_sentry_log_level(config: RunConfiguration) -> LoggingIntegration: log_level = logging._nameToLevel[config.log_level] event_level = logging.WARNING if log_level <= logging.WARNING else log_level return LoggingIntegration( - level=logging.INFO, # Capture info and above as breadcrumbs - event_level=event_level # Send errors as events + level=logging.INFO, # Capture info and above as breadcrumbs + event_level=event_level, # Send errors as events ) diff --git a/dlt/common/runtime/slack.py b/dlt/common/runtime/slack.py index ce5e90b300..15da89f333 100644 --- a/dlt/common/runtime/slack.py +++ b/dlt/common/runtime/slack.py @@ -4,13 +4,10 @@ def send_slack_message(incoming_hook: str, message: str, is_markdown: bool = True) -> None: """Sends a `message` to Slack `incoming_hook`, by default formatted as markdown.""" - r = requests.post(incoming_hook, - data = json.dumps({ - "text": message, - "mrkdwn": is_markdown - } - ).encode("utf-8"), - headers={'Content-Type': 'application/json;charset=utf-8'} + r = requests.post( + incoming_hook, + data=json.dumps({"text": message, "mrkdwn": is_markdown}).encode("utf-8"), + headers={"Content-Type": "application/json;charset=utf-8"}, ) if r.status_code >= 400: logger.warning(f"Could not post the notification to slack: {r.status_code}") diff --git a/dlt/common/runtime/telemetry.py b/dlt/common/runtime/telemetry.py index 86b3355985..e03bc04d79 100644 --- a/dlt/common/runtime/telemetry.py +++ b/dlt/common/runtime/telemetry.py @@ -21,6 +21,7 @@ def start_telemetry(config: RunConfiguration) -> None: if config.sentry_dsn: # may raise if sentry is not installed from dlt.common.runtime.sentry import init_sentry + init_sentry(config) if config.dlthub_telemetry: @@ -36,6 +37,7 @@ def stop_telemetry() -> None: try: from dlt.common.runtime.sentry import disable_sentry + disable_sentry() except ImportError: pass @@ -49,14 +51,18 @@ def is_telemetry_started() -> bool: return _TELEMETRY_STARTED -def with_telemetry(category: TEventCategory, command: str, track_before: bool, *args: str) -> Callable[[TFun], TFun]: +def with_telemetry( + category: TEventCategory, command: str, track_before: bool, *args: str +) -> Callable[[TFun], TFun]: """Adds telemetry to f: TFun and add optional f *args values to `properties` of telemetry event""" + def decorator(f: TFun) -> TFun: sig: inspect.Signature = inspect.signature(f) + def _wrap(*f_args: Any, **f_kwargs: Any) -> Any: # look for additional arguments bound_args = sig.bind(*f_args, **f_kwargs) - props = {p:bound_args.arguments[p] for p in args if p in bound_args.arguments} + props = {p: bound_args.arguments[p] for p in args if p in bound_args.arguments} start_ts = time.time() def _track(success: bool) -> None: @@ -88,4 +94,5 @@ def _track(success: bool) -> None: raise return _wrap # type: ignore - return decorator \ No newline at end of file + + return decorator diff --git a/dlt/common/schema/__init__.py 
b/dlt/common/schema/__init__.py index ac320bef0a..9cb5e2ab76 100644 --- a/dlt/common/schema/__init__.py +++ b/dlt/common/schema/__init__.py @@ -1,11 +1,32 @@ -from dlt.common.schema.typing import TSchemaContractDict, TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase +from dlt.common.schema.typing import ( + TSchemaContractDict, + TSchemaUpdate, + TSchemaTables, + TTableSchema, + TStoredSchema, + TTableSchemaColumns, + TColumnHint, + TColumnSchema, + TColumnSchemaBase, +) from dlt.common.schema.typing import COLUMN_HINTS from dlt.common.schema.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE from dlt.common.schema.exceptions import DataValidationError from dlt.common.schema.utils import verify_schema_hash __all__ = [ - "TSchemaUpdate", "TSchemaTables", "TTableSchema", "TStoredSchema", "TTableSchemaColumns", "TColumnHint", - "TColumnSchema", "TColumnSchemaBase", "COLUMN_HINTS", "Schema", "verify_schema_hash", "TSchemaContractDict", - "DEFAULT_SCHEMA_CONTRACT_MODE", "DataValidationError" + "TSchemaUpdate", + "TSchemaTables", + "TTableSchema", + "TStoredSchema", + "TTableSchemaColumns", + "TColumnHint", + "TColumnSchema", + "TColumnSchemaBase", + "COLUMN_HINTS", + "Schema", + "verify_schema_hash", + "TSchemaContractDict", + "DEFAULT_SCHEMA_CONTRACT_MODE", + "DataValidationError", ] diff --git a/dlt/common/schema/detections.py b/dlt/common/schema/detections.py index 207c934091..30b23706af 100644 --- a/dlt/common/schema/detections.py +++ b/dlt/common/schema/detections.py @@ -43,7 +43,7 @@ def is_iso_date(t: Type[Any], v: Any) -> Optional[TDataType]: if not v: return None # don't cast iso timestamps as dates - if is_iso_timestamp(t,v): + if is_iso_timestamp(t, v): return None # strict autodetection of iso timestamps try: diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 96df6b7418..f040325da9 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -2,7 +2,11 @@ from dlt.common.exceptions import DltException from dlt.common.data_types import TDataType -from dlt.common.schema.typing import TSchemaContractDict, TSchemaContractEntities, TSchemaEvolutionMode +from dlt.common.schema.typing import ( + TSchemaContractDict, + TSchemaContractEntities, + TSchemaEvolutionMode, +) class SchemaException(DltException): @@ -14,23 +18,41 @@ class InvalidSchemaName(ValueError, SchemaException): def __init__(self, name: str) -> None: self.name = name - super().__init__(f"{name} is an invalid schema/source name. The source or schema name must be a valid Python identifier ie. a snake case function name and have maximum {self.MAXIMUM_SCHEMA_NAME_LENGTH} characters. Ideally should contain only small letters, numbers and underscores.") + super().__init__( + f"{name} is an invalid schema/source name. The source or schema name must be a valid" + " Python identifier ie. a snake case function name and have maximum" + f" {self.MAXIMUM_SCHEMA_NAME_LENGTH} characters. Ideally should contain only small" + " letters, numbers and underscores." + ) class InvalidDatasetName(ValueError, SchemaException): def __init__(self, destination_name: str) -> None: self.destination_name = destination_name - super().__init__(f"Destination {destination_name} does not accept empty datasets. Please pass the dataset name to the destination configuration ie. via dlt pipeline.") + super().__init__( + f"Destination {destination_name} does not accept empty datasets. 
Please pass the" + " dataset name to the destination configuration ie. via dlt pipeline." + ) class CannotCoerceColumnException(SchemaException): - def __init__(self, table_name: str, column_name: str, from_type: TDataType, to_type: TDataType, coerced_value: Any) -> None: + def __init__( + self, + table_name: str, + column_name: str, + from_type: TDataType, + to_type: TDataType, + coerced_value: Any, + ) -> None: self.table_name = table_name self.column_name = column_name self.from_type = from_type self.to_type = to_type self.coerced_value = coerced_value - super().__init__(f"Cannot coerce type in table {table_name} column {column_name} existing type {from_type} coerced type {to_type} value: {coerced_value}") + super().__init__( + f"Cannot coerce type in table {table_name} column {column_name} existing type" + f" {from_type} coerced type {to_type} value: {coerced_value}" + ) class TablePropertiesConflictException(SchemaException): @@ -39,19 +61,27 @@ def __init__(self, table_name: str, prop_name: str, val1: str, val2: str): self.prop_name = prop_name self.val1 = val1 self.val2 = val2 - super().__init__(f"Cannot merge partial tables for {table_name} due to property {prop_name}: {val1} != {val2}") + super().__init__( + f"Cannot merge partial tables for {table_name} due to property {prop_name}: {val1} !=" + f" {val2}" + ) class ParentTableNotFoundException(SchemaException): def __init__(self, table_name: str, parent_table_name: str, explanation: str = "") -> None: self.table_name = table_name self.parent_table_name = parent_table_name - super().__init__(f"Parent table {parent_table_name} for {table_name} was not found in the schema.{explanation}") + super().__init__( + f"Parent table {parent_table_name} for {table_name} was not found in the" + f" schema.{explanation}" + ) class CannotCoerceNullException(SchemaException): def __init__(self, table_name: str, column_name: str) -> None: - super().__init__(f"Cannot coerce NULL in table {table_name} column {column_name} which is not nullable") + super().__init__( + f"Cannot coerce NULL in table {table_name} column {column_name} which is not nullable" + ) class SchemaCorruptedException(SchemaException): @@ -59,12 +89,17 @@ class SchemaCorruptedException(SchemaException): class SchemaEngineNoUpgradePathException(SchemaException): - def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engine: int) -> None: + def __init__( + self, schema_name: str, init_engine: int, from_engine: int, to_engine: int + ) -> None: self.schema_name = schema_name self.init_engine = init_engine self.from_engine = from_engine self.to_engine = to_engine - super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}") + super().__init__( + f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}," + f" stopped at {from_engine}" + ) class DataValidationError(SchemaException): @@ -78,11 +113,11 @@ def __init__( table_schema: Any, schema_contract: TSchemaContractDict, data_item: Any = None, - extended_info: str = None + extended_info: str = None, ) -> None: """Raised when `data_item` violates `contract_mode` on a `contract_entity` as defined by `table_schema` - Schema, table and column names are given as a context and full `schema_contract` and causing `data_item` as an evidence. + Schema, table and column names are given as a context and full `schema_contract` and causing `data_item` as an evidence. 
""" msg = "" if schema_name: @@ -90,7 +125,12 @@ def __init__( msg += f"Table: {table_name} " if column_name: msg += f"Column: {column_name}" - msg = "In " + msg + f" . Contract on {contract_entity} with mode {contract_mode} is violated. " + (extended_info or "") + msg = ( + "In " + + msg + + f" . Contract on {contract_entity} with mode {contract_mode} is violated. " + + (extended_info or "") + ) super().__init__(msg) self.schema_name = schema_name self.table_name = table_name diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index bcfba11c61..36b7627a0b 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -4,16 +4,49 @@ from dlt.common import json from dlt.common.utils import extend_list_deduplicated -from dlt.common.typing import DictStrAny, StrAny, REPattern, SupportsVariant, VARIANT_FIELD_FORMAT, TDataItem +from dlt.common.typing import ( + DictStrAny, + StrAny, + REPattern, + SupportsVariant, + VARIANT_FIELD_FORMAT, + TDataItem, +) from dlt.common.normalizers import TNormalizersConfig, explicit_normalizers, import_normalizers from dlt.common.normalizers.naming import NamingConvention from dlt.common.normalizers.json import DataItemNormalizer, TNormalizedRowIterator from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType -from dlt.common.schema.typing import (COLUMN_HINTS, DLT_NAME_PREFIX, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaContractEntities, TSchemaEvolutionMode, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractDict, TSchemaContract) -from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, - ParentTableNotFoundException, SchemaCorruptedException) +from dlt.common.schema.typing import ( + COLUMN_HINTS, + DLT_NAME_PREFIX, + SCHEMA_ENGINE_VERSION, + LOADS_TABLE_NAME, + VERSION_TABLE_NAME, + STATE_TABLE_NAME, + TPartialTableSchema, + TSchemaContractEntities, + TSchemaEvolutionMode, + TSchemaSettings, + TSimpleRegex, + TStoredSchema, + TSchemaTables, + TTableSchema, + TTableSchemaColumns, + TColumnSchema, + TColumnProp, + TColumnHint, + TTypeDetections, + TSchemaContractDict, + TSchemaContract, +) +from dlt.common.schema.exceptions import ( + CannotCoerceColumnException, + CannotCoerceNullException, + InvalidSchemaName, + ParentTableNotFoundException, + SchemaCorruptedException, +) from dlt.common.validation import validate_dict from dlt.common.schema.exceptions import DataValidationError @@ -21,9 +54,10 @@ DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractDict = { "tables": "evolve", "columns": "evolve", - "data_type": "evolve" + "data_type": "evolve", } + class Schema: ENGINE_VERSION: ClassVar[int] = SCHEMA_ENGINE_VERSION @@ -39,16 +73,15 @@ class Schema: state_table_name: str """Normalized name of the dlt state table""" - _schema_name: str _dlt_tables_prefix: str _stored_version: int # version at load/creation time _stored_version_hash: str # version hash at load/creation time - _stored_previous_hashes: Optional[List[str]] # list of ancestor hashes of the schema + _stored_previous_hashes: Optional[List[str]] # list of ancestor hashes of the schema _imported_version_hash: str # version hash of recently imported schema _schema_description: str # optional schema description _schema_tables: TSchemaTables - _settings: TSchemaSettings # schema 
settings to hold default hints, preferred types and other settings + _settings: TSchemaSettings # schema settings to hold default hints, preferred types and other settings # list of preferred types: map regex on columns into types _compiled_preferred_types: List[Tuple[REPattern, TDataType]] @@ -69,7 +102,6 @@ def __init__(self, name: str, normalizers: TNormalizersConfig = None) -> None: @classmethod def from_dict(cls, d: DictStrAny, bump_version: bool = True) -> "Schema": - # upgrade engine if needed stored_schema = utils.migrate_schema(d, d["engine_version"], cls.ENGINE_VERSION) # verify schema @@ -85,7 +117,9 @@ def from_dict(cls, d: DictStrAny, bump_version: bool = True) -> "Schema": @classmethod def from_stored_schema(cls, stored_schema: TStoredSchema) -> "Schema": # create new instance from dict - self: Schema = cls(stored_schema["name"], normalizers=stored_schema.get("normalizers", None)) + self: Schema = cls( + stored_schema["name"], normalizers=stored_schema.get("normalizers", None) + ) self._from_stored_schema(stored_schema) return self @@ -103,7 +137,7 @@ def to_dict(self, remove_defaults: bool = False, bump_version: bool = True) -> T "tables": self._schema_tables, "settings": self._settings, "normalizers": self._normalizers_config, - "previous_hashes": self._stored_previous_hashes + "previous_hashes": self._stored_previous_hashes, } if self._imported_version_hash and not remove_defaults: stored_schema["imported_version_hash"] = self._imported_version_hash @@ -118,7 +152,9 @@ def to_dict(self, remove_defaults: bool = False, bump_version: bool = True) -> T utils.remove_defaults(stored_schema) return stored_schema - def normalize_data_item(self, item: TDataItem, load_id: str, table_name: str) -> TNormalizedRowIterator: + def normalize_data_item( + self, item: TDataItem, load_id: str, table_name: str + ) -> TNormalizedRowIterator: return self.data_item_normalizer.normalize_data_item(item, load_id, table_name) def filter_row(self, table_name: str, row: StrAny) -> StrAny: @@ -135,7 +171,9 @@ def filter_row(self, table_name: str, row: StrAny) -> StrAny: # most of the schema do not use them return row - def _exclude(path: str, excludes: Sequence[REPattern], includes: Sequence[REPattern]) -> bool: + def _exclude( + path: str, excludes: Sequence[REPattern], includes: Sequence[REPattern] + ) -> bool: is_included = False is_excluded = any(exclude.search(path) for exclude in excludes) if is_excluded: @@ -164,16 +202,18 @@ def _exclude(path: str, excludes: Sequence[REPattern], includes: Sequence[REPatt break return row - def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[DictStrAny, TPartialTableSchema]: + def coerce_row( + self, table_name: str, parent_table: str, row: StrAny + ) -> Tuple[DictStrAny, TPartialTableSchema]: """Fits values of fields present in `row` into a schema of `table_name`. Will coerce values into data types and infer new tables and column schemas. - Method expects that field names in row are already normalized. - * if table schema for `table_name` does not exist, new table is created - * if column schema for a field in `row` does not exist, it is inferred from data - * if incomplete column schema (no data type) exists, column is inferred from data and existing hints are applied - * fields with None value are removed + Method expects that field names in row are already normalized. 
+ * if table schema for `table_name` does not exist, new table is created + * if column schema for a field in `row` does not exist, it is inferred from data + * if incomplete column schema (no data type) exists, column is inferred from data and existing hints are applied + * fields with None value are removed - Returns tuple with row with coerced values and a partial table containing just the newly added columns or None if no changes were detected + Returns tuple with row with coerced values and a partial table containing just the newly added columns or None if no changes were detected """ # get existing or create a new table updated_table_partial: TPartialTableSchema = None @@ -189,7 +229,9 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D # just check if column is nullable if it exists self._coerce_null_value(table_columns, table_name, col_name) else: - new_col_name, new_col_def, new_v = self._coerce_non_null_value(table_columns, table_name, col_name, v) + new_col_name, new_col_def, new_v = self._coerce_non_null_value( + table_columns, table_name, col_name, v + ) new_row[new_col_name] = new_v if new_col_def: if not updated_table_partial: @@ -201,12 +243,14 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial def apply_schema_contract( - self, - schema_contract: TSchemaContractDict, - partial_table: TPartialTableSchema, - data_item: TDataItem = None, - raise_on_freeze: bool = True - ) -> Tuple[TPartialTableSchema, List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]]]: + self, + schema_contract: TSchemaContractDict, + partial_table: TPartialTableSchema, + data_item: TDataItem = None, + raise_on_freeze: bool = True, + ) -> Tuple[ + TPartialTableSchema, List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]] + ]: """ Checks if `schema_contract` allows for the `partial_table` to update the schema. It applies the contract dropping the affected columns or the whole `partial_table`. It generates and returns a set of filters that should be applied to incoming data in order to modify it @@ -245,7 +289,15 @@ def apply_schema_contract( if is_new_table and schema_contract["tables"] != "evolve": if raise_on_freeze and schema_contract["tables"] == "freeze": raise DataValidationError( - self.name, table_name, None, "tables", "freeze", None, schema_contract, data_item, f"Trying to add table {table_name} but new tables are frozen." + self.name, + table_name, + None, + "tables", + "freeze", + None, + schema_contract, + data_item, + f"Trying to add table {table_name} but new tables are frozen.", ) # filter tables with name below return None, [("tables", table_name, schema_contract["tables"])] @@ -266,7 +318,16 @@ def apply_schema_contract( if column_mode != "evolve" and not is_variant: if raise_on_freeze and column_mode == "freeze": raise DataValidationError( - self.name, table_name, column_name, "columns", "freeze", existing_table, schema_contract, data_item, f"Trying to add column {column_name} to table {table_name} but columns are frozen." 
+ self.name, + table_name, + column_name, + "columns", + "freeze", + existing_table, + schema_contract, + data_item, + f"Trying to add column {column_name} to table {table_name} but columns are" + " frozen.", ) # filter column with name below filters.append(("columns", column_name, column_mode)) @@ -277,7 +338,16 @@ def apply_schema_contract( if data_mode != "evolve" and is_variant: if raise_on_freeze and data_mode == "freeze": raise DataValidationError( - self.name, table_name, column_name, "data_type", "freeze", existing_table, schema_contract, data_item, f"Trying to create new variant column {column_name} to table {table_name} but data_types are frozen." + self.name, + table_name, + column_name, + "data_type", + "freeze", + existing_table, + schema_contract, + data_item, + f"Trying to create new variant column {column_name} to table" + f" {table_name} but data_types are frozen.", ) # filter column with name below filters.append(("columns", column_name, data_mode)) @@ -287,13 +357,19 @@ def apply_schema_contract( return partial_table, filters @staticmethod - def expand_schema_contract_settings(settings: TSchemaContract, default: TSchemaContractDict = None) -> TSchemaContractDict: + def expand_schema_contract_settings( + settings: TSchemaContract, default: TSchemaContractDict = None + ) -> TSchemaContractDict: """Expand partial or shorthand settings into full settings dictionary using `default` for unset entities""" if isinstance(settings, str): settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) - return cast(TSchemaContractDict, {**(default or DEFAULT_SCHEMA_CONTRACT_MODE), **(settings or {})}) + return cast( + TSchemaContractDict, {**(default or DEFAULT_SCHEMA_CONTRACT_MODE), **(settings or {})} + ) - def resolve_contract_settings_for_table(self, table_name: str, new_table_schema: TTableSchema = None) -> TSchemaContractDict: + def resolve_contract_settings_for_table( + self, table_name: str, new_table_schema: TTableSchema = None + ) -> TSchemaContractDict: """Resolve the exact applicable schema contract settings for the table `table_name`. `new_table_schema` is added to the tree during the resolution.""" settings: TSchemaContract = {} @@ -320,9 +396,12 @@ def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchem if parent_table_name is not None: if self._schema_tables.get(parent_table_name) is None: raise ParentTableNotFoundException( - table_name, parent_table_name, - f" This may be due to misconfigured excludes filter that fully deletes content of the {parent_table_name}. Add includes that will preserve the parent table." - ) + table_name, + parent_table_name, + " This may be due to misconfigured excludes filter that fully deletes content" + f" of the {parent_table_name}. Add includes that will preserve the parent" + " table.", + ) table = self._schema_tables.get(table_name) if table is None: # add the whole new table to SchemaTables @@ -334,7 +413,6 @@ def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchem self.data_item_normalizer.extend_table(table_name) return partial_table - def update_schema(self, schema: "Schema") -> None: """Updates this schema from an incoming schema""" # update all tables @@ -346,7 +424,6 @@ def update_schema(self, schema: "Schema") -> None: self._settings = deepcopy(schema.settings) self._compile_settings() - def bump_version(self) -> Tuple[int, str]: """Computes schema hash in order to check if schema content was modified. 
In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. @@ -355,7 +432,9 @@ def bump_version(self) -> Tuple[int, str]: Returns: Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple """ - self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified(self.to_dict(bump_version=False)) + self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified( + self.to_dict(bump_version=False) + ) return self._stored_version, self._stored_version_hash def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: @@ -378,7 +457,12 @@ def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: Str def merge_hints(self, new_hints: Mapping[TColumnHint, Sequence[TSimpleRegex]]) -> None: # validate regexes - validate_dict(TSchemaSettings, {"default_hints": new_hints}, ".", validator_f=utils.simple_regex_validator) + validate_dict( + TSchemaSettings, + {"default_hints": new_hints}, + ".", + validator_f=utils.simple_regex_validator, + ) # prepare hints to be added default_hints = self._settings.setdefault("default_hints", {}) # add `new_hints` to existing hints @@ -392,12 +476,12 @@ def merge_hints(self, new_hints: Mapping[TColumnHint, Sequence[TSimpleRegex]]) - def normalize_table_identifiers(self, table: TTableSchema) -> TTableSchema: """Normalizes all table and column names in `table` schema according to current schema naming convention and returns - new normalized TTableSchema instance. + new normalized TTableSchema instance. - Naming convention like snake_case may produce name clashes with the column names. Clashing column schemas are merged - where the column that is defined later in the dictionary overrides earlier column. + Naming convention like snake_case may produce name clashes with the column names. Clashing column schemas are merged + where the column that is defined later in the dictionary overrides earlier column. - Note that resource name is not normalized. + Note that resource name is not normalized. """ # normalize all identifiers in table according to name normalizer of the schema @@ -413,13 +497,20 @@ def normalize_table_identifiers(self, table: TTableSchema) -> TTableSchema: # re-index columns as the name changed, if name space was reduced then # some columns now clash with each other. so make sure that we merge columns that are already there if new_col_name in new_columns: - new_columns[new_col_name] = utils.merge_columns(new_columns[new_col_name], c, merge_defaults=False) + new_columns[new_col_name] = utils.merge_columns( + new_columns[new_col_name], c, merge_defaults=False + ) else: new_columns[new_col_name] = c table["columns"] = new_columns return table - def get_new_table_columns(self, table_name: str, exiting_columns: TTableSchemaColumns, include_incomplete: bool = False) -> List[TColumnSchema]: + def get_new_table_columns( + self, + table_name: str, + exiting_columns: TTableSchemaColumns, + include_incomplete: bool = False, + ) -> List[TColumnSchema]: """Gets new columns to be added to `exiting_columns` to bring them up to date with `table_name` schema. 
Optionally includes incomplete columns (without data type)""" diff_c: List[TColumnSchema] = [] s_t = self.get_table_columns(table_name, include_incomplete=include_incomplete) @@ -431,27 +522,46 @@ def get_new_table_columns(self, table_name: str, exiting_columns: TTableSchemaCo def get_table(self, table_name: str) -> TTableSchema: return self._schema_tables[table_name] - def get_table_columns(self, table_name: str, include_incomplete: bool = False) -> TTableSchemaColumns: - """Gets columns of `table_name`. Optionally includes incomplete columns """ + def get_table_columns( + self, table_name: str, include_incomplete: bool = False + ) -> TTableSchemaColumns: + """Gets columns of `table_name`. Optionally includes incomplete columns""" if include_incomplete: return self._schema_tables[table_name]["columns"] else: - return {k:v for k, v in self._schema_tables[table_name]["columns"].items() if utils.is_complete_column(v)} + return { + k: v + for k, v in self._schema_tables[table_name]["columns"].items() + if utils.is_complete_column(v) + } def data_tables(self, include_incomplete: bool = False) -> List[TTableSchema]: """Gets list of all tables, that hold the loaded data. Excludes dlt tables. Excludes incomplete tables (ie. without columns)""" - return [t for t in self._schema_tables.values() if not t["name"].startswith(self._dlt_tables_prefix) and (len(t["columns"]) > 0 or include_incomplete)] + return [ + t + for t in self._schema_tables.values() + if not t["name"].startswith(self._dlt_tables_prefix) + and (len(t["columns"]) > 0 or include_incomplete) + ] def dlt_tables(self) -> List[TTableSchema]: """Gets dlt tables""" - return [t for t in self._schema_tables.values() if t["name"].startswith(self._dlt_tables_prefix)] + return [ + t for t in self._schema_tables.values() if t["name"].startswith(self._dlt_tables_prefix) + ] def get_preferred_type(self, col_name: str) -> Optional[TDataType]: return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None) def is_new_table(self, table_name: str) -> bool: """Returns true if this table does not exist OR is incomplete (has only incomplete columns) and therefore new""" - return (table_name not in self.tables) or (not [c for c in self.tables[table_name]["columns"].values() if utils.is_complete_column(c)]) + return (table_name not in self.tables) or ( + not [ + c + for c in self.tables[table_name]["columns"].values() + if utils.is_complete_column(c) + ] + ) @property def version(self) -> int: @@ -543,11 +653,13 @@ def remove_type_detection(self, detection: TTypeDetections) -> None: self.settings["detections"].remove(detection) self._compile_settings() - def _infer_column(self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False) -> TColumnSchema: - column_schema = TColumnSchema( + def _infer_column( + self, k: str, v: Any, data_type: TDataType = None, is_variant: bool = False + ) -> TColumnSchema: + column_schema = TColumnSchema( name=k, data_type=data_type or self._infer_column_type(v, k), - nullable=not self._infer_hint("not_null", v, k) + nullable=not self._infer_hint("not_null", v, k), ) for hint in COLUMN_HINTS: column_prop = utils.hint_to_column_prop(hint) @@ -559,14 +671,23 @@ def _infer_column(self, k: str, v: Any, data_type: TDataType = None, is_variant: column_schema["variant"] = is_variant return column_schema - def _coerce_null_value(self, table_columns: TTableSchemaColumns, table_name: str, col_name: str) -> None: + def _coerce_null_value( + self, table_columns: TTableSchemaColumns, table_name: 
str, col_name: str + ) -> None: """Raises when column is explicitly not nullable""" if col_name in table_columns: existing_column = table_columns[col_name] if not existing_column.get("nullable", True): raise CannotCoerceNullException(table_name, col_name) - def _coerce_non_null_value(self, table_columns: TTableSchemaColumns, table_name: str, col_name: str, v: Any, is_variant: bool = False) -> Tuple[str, TColumnSchema, Any]: + def _coerce_non_null_value( + self, + table_columns: TTableSchemaColumns, + table_name: str, + col_name: str, + v: Any, + is_variant: bool = False, + ) -> Tuple[str, TColumnSchema, Any]: new_column: TColumnSchema = None existing_column = table_columns.get(col_name) # if column exist but is incomplete then keep it as new column @@ -575,7 +696,11 @@ def _coerce_non_null_value(self, table_columns: TTableSchemaColumns, table_name: existing_column = None # infer type or get it from existing table - col_type = existing_column["data_type"] if existing_column else self._infer_column_type(v, col_name, skip_preferred=is_variant) + col_type = ( + existing_column["data_type"] + if existing_column + else self._infer_column_type(v, col_name, skip_preferred=is_variant) + ) # get data type of value py_type = py_type_to_sc_type(type(v)) # and coerce type if inference changed the python type @@ -584,12 +709,18 @@ def _coerce_non_null_value(self, table_columns: TTableSchemaColumns, table_name: except (ValueError, SyntaxError): if is_variant: # this is final call: we cannot generate any more auto-variants - raise CannotCoerceColumnException(table_name, col_name, py_type, table_columns[col_name]["data_type"], v) + raise CannotCoerceColumnException( + table_name, col_name, py_type, table_columns[col_name]["data_type"], v + ) # otherwise we must create variant extension to the table # pass final=True so no more auto-variants can be created recursively # TODO: generate callback so dlt user can decide what to do - variant_col_name = self.naming.shorten_fragments(col_name, VARIANT_FIELD_FORMAT % py_type) - return self._coerce_non_null_value(table_columns, table_name, variant_col_name, v, is_variant=True) + variant_col_name = self.naming.shorten_fragments( + col_name, VARIANT_FIELD_FORMAT % py_type + ) + return self._coerce_non_null_value( + table_columns, table_name, variant_col_name, v, is_variant=True + ) # if coerced value is variant, then extract variant value # note: checking runtime protocols with isinstance(coerced_v, SupportsVariant): is extremely slow so we check if callable as every variant is callable @@ -597,11 +728,17 @@ def _coerce_non_null_value(self, table_columns: TTableSchemaColumns, table_name: coerced_v = coerced_v() if isinstance(coerced_v, tuple): # variant recovered so call recursively with variant column name and variant value - variant_col_name = self.naming.shorten_fragments(col_name, VARIANT_FIELD_FORMAT % coerced_v[0]) - return self._coerce_non_null_value(table_columns, table_name, variant_col_name, coerced_v[1], is_variant=True) + variant_col_name = self.naming.shorten_fragments( + col_name, VARIANT_FIELD_FORMAT % coerced_v[0] + ) + return self._coerce_non_null_value( + table_columns, table_name, variant_col_name, coerced_v[1], is_variant=True + ) if not existing_column: - inferred_column = self._infer_column(col_name, v, data_type=col_type, is_variant=is_variant) + inferred_column = self._infer_column( + col_name, v, data_type=col_type, is_variant=is_variant + ) # if there's incomplete new_column then merge it with inferred column if new_column: # use all values 
present in incomplete column to override inferred column - also the defaults @@ -631,8 +768,12 @@ def _infer_hint(self, hint_type: TColumnHint, _: Any, col_name: str) -> bool: return False def _add_standard_tables(self) -> None: - self._schema_tables[self.version_table_name] = self.normalize_table_identifiers(utils.version_table()) - self._schema_tables[self.loads_table_name] = self.normalize_table_identifiers(utils.load_table()) + self._schema_tables[self.version_table_name] = self.normalize_table_identifiers( + utils.version_table() + ) + self._schema_tables[self.loads_table_name] = self.normalize_table_identifiers( + utils.load_table() + ) def _add_standard_hints(self) -> None: default_hints = utils.standard_hints() @@ -644,14 +785,16 @@ def _add_standard_hints(self) -> None: def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: # import desired modules - self._normalizers_config, naming_module, item_normalizer_class = import_normalizers(normalizers) + self._normalizers_config, naming_module, item_normalizer_class = import_normalizers( + normalizers + ) # print(f"{self.name}: {type(self.naming)} {type(naming_module)}") if self.naming and type(self.naming) is not type(naming_module): self.naming = naming_module for table in self._schema_tables.values(): self.normalize_table_identifiers(table) # re-index the table names - self._schema_tables = {t["name"]:t for t in self._schema_tables.values()} + self._schema_tables = {t["name"]: t for t in self._schema_tables.values()} # name normalization functions self.naming = naming_module @@ -730,9 +873,13 @@ def _compile_settings(self) -> None: for table in self._schema_tables.values(): if "filters" in table: if "excludes" in table["filters"]: - self._compiled_excludes[table["name"]] = list(map(utils.compile_simple_regex, table["filters"]["excludes"])) + self._compiled_excludes[table["name"]] = list( + map(utils.compile_simple_regex, table["filters"]["excludes"]) + ) if "includes" in table["filters"]: - self._compiled_includes[table["name"]] = list(map(utils.compile_simple_regex, table["filters"]["includes"])) + self._compiled_includes[table["name"]] = list( + map(utils.compile_simple_regex, table["filters"]["includes"]) + ) # look for auto-detections in settings and then normalizer self._type_detections = self._settings.get("detections") or self._normalizers_config.get("detections") or [] # type: ignore diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 1b6ef31800..9a27cbe4bb 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -1,4 +1,18 @@ -from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Set, Type, TypedDict, NewType, Union, get_args +from typing import ( + Any, + Callable, + Dict, + List, + Literal, + Optional, + Sequence, + Set, + Type, + TypedDict, + NewType, + Union, + get_args, +) from typing_extensions import Never from dlt.common.data_types import TDataType @@ -19,20 +33,55 @@ STATE_TABLE_NAME = "_dlt_pipeline_state" DLT_NAME_PREFIX = "_dlt" -TColumnProp = Literal["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique", "merge_key", "root_key"] +TColumnProp = Literal[ + "name", + "data_type", + "nullable", + "partition", + "cluster", + "primary_key", + "foreign_key", + "sort", + "unique", + "merge_key", + "root_key", +] """Known properties and hints of the column""" # TODO: merge TColumnHint with TColumnProp -TColumnHint = Literal["not_null", "partition", "cluster", "primary_key", 
"foreign_key", "sort", "unique", "root_key", "merge_key"] +TColumnHint = Literal[ + "not_null", + "partition", + "cluster", + "primary_key", + "foreign_key", + "sort", + "unique", + "root_key", + "merge_key", +] """Known hints of a column used to declare hint regexes.""" TWriteDisposition = Literal["skip", "append", "replace", "merge"] TTableFormat = Literal["iceberg"] -TTypeDetections = Literal["timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double"] +TTypeDetections = Literal[ + "timestamp", "iso_timestamp", "iso_date", "large_integer", "hexbytes_to_text", "wei_to_double" +] TTypeDetectionFunc = Callable[[Type[Any], Any], Optional[TDataType]] TColumnNames = Union[str, Sequence[str]] """A string representing a column name or a list of""" COLUMN_PROPS: Set[TColumnProp] = set(get_args(TColumnProp)) -COLUMN_HINTS: Set[TColumnHint] = set(["partition", "cluster", "primary_key", "foreign_key", "sort", "unique", "merge_key", "root_key"]) +COLUMN_HINTS: Set[TColumnHint] = set( + [ + "partition", + "cluster", + "primary_key", + "foreign_key", + "sort", + "unique", + "merge_key", + "root_key", + ] +) WRITE_DISPOSITIONS: Set[TWriteDisposition] = set(get_args(TWriteDisposition)) @@ -44,12 +93,14 @@ class TColumnType(TypedDict, total=False): class TColumnSchemaBase(TColumnType, total=False): """TypedDict that defines basic properties of a column: name, data type and nullable""" + name: Optional[str] nullable: Optional[bool] class TColumnSchema(TColumnSchemaBase, total=False): """TypedDict that defines additional column hints""" + description: Optional[str] partition: Optional[bool] cluster: Optional[bool] @@ -66,7 +117,9 @@ class TColumnSchema(TColumnSchemaBase, total=False): """A mapping from column name to column schema, typically part of a table schema""" -TAnySchemaColumns = Union[TTableSchemaColumns, Sequence[TColumnSchema], _PydanticBaseModel, Type[_PydanticBaseModel]] +TAnySchemaColumns = Union[ + TTableSchemaColumns, Sequence[TColumnSchema], _PydanticBaseModel, Type[_PydanticBaseModel] +] TSimpleRegex = NewType("TSimpleRegex", str) TColumnName = NewType("TColumnName", str) @@ -75,25 +128,33 @@ class TColumnSchema(TColumnSchemaBase, total=False): TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] TSchemaContractEntities = Literal["tables", "columns", "data_type"] + class TSchemaContractDict(TypedDict, total=False): """TypedDict defining the schema update settings""" + tables: Optional[TSchemaEvolutionMode] columns: Optional[TSchemaEvolutionMode] data_type: Optional[TSchemaEvolutionMode] + TSchemaContract = Union[TSchemaEvolutionMode, TSchemaContractDict] + class TRowFilters(TypedDict, total=True): excludes: Optional[List[TSimpleRegex]] includes: Optional[List[TSimpleRegex]] + class NormalizerInfo(TypedDict, total=True): new_table: bool + # TypedDict that defines properties of a table + class TTableSchema(TypedDict, total=False): """TypedDict that defines properties of a table""" + name: Optional[str] description: Optional[str] write_disposition: Optional[TWriteDisposition] @@ -105,9 +166,11 @@ class TTableSchema(TypedDict, total=False): resource: Optional[str] table_format: Optional[TTableFormat] + class TPartialTableSchema(TTableSchema): pass + TSchemaTables = Dict[str, TTableSchema] TSchemaUpdate = Dict[str, List[TPartialTableSchema]] @@ -121,6 +184,7 @@ class TSchemaSettings(TypedDict, total=False): class TStoredSchema(TypedDict, total=False): """TypeDict defining the schema representation in storage""" + version: int 
version_hash: str previous_hashes: List[str] @@ -131,4 +195,3 @@ class TStoredSchema(TypedDict, total=False): settings: Optional[TSchemaSettings] tables: TSchemaTables normalizers: TNormalizersConfig - diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index b6a3cca0e2..3e649cbf12 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -14,11 +14,40 @@ from dlt.common.typing import DictStrAny, REPattern, is_dict_generic_type from dlt.common.validation import TCustomValidator, validate_dict, validate_dict_ignoring_xkeys from dlt.common.schema import detections -from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, - TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, TTableFormat, - TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaContract, TSchemaContractDict) -from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, - TablePropertiesConflictException, InvalidSchemaName, UnknownTableException) +from dlt.common.schema.typing import ( + COLUMN_HINTS, + SCHEMA_ENGINE_VERSION, + LOADS_TABLE_NAME, + SIMPLE_REGEX_PREFIX, + VERSION_TABLE_NAME, + TColumnName, + TPartialTableSchema, + TSchemaTables, + TSchemaUpdate, + TSimpleRegex, + TStoredSchema, + TTableSchema, + TTableSchemaColumns, + TColumnSchemaBase, + TColumnSchema, + TColumnProp, + TTableFormat, + TColumnHint, + TTypeDetectionFunc, + TTypeDetections, + TWriteDisposition, + TSchemaContract, + TSchemaContractDict, +) +from dlt.common.schema.exceptions import ( + CannotCoerceColumnException, + ParentTableNotFoundException, + SchemaEngineNoUpgradePathException, + SchemaException, + TablePropertiesConflictException, + InvalidSchemaName, + UnknownTableException, +) from dlt.common.normalizers.utils import import_normalizers from dlt.common.schema.typing import TAnySchemaColumns @@ -30,7 +59,11 @@ def is_valid_schema_name(name: str) -> bool: """Schema name must be a valid python identifier and have max len of 64""" - return name is not None and name.isidentifier() and len(name) <= InvalidSchemaName.MAXIMUM_SCHEMA_NAME_LENGTH + return ( + name is not None + and name.isidentifier() + and len(name) <= InvalidSchemaName.MAXIMUM_SCHEMA_NAME_LENGTH + ) def normalize_schema_name(name: str) -> str: @@ -42,7 +75,7 @@ def normalize_schema_name(name: str) -> str: def apply_defaults(stored_schema: TStoredSchema) -> TStoredSchema: """Applies default hint values to `stored_schema` in place - Updates only complete column hints, incomplete columns are preserved intact + Updates only complete column hints, incomplete columns are preserved intact """ for table_name, table in stored_schema["tables"].items(): # overwrite name @@ -51,8 +84,8 @@ def apply_defaults(stored_schema: TStoredSchema) -> TStoredSchema: if table.get("parent") is None: if table.get("write_disposition") is None: table["write_disposition"] = DEFAULT_WRITE_DISPOSITION - if table.get('resource') is None: - table['resource'] = table_name + if table.get("resource") is None: + table["resource"] = table_name for column_name in table["columns"]: # add default hints to tables column = table["columns"][column_name] @@ -66,13 +99,13 @@ def apply_defaults(stored_schema: TStoredSchema) -> TStoredSchema: def remove_defaults(stored_schema: 
TStoredSchema) -> TStoredSchema: """Removes default values from `stored_schema` in place, returns the input for chaining - Default values are removed from table schemas and complete column schemas. Incomplete columns are preserved intact. + Default values are removed from table schemas and complete column schemas. Incomplete columns are preserved intact. """ clean_tables = deepcopy(stored_schema["tables"]) for table_name, t in clean_tables.items(): del t["name"] - if t.get('resource') == table_name: - del t['resource'] + if t.get("resource") == table_name: + del t["resource"] for c in t["columns"].values(): # remove defaults only on complete columns # if is_complete_column(c): @@ -110,19 +143,20 @@ def remove_column_defaults(column_schema: TColumnSchema) -> TColumnSchema: def add_column_defaults(column: TColumnSchemaBase) -> TColumnSchema: """Adds default boolean hints to column""" return { - **{ - "nullable": True, - "partition": False, - "cluster": False, - "unique": False, - "sort": False, - "primary_key": False, - "foreign_key": False, - "root_key": False, - "merge_key": False - }, - **column - } + **{ + "nullable": True, + "partition": False, + "cluster": False, + "unique": False, + "sort": False, + "primary_key": False, + "foreign_key": False, + "root_key": False, + "merge_key": False, + }, + **column, + } + # def add_complete_column_defaults(column: TColumnSchemaBase) -> TColumnSchema: # """Adds default hints to `column` if it is completed, otherwise preserves `column` content intact @@ -171,10 +205,12 @@ def generate_version_hash(stored_schema: TStoredSchema) -> str: # add column names to hash in order for cn in (t.get("columns") or {}).keys(): h.update(cn.encode("utf-8")) - return base64.b64encode(h.digest()).decode('ascii') + return base64.b64encode(h.digest()).decode("ascii") -def verify_schema_hash(loaded_schema_dict: DictStrAny, verifies_if_not_migrated: bool = False) -> bool: +def verify_schema_hash( + loaded_schema_dict: DictStrAny, verifies_if_not_migrated: bool = False +) -> bool: # generates content hash and compares with existing engine_version: str = loaded_schema_dict.get("engine_version") # if upgrade is needed, the hash cannot be compared @@ -190,16 +226,32 @@ def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: # custom validator on type TSimpleRegex if t is TSimpleRegex: if not isinstance(pv, str): - raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while str is expected", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while str" + " is expected", + path, + pk, + pv, + ) if pv.startswith(SIMPLE_REGEX_PREFIX): # check if regex try: re.compile(pv[3:]) except Exception as e: - raise DictValidationException(f"In {path}: field {pk} value {pv[3:]} does not compile as regex: {str(e)}", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv[3:]} does not compile as regex: {str(e)}", + path, + pk, + pv, + ) else: if RE_NON_ALPHANUMERIC_UNDERSCORE.match(pv): - raise DictValidationException(f"In {path}: field {pk} value {pv} looks like a regex, please prefix with re:", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} looks like a regex, please prefix with re:", + path, + pk, + pv, + ) # we know how to validate that type return True else: @@ -208,16 +260,25 @@ def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: def column_name_validator(naming: 
NamingConvention) -> TCustomValidator: - def validator(path: str, pk: str, pv: Any, t: Any) -> bool: if t is TColumnName: if not isinstance(pv, str): - raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while str is expected", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while" + " str is expected", + path, + pk, + pv, + ) try: if naming.normalize_path(pv) != pv: - raise DictValidationException(f"In {path}: field {pk}: {pv} is not a valid column name", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk}: {pv} is not a valid column name", path, pk, pv + ) except ValueError: - raise DictValidationException(f"In {path}: field {pk}: {pv} is not a valid column name", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk}: {pv} is not a valid column name", path, pk, pv + ) return True else: return False @@ -239,20 +300,16 @@ def compile_simple_regex(r: TSimpleRegex) -> REPattern: def compile_simple_regexes(r: Iterable[TSimpleRegex]) -> REPattern: """Compile multiple patterns as one""" - pattern = '|'.join(f"({_prepare_simple_regex(p)})" for p in r) + pattern = "|".join(f"({_prepare_simple_regex(p)})" for p in r) if not pattern: # Don't create an empty pattern that matches everything raise ValueError("Cannot create a regex pattern from empty sequence") return re.compile(pattern) def validate_stored_schema(stored_schema: TStoredSchema) -> None: - # use lambda to verify only non extra fields validate_dict_ignoring_xkeys( - spec=TStoredSchema, - doc=stored_schema, - path=".", - validator_f=simple_regex_validator + spec=TStoredSchema, doc=stored_schema, path=".", validator_f=simple_regex_validator ) # check child parent relationships for table_name, table in stored_schema["tables"].items(): @@ -263,7 +320,6 @@ def validate_stored_schema(stored_schema: TStoredSchema) -> None: def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> TStoredSchema: - if from_engine == to_engine: return cast(TStoredSchema, schema_dict) @@ -277,12 +333,8 @@ def migrate_schema(schema_dict: DictStrAny, from_engine: int, to_engine: int) -> # add default normalizers and root hash propagation current["normalizers"], _, _ = import_normalizers(explicit_normalizers()) current["normalizers"]["json"]["config"] = { - "propagation": { - "root": { - "_dlt_id": "_dlt_root_id" - } - } - } + "propagation": {"root": {"_dlt_id": "_dlt_root_id"}} + } # move settings, convert strings to simple regexes d_h: Dict[TColumnHint, List[TSimpleRegex]] = schema_dict.pop("hints", {}) for h_k, h_l in d_h.items(): @@ -319,8 +371,8 @@ def migrate_filters(group: str, filters: List[str]) -> None: # existing filter were always defined at the root table. 
find this table and move filters for f in filters: # skip initial ^ - root = f[1:f.find("__")] - path = f[f.find("__") + 2:] + root = f[1 : f.find("__")] + path = f[f.find("__") + 2 :] t = current["tables"].get(root) if t is None: # must add new table to hold filters @@ -363,7 +415,9 @@ def migrate_filters(group: str, filters: List[str]) -> None: schema_dict["engine_version"] = from_engine if from_engine != to_engine: - raise SchemaEngineNoUpgradePathException(schema_dict["name"], schema_dict["engine_version"], from_engine, to_engine) + raise SchemaEngineNoUpgradePathException( + schema_dict["name"], schema_dict["engine_version"], from_engine, to_engine + ) return cast(TStoredSchema, schema_dict) @@ -391,10 +445,12 @@ def compare_complete_columns(a: TColumnSchema, b: TColumnSchema) -> bool: return a["data_type"] == b["data_type"] and a["name"] == b["name"] -def merge_columns(col_a: TColumnSchema, col_b: TColumnSchema, merge_defaults: bool = True) -> TColumnSchema: +def merge_columns( + col_a: TColumnSchema, col_b: TColumnSchema, merge_defaults: bool = True +) -> TColumnSchema: """Merges `col_b` into `col_a`. if `merge_defaults` is True, only hints from `col_b` that are not default in `col_a` will be set. - Modifies col_a in place and returns it + Modifies col_a in place and returns it """ col_b_clean = col_b if merge_defaults else remove_column_defaults(copy(col_b)) for n, v in col_b_clean.items(): @@ -416,7 +472,9 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl table_name = tab_a["name"] # check if table properties can be merged if tab_a.get("parent") != tab_b.get("parent"): - raise TablePropertiesConflictException(table_name, "parent", tab_a.get("parent"), tab_b.get("parent")) + raise TablePropertiesConflictException( + table_name, "parent", tab_a.get("parent"), tab_b.get("parent") + ) # get new columns, changes in the column data type or other properties are not allowed tab_a_columns = tab_a["columns"] @@ -428,7 +486,13 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl if is_complete_column(col_a) and is_complete_column(col_b): if not compare_complete_columns(tab_a_columns[col_b_name], col_b): # attempt to update to incompatible columns - raise CannotCoerceColumnException(table_name, col_b_name, col_b["data_type"], tab_a_columns[col_b_name]["data_type"], None) + raise CannotCoerceColumnException( + table_name, + col_b_name, + col_b["data_type"], + tab_a_columns[col_b_name]["data_type"], + None, + ) # all other properties can change merged_column = merge_columns(copy(col_a), col_b) if merged_column != col_a: @@ -439,7 +503,7 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl # return partial table containing only name and properties that differ (column, filters etc.) 
partial_table: TPartialTableSchema = { "name": table_name, - "columns": {} if new_columns is None else {c["name"]: c for c in new_columns} + "columns": {} if new_columns is None else {c["name"]: c for c in new_columns}, } for k, v in tab_b.items(): if k in ["columns", None]: @@ -449,13 +513,14 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl partial_table[k] = v # type: ignore # this should not really happen - if tab_a.get('parent') is not None and (resource := tab_b.get('resource')): - raise TablePropertiesConflictException(table_name, "resource", resource, tab_a.get('parent')) + if tab_a.get("parent") is not None and (resource := tab_b.get("resource")): + raise TablePropertiesConflictException( + table_name, "resource", resource, tab_a.get("parent") + ) return partial_table - # def compare_tables(tab_a: TTableSchema, tab_b: TTableSchema) -> bool: # try: # table_name = tab_a["name"] @@ -471,14 +536,16 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl def merge_tables(table: TTableSchema, partial_table: TPartialTableSchema) -> TPartialTableSchema: """Merges "partial_table" into "table". `table` is merged in place. Returns the diff partial table. - `table` and `partial_table` names must be identical. A table diff is generated and applied to `table`: - * new columns are added, updated columns are replaced from diff - * table hints are added or replaced from diff - * nothing gets deleted + `table` and `partial_table` names must be identical. A table diff is generated and applied to `table`: + * new columns are added, updated columns are replaced from diff + * table hints are added or replaced from diff + * nothing gets deleted """ if table["name"] != partial_table["name"]: - raise TablePropertiesConflictException(table["name"], "name", table["name"], partial_table["name"]) + raise TablePropertiesConflictException( + table["name"], "name", table["name"], partial_table["name"] + ) diff_table = diff_tables(table, partial_table) # add new columns when all checks passed table["columns"].update(diff_table["columns"]) @@ -495,10 +562,16 @@ def hint_to_column_prop(h: TColumnHint) -> TColumnProp: return h -def get_columns_names_with_prop(table: TTableSchema, column_prop: Union[TColumnProp, str], include_incomplete: bool = False) -> List[str]: +def get_columns_names_with_prop( + table: TTableSchema, column_prop: Union[TColumnProp, str], include_incomplete: bool = False +) -> List[str]: # column_prop: TColumnProp = hint_to_column_prop(hint_type) # default = column_prop != "nullable" # default is true, only for nullable false - return [c["name"] for c in table["columns"].values() if bool(c.get(column_prop, False)) is True and (include_incomplete or is_complete_column(c))] + return [ + c["name"] + for c in table["columns"].values() + if bool(c.get(column_prop, False)) is True and (include_incomplete or is_complete_column(c)) + ] def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTables: @@ -512,7 +585,9 @@ def merge_schema_updates(schema_updates: Sequence[TSchemaUpdate]) -> TSchemaTabl return aggregated_update -def get_inherited_table_hint(tables: TSchemaTables, table_name: str, table_hint_name: str, allow_none: bool = False) -> Any: +def get_inherited_table_hint( + tables: TSchemaTables, table_name: str, table_hint_name: str, allow_none: bool = False +) -> Any: table = tables.get(table_name, {}) hint = table.get(table_hint_name) if hint: @@ -525,16 +600,23 @@ def get_inherited_table_hint(tables: TSchemaTables, 
table_name: str, table_hint_ if allow_none: return None - raise ValueError(f"No table hint '{table_hint_name} found in the chain of tables for '{table_name}'.") + raise ValueError( + f"No table hint '{table_hint_name} found in the chain of tables for '{table_name}'." + ) def get_write_disposition(tables: TSchemaTables, table_name: str) -> TWriteDisposition: """Returns table hint of a table if present. If not, looks up into parent table""" - return cast(TWriteDisposition, get_inherited_table_hint(tables, table_name, "write_disposition", allow_none=False)) + return cast( + TWriteDisposition, + get_inherited_table_hint(tables, table_name, "write_disposition", allow_none=False), + ) def get_table_format(tables: TSchemaTables, table_name: str) -> TTableFormat: - return cast(TTableFormat, get_inherited_table_hint(tables, table_name, "table_format", allow_none=True)) + return cast( + TTableFormat, get_inherited_table_hint(tables, table_name, "table_format", allow_none=True) + ) def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool: @@ -544,7 +626,10 @@ def table_schema_has_type(table: TTableSchema, _typ: TDataType) -> bool: def table_schema_has_type_with_precision(table: TTableSchema, _typ: TDataType) -> bool: """Checks if `table` schema contains column with type _typ and precision set""" - return any(c.get("data_type") == _typ and c.get("precision") is not None for c in table["columns"].values()) + return any( + c.get("data_type") == _typ and c.get("precision") is not None + for c in table["columns"].values() + ) def get_top_level_table(tables: TSchemaTables, table_name: str) -> TTableSchema: @@ -571,54 +656,38 @@ def _child(t: TTableSchema) -> None: return chain -def group_tables_by_resource(tables: TSchemaTables, pattern: Optional[REPattern] = None) -> Dict[str, List[TTableSchema]]: +def group_tables_by_resource( + tables: TSchemaTables, pattern: Optional[REPattern] = None +) -> Dict[str, List[TTableSchema]]: """Create a dict of resources and their associated tables and descendant tables If `pattern` is supplied, the result is filtered to only resource names matching the pattern. 
""" result: Dict[str, List[TTableSchema]] = {} for table in tables.values(): - resource = table.get('resource') + resource = table.get("resource") if resource and (pattern is None or pattern.match(resource)): resource_tables = result.setdefault(resource, []) - resource_tables.extend(get_child_tables(tables, table['name'])) + resource_tables.extend(get_child_tables(tables, table["name"])) return result def version_table() -> TTableSchema: # NOTE: always add new columns at the end of the table so we have identical layout # after an update of existing tables (always at the end) - table = new_table(VERSION_TABLE_NAME, columns=[ + table = new_table( + VERSION_TABLE_NAME, + columns=[ { "name": "version", "data_type": "bigint", "nullable": False, }, - { - "name": "engine_version", - "data_type": "bigint", - "nullable": False - }, - { - "name": "inserted_at", - "data_type": "timestamp", - "nullable": False - }, - { - "name": "schema_name", - "data_type": "text", - "nullable": False - }, - { - "name": "version_hash", - "data_type": "text", - "nullable": False - }, - { - "name": "schema", - "data_type": "text", - "nullable": False - } - ] + {"name": "engine_version", "data_type": "bigint", "nullable": False}, + {"name": "inserted_at", "data_type": "timestamp", "nullable": False}, + {"name": "schema_name", "data_type": "text", "nullable": False}, + {"name": "version_hash", "data_type": "text", "nullable": False}, + {"name": "schema", "data_type": "text", "nullable": False}, + ], ) table["write_disposition"] = "skip" table["description"] = "Created by DLT. Tracks schema updates" @@ -628,33 +697,19 @@ def version_table() -> TTableSchema: def load_table() -> TTableSchema: # NOTE: always add new columns at the end of the table so we have identical layout # after an update of existing tables (always at the end) - table = new_table(LOADS_TABLE_NAME, columns=[ - { - "name": "load_id", - "data_type": "text", - "nullable": False - }, - { - "name": "schema_name", - "data_type": "text", - "nullable": True - }, - { - "name": "status", - "data_type": "bigint", - "nullable": False - }, - { - "name": "inserted_at", - "data_type": "timestamp", - "nullable": False - }, + table = new_table( + LOADS_TABLE_NAME, + columns=[ + {"name": "load_id", "data_type": "text", "nullable": False}, + {"name": "schema_name", "data_type": "text", "nullable": True}, + {"name": "status", "data_type": "bigint", "nullable": False}, + {"name": "inserted_at", "data_type": "timestamp", "nullable": False}, { "name": "schema_version_hash", "data_type": "text", "nullable": True, }, - ] + ], ) table["write_disposition"] = "skip" table["description"] = "Created by DLT. 
Tracks completed loads" @@ -669,12 +724,11 @@ def new_table( validate_schema: bool = False, resource: str = None, schema_contract: TSchemaContract = None, - table_format: TTableFormat = None + table_format: TTableFormat = None, ) -> TTableSchema: - table: TTableSchema = { "name": table_name, - "columns": {} if columns is None else {c["name"]: c for c in columns} + "columns": {} if columns is None else {c["name"]: c for c in columns}, } if parent_table_name: table["parent"] = parent_table_name @@ -699,11 +753,13 @@ def new_table( return table -def new_column(column_name: str, data_type: TDataType = None, nullable: bool = True, validate_schema: bool = False) -> TColumnSchema: - column: TColumnSchema = { - "name": column_name, - "nullable": nullable - } +def new_column( + column_name: str, + data_type: TDataType = None, + nullable: bool = True, + validate_schema: bool = False, +) -> TColumnSchema: + column: TColumnSchema = {"name": column_name, "nullable": nullable} if data_type: column["data_type"] = data_type if validate_schema: @@ -715,6 +771,7 @@ def new_column(column_name: str, data_type: TDataType = None, nullable: bool = T return column + def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: return None diff --git a/dlt/common/source.py b/dlt/common/source.py index a75c2dd948..249d54b4c5 100644 --- a/dlt/common/source.py +++ b/dlt/common/source.py @@ -10,6 +10,7 @@ class SourceInfo(NamedTuple): """Runtime information on the source/resource""" + SPEC: Type[BaseConfiguration] f: AnyFun module: ModuleType @@ -44,4 +45,4 @@ def _get_source_for_inner_function(f: AnyFun) -> Optional[SourceInfo]: # find source function parts = get_callable_name(f, "__qualname__").split(".") parent_fun = ".".join(parts[:-2]) - return _SOURCES.get(parent_fun) \ No newline at end of file + return _SOURCES.get(parent_fun) diff --git a/dlt/common/storages/__init__.py b/dlt/common/storages/__init__.py index c18b8ab04d..9beb3c0597 100644 --- a/dlt/common/storages/__init__.py +++ b/dlt/common/storages/__init__.py @@ -5,7 +5,13 @@ from .normalize_storage import NormalizeStorage from .load_storage import LoadStorage from .data_item_storage import DataItemStorage -from .configuration import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration, TSchemaFileFormat, FilesystemConfiguration +from .configuration import ( + LoadStorageConfiguration, + NormalizeStorageConfiguration, + SchemaStorageConfiguration, + TSchemaFileFormat, + FilesystemConfiguration, +) from .fsspec_filesystem import fsspec_from_config, fsspec_filesystem @@ -17,6 +23,11 @@ "NormalizeStorage", "LoadStorage", "DataItemStorage", - "LoadStorageConfiguration", "NormalizeStorageConfiguration", "SchemaStorageConfiguration", "TSchemaFileFormat", "FilesystemConfiguration", - "fsspec_from_config", "fsspec_filesystem", + "LoadStorageConfiguration", + "NormalizeStorageConfiguration", + "SchemaStorageConfiguration", + "TSchemaFileFormat", + "FilesystemConfiguration", + "fsspec_from_config", + "fsspec_filesystem", ] diff --git a/dlt/common/storages/configuration.py b/dlt/common/storages/configuration.py index 699465ce4a..83e7e88189 100644 --- a/dlt/common/storages/configuration.py +++ b/dlt/common/storages/configuration.py @@ -4,7 +4,14 @@ from dlt.common.configuration.specs import BaseConfiguration, configspec, CredentialsConfiguration from dlt.common.configuration import configspec, resolve_type -from dlt.common.configuration.specs import GcpServiceAccountCredentials, AwsCredentials, GcpOAuthCredentials, AzureCredentials, 
AzureCredentialsWithoutDefaults, BaseConfiguration +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentials, + AwsCredentials, + GcpOAuthCredentials, + AzureCredentials, + AzureCredentialsWithoutDefaults, + BaseConfiguration, +) from dlt.common.utils import digest128 from dlt.common.configuration.exceptions import ConfigurationValueError @@ -18,11 +25,18 @@ class SchemaStorageConfiguration(BaseConfiguration): import_schema_path: Optional[str] = None # path from which to import a schema into storage export_schema_path: Optional[str] = None # path to which export schema from storage external_schema_format: TSchemaFileFormat = "yaml" # format in which to expect external schema - external_schema_format_remove_defaults: bool = True # remove default values when exporting schema + external_schema_format_remove_defaults: bool = ( + True # remove default values when exporting schema + ) if TYPE_CHECKING: - def __init__(self, schema_volume_path: str = None, import_schema_path: str = None, export_schema_path: str = None) -> None: - ... + + def __init__( + self, + schema_volume_path: str = None, + import_schema_path: str = None, + export_schema_path: str = None, + ) -> None: ... @configspec @@ -30,33 +44,43 @@ class NormalizeStorageConfiguration(BaseConfiguration): normalize_volume_path: str = None # path to volume where normalized loader files will be stored if TYPE_CHECKING: - def __init__(self, normalize_volume_path: str = None) -> None: - ... + + def __init__(self, normalize_volume_path: str = None) -> None: ... @configspec class LoadStorageConfiguration(BaseConfiguration): - load_volume_path: str = None # path to volume where files to be loaded to analytical storage are stored - delete_completed_jobs: bool = False # if set to true the folder with completed jobs will be deleted + load_volume_path: str = ( + None # path to volume where files to be loaded to analytical storage are stored + ) + delete_completed_jobs: bool = ( + False # if set to true the folder with completed jobs will be deleted + ) if TYPE_CHECKING: - def __init__(self, load_volume_path: str = None, delete_completed_jobs: bool = None) -> None: - ... + + def __init__( + self, load_volume_path: str = None, delete_completed_jobs: bool = None + ) -> None: ... -FileSystemCredentials = Union[AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials] +FileSystemCredentials = Union[ + AwsCredentials, GcpServiceAccountCredentials, AzureCredentials, GcpOAuthCredentials +] + @configspec class FilesystemConfiguration(BaseConfiguration): """A configuration defining filesystem location and access credentials. - When configuration is resolved, `bucket_url` is used to extract a protocol and request corresponding credentials class. - * s3 - * gs, gcs - * az, abfs, adl - * file, memory - * gdrive + When configuration is resolved, `bucket_url` is used to extract a protocol and request corresponding credentials class. + * s3 + * gs, gcs + * az, abfs, adl + * file, memory + * gdrive """ + PROTOCOL_CREDENTIALS: ClassVar[Dict[str, Any]] = { "gs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], "gcs": Union[GcpServiceAccountCredentials, GcpOAuthCredentials], @@ -84,13 +108,16 @@ def protocol(self) -> str: def on_resolved(self) -> None: url = urlparse(self.bucket_url) if not url.path and not url.netloc: - raise ConfigurationValueError("File path or netloc missing. 
Field bucket_url of FilesystemClientConfiguration must contain valid url with a path or host:password component.") + raise ConfigurationValueError( + "File path or netloc missing. Field bucket_url of FilesystemClientConfiguration" + " must contain valid url with a path or host:password component." + ) # this is just a path in local file system if url.path == self.bucket_url: url = url._replace(scheme="file") self.bucket_url = url.geturl() - @resolve_type('credentials') + @resolve_type("credentials") def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: # use known credentials or empty credentials for unknown protocol return self.PROTOCOL_CREDENTIALS.get(self.protocol) or Optional[CredentialsConfiguration] # type: ignore[return-value] @@ -113,9 +140,5 @@ def __str__(self) -> str: return self.bucket_url if TYPE_CHECKING: - def __init__( - self, - bucket_url: str, - credentials: FileSystemCredentials = None - ) -> None: - ... + + def __init__(self, bucket_url: str, credentials: FileSystemCredentials = None) -> None: ... diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 6621f07e26..4fb115757b 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -13,7 +13,9 @@ def __init__(self, load_file_type: TLoaderFileFormat, *args: Any) -> None: self.buffered_writers: Dict[str, BufferedDataWriter[DataWriter]] = {} super().__init__(*args) - def get_writer(self, load_id: str, schema_name: str, table_name: str) -> BufferedDataWriter[DataWriter]: + def get_writer( + self, load_id: str, schema_name: str, table_name: str + ) -> BufferedDataWriter[DataWriter]: # unique writer id writer_id = f"{load_id}.{schema_name}.{table_name}" writer = self.buffered_writers.get(writer_id, None) @@ -24,12 +26,21 @@ def get_writer(self, load_id: str, schema_name: str, table_name: str) -> Buffere self.buffered_writers[writer_id] = writer return writer - def write_data_item(self, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> int: + def write_data_item( + self, + load_id: str, + schema_name: str, + table_name: str, + item: TDataItems, + columns: TTableSchemaColumns, + ) -> int: writer = self.get_writer(load_id, schema_name, table_name) # write item(s) return writer.write_data_item(item, columns) - def write_empty_file(self, load_id: str, schema_name: str, table_name: str, columns: TTableSchemaColumns) -> None: + def write_empty_file( + self, load_id: str, schema_name: str, table_name: str, columns: TTableSchemaColumns + ) -> None: writer = self.get_writer(load_id, schema_name, table_name) writer.write_empty_file(columns) @@ -37,7 +48,10 @@ def close_writers(self, extract_id: str) -> None: # flush and close all files for name, writer in self.buffered_writers.items(): if name.startswith(extract_id): - logger.debug(f"Closing writer for {name} with file {writer._file} and actual name {writer._file_name}") + logger.debug( + f"Closing writer for {name} with file {writer._file} and actual name" + f" {writer._file_name}" + ) writer.close() def closed_files(self) -> List[str]: diff --git a/dlt/common/storages/exceptions.py b/dlt/common/storages/exceptions.py index 3203191cd8..8683679a7f 100644 --- a/dlt/common/storages/exceptions.py +++ b/dlt/common/storages/exceptions.py @@ -11,20 +11,36 @@ def __init__(self, msg: str) -> None: class NoMigrationPathException(StorageException): - def __init__(self, storage_path: str, initial_version: semver.VersionInfo, 
migrated_version: semver.VersionInfo, target_version: semver.VersionInfo) -> None: + def __init__( + self, + storage_path: str, + initial_version: semver.VersionInfo, + migrated_version: semver.VersionInfo, + target_version: semver.VersionInfo, + ) -> None: self.storage_path = storage_path self.initial_version = initial_version self.migrated_version = migrated_version self.target_version = target_version - super().__init__(f"Could not find migration path for {storage_path} from v {initial_version} to {target_version}, stopped at {migrated_version}") + super().__init__( + f"Could not find migration path for {storage_path} from v {initial_version} to" + f" {target_version}, stopped at {migrated_version}" + ) class WrongStorageVersionException(StorageException): - def __init__(self, storage_path: str, initial_version: semver.VersionInfo, target_version: semver.VersionInfo) -> None: + def __init__( + self, + storage_path: str, + initial_version: semver.VersionInfo, + target_version: semver.VersionInfo, + ) -> None: self.storage_path = storage_path self.initial_version = initial_version self.target_version = target_version - super().__init__(f"Expected storage {storage_path} with v {target_version} but found {initial_version}") + super().__init__( + f"Expected storage {storage_path} with v {target_version} but found {initial_version}" + ) class LoadStorageException(StorageException): @@ -32,11 +48,16 @@ class LoadStorageException(StorageException): class JobWithUnsupportedWriterException(LoadStorageException): - def __init__(self, load_id: str, expected_file_formats: Iterable[TLoaderFileFormat], wrong_job: str) -> None: + def __init__( + self, load_id: str, expected_file_formats: Iterable[TLoaderFileFormat], wrong_job: str + ) -> None: self.load_id = load_id self.expected_file_formats = expected_file_formats self.wrong_job = wrong_job - super().__init__(f"Job {wrong_job} for load id {load_id} requires loader file format that is not one of {expected_file_formats}") + super().__init__( + f"Job {wrong_job} for load id {load_id} requires loader file format that is not one of" + f" {expected_file_formats}" + ) class LoadPackageNotFound(LoadStorageException, FileNotFoundError): @@ -51,12 +72,22 @@ class SchemaStorageException(StorageException): class InStorageSchemaModified(SchemaStorageException): def __init__(self, schema_name: str, storage_path: str) -> None: - msg = f"Schema {schema_name} in {storage_path} was externally modified. This is not allowed as that would prevent correct version tracking. Use import/export capabilities of dlt to provide external changes." + msg = ( + f"Schema {schema_name} in {storage_path} was externally modified. This is not allowed" + " as that would prevent correct version tracking. Use import/export capabilities of" + " dlt to provide external changes." + ) super().__init__(msg) class SchemaNotFoundError(SchemaStorageException, FileNotFoundError, KeyError): - def __init__(self, schema_name: str, storage_path: str, import_path: str = None, import_format: str = None) -> None: + def __init__( + self, + schema_name: str, + storage_path: str, + import_path: str = None, + import_format: str = None, + ) -> None: msg = f"Schema {schema_name} in {storage_path} could not be found." if import_path: msg += f"Import from {import_path} and format {import_format} failed." 
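
Note on the wrapped exception messages above: they rely on Python's implicit concatenation of adjacent string literals, so the rendered text stays byte-for-byte identical to the previous one-liners. A minimal sketch (the values are made up for illustration, not taken from the patch):

    # adjacent f-strings concatenate at compile time, so wrapping changes nothing
    storage_path, initial_version, target_version = "/data/storage", "1.0.0", "1.1.0"
    one_liner = (
        f"Expected storage {storage_path} with v {target_version} but found {initial_version}"
    )
    wrapped = (
        f"Expected storage {storage_path} with v {target_version} but found"
        f" {initial_version}"
    )
    assert one_liner == wrapped
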
@@ -65,4 +96,7 @@ def __init__(self, schema_name: str, storage_path: str, import_path: str = None, class UnexpectedSchemaName(SchemaStorageException, ValueError): def __init__(self, schema_name: str, storage_path: str, stored_name: str) -> None: - super().__init__(f"A schema file name '{schema_name}' in {storage_path} does not correspond to the name of schema in the file {stored_name}") + super().__init__( + f"A schema file name '{schema_name}' in {storage_path} does not correspond to the name" + f" of schema in the file {stored_name}" + ) diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index 3c5a391200..71c78a9b9d 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -14,11 +14,9 @@ FILE_COMPONENT_INVALID_CHARACTERS = re.compile(r"[.%{}]") + class FileStorage: - def __init__(self, - storage_path: str, - file_type: str = "t", - makedirs: bool = False) -> None: + def __init__(self, storage_path: str, file_type: str = "t", makedirs: bool = False) -> None: # make it absolute path self.storage_path = os.path.realpath(storage_path) # os.path.join(, '') self.file_type = file_type @@ -31,7 +29,9 @@ def save(self, relative_path: str, data: Any) -> str: @staticmethod def save_atomic(storage_path: str, relative_path: str, data: Any, file_type: str = "t") -> str: mode = "w" + file_type - with tempfile.NamedTemporaryFile(dir=storage_path, mode=mode, delete=False, encoding=encoding_for_mode(mode)) as f: + with tempfile.NamedTemporaryFile( + dir=storage_path, mode=mode, delete=False, encoding=encoding_for_mode(mode) + ) as f: tmp_path = f.name f.write(data) try: @@ -93,7 +93,9 @@ def delete(self, relative_path: str) -> None: else: raise FileNotFoundError(file_path) - def delete_folder(self, relative_path: str, recursively: bool = False, delete_ro: bool = False) -> None: + def delete_folder( + self, relative_path: str, recursively: bool = False, delete_ro: bool = False + ) -> None: folder_path = self.make_full_path(relative_path) if os.path.isdir(folder_path): if recursively: @@ -116,7 +118,9 @@ def open_file(self, relative_path: str, mode: str = "r") -> IO[Any]: def open_temp(self, delete: bool = False, mode: str = "w", file_type: str = None) -> IO[Any]: mode = mode + file_type or self.file_type - return tempfile.NamedTemporaryFile(dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode)) + return tempfile.NamedTemporaryFile( + dir=self.storage_path, mode=mode, delete=delete, encoding=encoding_for_mode(mode) + ) def has_file(self, relative_path: str) -> bool: return os.path.isfile(self.make_full_path(relative_path)) @@ -137,7 +141,9 @@ def list_folder_files(self, relative_path: str, to_root: bool = True) -> List[st scan_path = self.make_full_path(relative_path) if to_root: # list files in relative path, returning paths relative to storage root - return [os.path.join(relative_path, e.name) for e in os.scandir(scan_path) if e.is_file()] + return [ + os.path.join(relative_path, e.name) for e in os.scandir(scan_path) if e.is_file() + ] else: # or to the folder return [e.name for e in os.scandir(scan_path) if e.is_file()] @@ -147,7 +153,9 @@ def list_folder_dirs(self, relative_path: str, to_root: bool = True) -> List[str scan_path = self.make_full_path(relative_path) if to_root: # list folders in relative path, returning paths relative to storage root - return [os.path.join(relative_path, e.name) for e in os.scandir(scan_path) if e.is_dir()] + return [ + os.path.join(relative_path, e.name) for e in 
os.scandir(scan_path) if e.is_dir() + ] else: # or to the folder return [e.name for e in os.scandir(scan_path) if e.is_dir()] @@ -157,15 +165,11 @@ def create_folder(self, relative_path: str, exists_ok: bool = False) -> None: def link_hard(self, from_relative_path: str, to_relative_path: str) -> None: # note: some interesting stuff on links https://lightrun.com/answers/conan-io-conan-research-investigate-symlinks-and-hard-links - os.link( - self.make_full_path(from_relative_path), - self.make_full_path(to_relative_path) - ) + os.link(self.make_full_path(from_relative_path), self.make_full_path(to_relative_path)) @staticmethod def link_hard_with_fallback(external_file_path: str, to_file_path: str) -> None: - """Try to create a hardlink and fallback to copying when filesystem doesn't support links - """ + """Try to create a hardlink and fallback to copying when filesystem doesn't support links""" try: os.link(external_file_path, to_file_path) except OSError as ex: @@ -184,10 +188,7 @@ def atomic_rename(self, from_relative_path: str, to_relative_path: str) -> None: 3. All buckets mapped with FUSE are not atomic """ - os.rename( - self.make_full_path(from_relative_path), - self.make_full_path(to_relative_path) - ) + os.rename(self.make_full_path(from_relative_path), self.make_full_path(to_relative_path)) def rename_tree(self, from_relative_path: str, to_relative_path: str) -> None: """Renames a tree using os.rename if possible making it atomic @@ -226,7 +227,9 @@ def rename_tree_files(self, from_relative_path: str, to_relative_path: str) -> N if not os.listdir(root): os.rmdir(root) - def atomic_import(self, external_file_path: str, to_folder: str, new_file_name: Optional[str] = None) -> str: + def atomic_import( + self, external_file_path: str, to_folder: str, new_file_name: Optional[str] = None + ) -> str: """Moves a file at `external_file_path` into the `to_folder` effectively importing file into storage Args: @@ -239,7 +242,9 @@ def atomic_import(self, external_file_path: str, to_folder: str, new_file_name: """ new_file_name = new_file_name or os.path.basename(external_file_path) dest_file_path = os.path.join(self.make_full_path(to_folder), new_file_name) - return self.to_relative_path(FileStorage.move_atomic_to_file(external_file_path, dest_file_path)) + return self.to_relative_path( + FileStorage.move_atomic_to_file(external_file_path, dest_file_path) + ) def in_storage(self, path: str) -> bool: assert path is not None @@ -284,7 +289,9 @@ def validate_file_name_component(name: str) -> None: pathvalidate.validate_filename(name, platform="Universal") # component cannot contain "." if FILE_COMPONENT_INVALID_CHARACTERS.search(name): - raise pathvalidate.error.InvalidCharError(description="Component name cannot contain the following characters: . % { }") + raise pathvalidate.error.InvalidCharError( + description="Component name cannot contain the following characters: . 
% { }" + ) @staticmethod def rmtree_del_ro(action: AnyFun, name: str, exc: Any) -> Any: @@ -311,7 +318,6 @@ def open_zipsafe_ro(path: str, mode: str = "r", **kwargs: Any) -> IO[Any]: except (gzip.BadGzipFile, OSError): return open(path, origmode, encoding=encoding, **kwargs) - @staticmethod def is_gzipped(path: str) -> bool: """Checks if file under path is gzipped by reading a header""" diff --git a/dlt/common/storages/fsspec_filesystem.py b/dlt/common/storages/fsspec_filesystem.py index c084fcc12e..18c1837e00 100644 --- a/dlt/common/storages/fsspec_filesystem.py +++ b/dlt/common/storages/fsspec_filesystem.py @@ -13,7 +13,12 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.time import ensure_pendulum_datetime from dlt.common.typing import DictStrAny -from dlt.common.configuration.specs import CredentialsWithDefault, GcpCredentials, AwsCredentials, AzureCredentials +from dlt.common.configuration.specs import ( + CredentialsWithDefault, + GcpCredentials, + AwsCredentials, + AzureCredentials, +) from dlt.common.storages.configuration import FileSystemCredentials, FilesystemConfiguration from dlt import version @@ -21,6 +26,7 @@ class FileItem(TypedDict, total=False): """A DataItem representing a file""" + file_url: str file_name: str mime_type: str @@ -45,31 +51,32 @@ class FileItem(TypedDict, total=False): MTIME_DISPATCH["abfs"] = MTIME_DISPATCH["az"] -def fsspec_filesystem(protocol: str, credentials: FileSystemCredentials = None) -> Tuple[AbstractFileSystem, str]: +def fsspec_filesystem( + protocol: str, credentials: FileSystemCredentials = None +) -> Tuple[AbstractFileSystem, str]: """Instantiates an authenticated fsspec `FileSystem` for a given `protocol` and credentials. - Please supply credentials instance corresponding to the protocol. The `protocol` is just the code name of the filesystem ie: - * s3 - * az, abfs - * gcs, gs + Please supply credentials instance corresponding to the protocol. The `protocol` is just the code name of the filesystem ie: + * s3 + * az, abfs + * gcs, gs - also see filesystem_from_config + also see filesystem_from_config """ return fsspec_from_config(FilesystemConfiguration(protocol, credentials)) - def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSystem, str]: """Instantiates an authenticated fsspec `FileSystem` from `config` argument. 
- Authenticates following filesystems: - * s3 - * az, abfs - * gcs, gs + Authenticates following filesystems: + * s3 + * az, abfs + * gcs, gs - All other filesystems are not authenticated + All other filesystems are not authenticated - Returns: (fsspec filesystem, normalized url) + Returns: (fsspec filesystem, normalized url) """ proto = config.protocol @@ -78,14 +85,17 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys fs_kwargs.update(cast(AwsCredentials, config.credentials).to_s3fs_credentials()) elif proto in ["az", "abfs", "adl", "azure"]: fs_kwargs.update(cast(AzureCredentials, config.credentials).to_adlfs_credentials()) - elif proto in ['gcs', 'gs']: + elif proto in ["gcs", "gs"]: assert isinstance(config.credentials, GcpCredentials) # Default credentials are handled by gcsfs - if isinstance(config.credentials, CredentialsWithDefault) and config.credentials.has_default_credentials(): - fs_kwargs['token'] = None + if ( + isinstance(config.credentials, CredentialsWithDefault) + and config.credentials.has_default_credentials() + ): + fs_kwargs["token"] = None else: - fs_kwargs['token'] = dict(config.credentials) - fs_kwargs['project'] = config.credentials.project_id + fs_kwargs["token"] = dict(config.credentials) + fs_kwargs["project"] = config.credentials.project_id try: return url_to_fs(config.bucket_url, use_listings_cache=False, **fs_kwargs) # type: ignore[no-any-return] except ModuleNotFoundError as e: @@ -93,11 +103,12 @@ def fsspec_from_config(config: FilesystemConfiguration) -> Tuple[AbstractFileSys class FileItemDict(DictStrAny): - """A FileItem dictionary with additional methods to get fsspec filesystem, open and read files. - """ + """A FileItem dictionary with additional methods to get fsspec filesystem, open and read files.""" def __init__( - self, mapping: FileItem, credentials: Optional[Union[FileSystemCredentials, AbstractFileSystem]] = None + self, + mapping: FileItem, + credentials: Optional[Union[FileSystemCredentials, AbstractFileSystem]] = None, ): """Create a dictionary with the filesystem client. @@ -141,9 +152,7 @@ def open(self, mode: str = "rb", **kwargs: Any) -> IO[Any]: # noqa: A003 if "t" in mode: text_kwargs = { - k: kwargs.pop(k) - for k in ["encoding", "errors", "newline"] - if k in kwargs + k: kwargs.pop(k) for k in ["encoding", "errors", "newline"] if k in kwargs } return io.TextIOWrapper( bytes_io, @@ -191,6 +200,7 @@ def glob_files( Iterable[FileItem]: The list of files. """ import os + bucket_url_parsed = urlparse(bucket_url) # if this is file path without scheme if not bucket_url_parsed.scheme or (os.path.isabs(bucket_url) and "\\" in bucket_url): @@ -198,13 +208,16 @@ def glob_files( bucket_url = pathlib.Path(bucket_url).absolute().as_uri() bucket_url_parsed = urlparse(bucket_url) - bucket_path = bucket_url_parsed._replace(scheme='').geturl() + bucket_path = bucket_url_parsed._replace(scheme="").geturl() bucket_path = bucket_path[2:] if bucket_path.startswith("//") else bucket_path filter_url = posixpath.join(bucket_path, file_glob) glob_result = fs_client.glob(filter_url, detail=True) if isinstance(glob_result, list): - raise NotImplementedError("Cannot request details when using fsspec.glob. For ADSL (Azure) please use version 2023.9.0 or later") + raise NotImplementedError( + "Cannot request details when using fsspec.glob. 
For ADSL (Azure) please use version" + " 2023.9.0 or later" + ) for file, md in glob_result.items(): if md["type"] != "file": diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index 79aeb22e61..e3fd07cf72 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -7,8 +7,9 @@ class LiveSchemaStorage(SchemaStorage): - - def __init__(self, config: SchemaStorageConfiguration = config.value, makedirs: bool = False) -> None: + def __init__( + self, config: SchemaStorageConfiguration = config.value, makedirs: bool = False + ) -> None: self.live_schemas: Dict[str, Schema] = {} super().__init__(config, makedirs) diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index d8eee9b8d6..63573b9f18 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -6,7 +6,18 @@ from os.path import join from pathlib import Path from pendulum.datetime import DateTime -from typing import Dict, Iterable, List, NamedTuple, Literal, Optional, Sequence, Set, get_args, cast +from typing import ( + Dict, + Iterable, + List, + NamedTuple, + Literal, + Optional, + Sequence, + Set, + get_args, + cast, +) from dlt.common import json, pendulum from dlt.common.configuration import known_sections @@ -47,7 +58,9 @@ def parse(file_name: str) -> "ParsedLoadJobFileName": if len(parts) != 4: raise TerminalValueError(parts) - return ParsedLoadJobFileName(parts[0], parts[1], int(parts[2]), cast(TLoaderFileFormat, parts[3])) + return ParsedLoadJobFileName( + parts[0], parts[1], int(parts[2]), cast(TLoaderFileFormat, parts[3]) + ) class LoadJobInfo(NamedTuple): @@ -67,10 +80,22 @@ def asdict(self) -> DictStrAny: return d def asstr(self, verbosity: int = 0) -> str: - failed_msg = "The job FAILED TERMINALLY and cannot be restarted." if self.failed_message else "" - elapsed_msg = humanize.precisedelta(pendulum.duration(seconds=self.elapsed)) if self.elapsed else "---" - msg = f"Job: {self.job_file_info.job_id()}, table: {self.job_file_info.table_name} in {self.state}. " - msg += f"File type: {self.job_file_info.file_format}, size: {humanize.naturalsize(self.file_size, binary=True, gnu=True)}. " + failed_msg = ( + "The job FAILED TERMINALLY and cannot be restarted." if self.failed_message else "" + ) + elapsed_msg = ( + humanize.precisedelta(pendulum.duration(seconds=self.elapsed)) + if self.elapsed + else "---" + ) + msg = ( + f"Job: {self.job_file_info.job_id()}, table: {self.job_file_info.table_name} in" + f" {self.state}. " + ) + msg += ( + f"File type: {self.job_file_info.file_format}, size:" + f" {humanize.naturalsize(self.file_size, binary=True, gnu=True)}. " + ) msg += f"Started on: {self.created_at} and completed in {elapsed_msg}." if failed_msg: msg += "\nThe job FAILED TERMINALLY and cannot be restarted." @@ -113,8 +138,16 @@ def asdict(self) -> DictStrAny: return d def asstr(self, verbosity: int = 0) -> str: - completed_msg = f"The package was {self.state.upper()} at {self.completed_at}" if self.completed_at else "The package is being PROCESSED" - msg = f"The package with load id {self.load_id} for schema {self.schema_name} is in {self.state} state. It updated schema for {len(self.schema_update)} tables. 
{completed_msg}.\n" + completed_msg = ( + f"The package was {self.state.upper()} at {self.completed_at}" + if self.completed_at + else "The package is being PROCESSED" + ) + msg = ( + f"The package with load id {self.load_id} for schema {self.schema_name} is in" + f" {self.state} state. It updated schema for {len(self.schema_update)} tables." + f" {completed_msg}.\n" + ) msg += "Jobs details:\n" msg += "\n".join(job.asstr(verbosity) for job in flatten_list_or_items(iter(self.jobs.values()))) # type: ignore return msg @@ -124,7 +157,6 @@ def __str__(self) -> str: class LoadStorage(DataItemStorage, VersionedStorage): - STORAGE_VERSION = "1.0.0" NORMALIZED_FOLDER = "normalized" # folder within the volume where load packages are stored LOADED_FOLDER = "loaded" # folder to keep the loads that were completely processed @@ -134,10 +166,16 @@ class LoadStorage(DataItemStorage, VersionedStorage): STARTED_JOBS_FOLDER: TJobState = "started_jobs" COMPLETED_JOBS_FOLDER: TJobState = "completed_jobs" - SCHEMA_UPDATES_FILE_NAME = "schema_updates.json" # updates to the tables in schema created by normalizer - APPLIED_SCHEMA_UPDATES_FILE_NAME = "applied_" + "schema_updates.json" # updates applied to the destination + SCHEMA_UPDATES_FILE_NAME = ( # updates to the tables in schema created by normalizer + "schema_updates.json" + ) + APPLIED_SCHEMA_UPDATES_FILE_NAME = ( + "applied_" + "schema_updates.json" + ) # updates applied to the destination SCHEMA_FILE_NAME = "schema.json" # package schema - PACKAGE_COMPLETED_FILE_NAME = "package_completed.json" # completed package marker file, currently only to store data with os.stat + PACKAGE_COMPLETED_FILE_NAME = ( # completed package marker file, currently only to store data with os.stat + "package_completed.json" + ) ALL_SUPPORTED_FILE_FORMATS = ALL_SUPPORTED_FILE_FORMATS @@ -147,7 +185,7 @@ def __init__( is_owner: bool, preferred_file_format: TLoaderFileFormat, supported_file_formats: Iterable[TLoaderFileFormat], - config: LoadStorageConfiguration = config.value + config: LoadStorageConfiguration = config.value, ) -> None: if not LoadStorage.ALL_SUPPORTED_FILE_FORMATS.issuperset(supported_file_formats): raise TerminalValueError(supported_file_formats) @@ -158,7 +196,8 @@ def __init__( super().__init__( preferred_file_format, LoadStorage.STORAGE_VERSION, - is_owner, FileStorage(config.load_volume_path, "t", makedirs=is_owner) + is_owner, + FileStorage(config.load_volume_path, "t", makedirs=is_owner), ) if is_owner: self.initialize_storage() @@ -182,8 +221,19 @@ def _get_data_item_path_template(self, load_id: str, _: str, table_name: str) -> file_name = self.build_job_file_name(table_name, "%s", with_extension=False) return self.storage.make_full_path(join(load_id, LoadStorage.NEW_JOBS_FOLDER, file_name)) - def write_temp_job_file(self, load_id: str, table_name: str, table: TTableSchemaColumns, file_id: str, rows: Sequence[StrAny]) -> str: - file_name = self._get_data_item_path_template(load_id, None, table_name) % file_id + "." + self.loader_file_format + def write_temp_job_file( + self, + load_id: str, + table_name: str, + table: TTableSchemaColumns, + file_id: str, + rows: Sequence[StrAny], + ) -> str: + file_name = ( + self._get_data_item_path_template(load_id, None, table_name) % file_id + + "." 
+ + self.loader_file_format + ) format_spec = DataWriter.data_format_from_file_format(self.loader_file_format) mode = "wb" if format_spec.is_binary_format else "w" with self.storage.open_file(file_name, mode=mode) as f: @@ -207,7 +257,9 @@ def save_temp_schema(self, schema: Schema, load_id: str) -> str: return self.storage.save(join(load_id, LoadStorage.SCHEMA_FILE_NAME), dump) def save_temp_schema_updates(self, load_id: str, schema_update: TSchemaTables) -> None: - with self.storage.open_file(join(load_id, LoadStorage.SCHEMA_UPDATES_FILE_NAME), mode="wb") as f: + with self.storage.open_file( + join(load_id, LoadStorage.SCHEMA_UPDATES_FILE_NAME), mode="wb" + ) as f: json.dump(schema_update, f) def commit_temp_load_package(self, load_id: str) -> None: @@ -226,39 +278,62 @@ def list_completed_packages(self) -> Sequence[str]: return sorted(loads) def list_new_jobs(self, load_id: str) -> Sequence[str]: - new_jobs = self.storage.list_folder_files(self._get_job_folder_path(load_id, LoadStorage.NEW_JOBS_FOLDER)) + new_jobs = self.storage.list_folder_files( + self._get_job_folder_path(load_id, LoadStorage.NEW_JOBS_FOLDER) + ) # make sure all jobs have supported writers - wrong_job = next((j for j in new_jobs if LoadStorage.parse_job_file_name(j).file_format not in self.supported_file_formats), None) + wrong_job = next( + ( + j + for j in new_jobs + if LoadStorage.parse_job_file_name(j).file_format not in self.supported_file_formats + ), + None, + ) if wrong_job is not None: raise JobWithUnsupportedWriterException(load_id, self.supported_file_formats, wrong_job) return new_jobs def list_started_jobs(self, load_id: str) -> Sequence[str]: - return self.storage.list_folder_files(self._get_job_folder_path(load_id, LoadStorage.STARTED_JOBS_FOLDER)) + return self.storage.list_folder_files( + self._get_job_folder_path(load_id, LoadStorage.STARTED_JOBS_FOLDER) + ) def list_failed_jobs(self, load_id: str) -> Sequence[str]: - return self.storage.list_folder_files(self._get_job_folder_path(load_id, LoadStorage.FAILED_JOBS_FOLDER)) + return self.storage.list_folder_files( + self._get_job_folder_path(load_id, LoadStorage.FAILED_JOBS_FOLDER) + ) def list_jobs_for_table(self, load_id: str, table_name: str) -> Sequence[LoadJobInfo]: - return [job for job in self.list_all_jobs(load_id) if job.job_file_info.table_name == table_name] + return [ + job for job in self.list_all_jobs(load_id) if job.job_file_info.table_name == table_name + ] def list_all_jobs(self, load_id: str) -> Sequence[LoadJobInfo]: info = self.get_load_package_info(load_id) return [job for job in flatten_list_or_items(iter(info.jobs.values()))] # type: ignore def list_completed_failed_jobs(self, load_id: str) -> Sequence[str]: - return self.storage.list_folder_files(self._get_job_folder_completed_path(load_id, LoadStorage.FAILED_JOBS_FOLDER)) + return self.storage.list_folder_files( + self._get_job_folder_completed_path(load_id, LoadStorage.FAILED_JOBS_FOLDER) + ) def list_failed_jobs_in_completed_package(self, load_id: str) -> Sequence[LoadJobInfo]: """List all failed jobs and associated error messages for a completed load package with `load_id`""" failed_jobs: List[LoadJobInfo] = [] package_path = self.get_completed_package_path(load_id) package_created_at = pendulum.from_timestamp( - os.path.getmtime(self.storage.make_full_path(join(package_path, LoadStorage.PACKAGE_COMPLETED_FILE_NAME))) + os.path.getmtime( + self.storage.make_full_path( + join(package_path, LoadStorage.PACKAGE_COMPLETED_FILE_NAME) + ) + ) ) for file in 
self.list_completed_failed_jobs(load_id): if not file.endswith(".exception"): - failed_jobs.append(self._read_job_file_info("failed_jobs", file, package_created_at)) + failed_jobs.append( + self._read_job_file_info("failed_jobs", file, package_created_at) + ) return failed_jobs def get_load_package_info(self, load_id: str) -> LoadPackageInfo: @@ -272,10 +347,14 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: package_path = self.get_completed_package_path(load_id) if not self.storage.has_folder(package_path): raise LoadPackageNotFound(load_id) - completed_file_path = self.storage.make_full_path(join(package_path, LoadStorage.PACKAGE_COMPLETED_FILE_NAME)) + completed_file_path = self.storage.make_full_path( + join(package_path, LoadStorage.PACKAGE_COMPLETED_FILE_NAME) + ) package_created_at = pendulum.from_timestamp(os.path.getmtime(completed_file_path)) package_state = self.storage.load(completed_file_path) - applied_schema_update_file = join(package_path, LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) + applied_schema_update_file = join( + package_path, LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME + ) if self.storage.has_file(applied_schema_update_file): applied_update = json.loads(self.storage.load(applied_schema_update_file)) schema = self._load_schema(join(package_path, LoadStorage.SCHEMA_FILE_NAME)) @@ -290,7 +369,15 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: jobs.append(self._read_job_file_info(state, file, package_created_at)) all_jobs[state] = jobs - return LoadPackageInfo(load_id, self.storage.make_full_path(package_path), package_state, schema.name, applied_update, package_created_at, all_jobs) + return LoadPackageInfo( + load_id, + self.storage.make_full_path(package_path), + package_state, + schema.name, + applied_update, + package_created_at, + all_jobs, + ) def begin_schema_update(self, load_id: str) -> Optional[TSchemaTables]: package_path = self.get_normalized_package_path(load_id) @@ -313,37 +400,62 @@ def commit_schema_update(self, load_id: str, applied_update: TSchemaTables) -> N # save applied update self.storage.save(processed_schema_update_file, json.dumps(applied_update)) - def add_new_job(self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs") -> None: + def add_new_job( + self, load_id: str, job_file_path: str, job_state: TJobState = "new_jobs" + ) -> None: """Adds new job by moving the `job_file_path` into `new_jobs` of package `load_id`""" self.storage.atomic_import(job_file_path, self._get_job_folder_path(load_id, job_state)) def atomic_import(self, external_file_path: str, to_folder: str) -> str: """Copies or links a file at `external_file_path` into the `to_folder` effectively importing file into storage""" # LoadStorage.parse_job_file_name - return self.storage.to_relative_path(FileStorage.move_atomic_to_folder(external_file_path, self.storage.make_full_path(to_folder))) + return self.storage.to_relative_path( + FileStorage.move_atomic_to_folder( + external_file_path, self.storage.make_full_path(to_folder) + ) + ) def start_job(self, load_id: str, file_name: str) -> str: - return self._move_job(load_id, LoadStorage.NEW_JOBS_FOLDER, LoadStorage.STARTED_JOBS_FOLDER, file_name) + return self._move_job( + load_id, LoadStorage.NEW_JOBS_FOLDER, LoadStorage.STARTED_JOBS_FOLDER, file_name + ) def fail_job(self, load_id: str, file_name: str, failed_message: Optional[str]) -> str: # save the exception to failed jobs if failed_message: self.storage.save( - self._get_job_file_path(load_id, 
LoadStorage.FAILED_JOBS_FOLDER, file_name + ".exception"), - failed_message + self._get_job_file_path( + load_id, LoadStorage.FAILED_JOBS_FOLDER, file_name + ".exception" + ), + failed_message, ) # move to failed jobs - return self._move_job(load_id, LoadStorage.STARTED_JOBS_FOLDER, LoadStorage.FAILED_JOBS_FOLDER, file_name) + return self._move_job( + load_id, LoadStorage.STARTED_JOBS_FOLDER, LoadStorage.FAILED_JOBS_FOLDER, file_name + ) def retry_job(self, load_id: str, file_name: str) -> str: # when retrying job we must increase the retry count source_fn = ParsedLoadJobFileName.parse(file_name) - dest_fn = ParsedLoadJobFileName(source_fn.table_name, source_fn.file_id, source_fn.retry_count + 1, source_fn.file_format) + dest_fn = ParsedLoadJobFileName( + source_fn.table_name, + source_fn.file_id, + source_fn.retry_count + 1, + source_fn.file_format, + ) # move it directly to new file name - return self._move_job(load_id, LoadStorage.STARTED_JOBS_FOLDER, LoadStorage.NEW_JOBS_FOLDER, file_name, dest_fn.job_id()) + return self._move_job( + load_id, + LoadStorage.STARTED_JOBS_FOLDER, + LoadStorage.NEW_JOBS_FOLDER, + file_name, + dest_fn.job_id(), + ) def complete_job(self, load_id: str, file_name: str) -> str: - return self._move_job(load_id, LoadStorage.STARTED_JOBS_FOLDER, LoadStorage.COMPLETED_JOBS_FOLDER, file_name) + return self._move_job( + load_id, LoadStorage.STARTED_JOBS_FOLDER, LoadStorage.COMPLETED_JOBS_FOLDER, file_name + ) def complete_load_package(self, load_id: str, aborted: bool) -> None: load_path = self.get_normalized_package_path(load_id) @@ -352,7 +464,8 @@ def complete_load_package(self, load_id: str, aborted: bool) -> None: if self.config.delete_completed_jobs and not has_failed_jobs: self.storage.delete_folder( self._get_job_folder_path(load_id, LoadStorage.COMPLETED_JOBS_FOLDER), - recursively=True) + recursively=True, + ) # save marker file completed_state: TLoadPackageState = "aborted" if aborted else "loaded" self.storage.save(join(load_path, LoadStorage.PACKAGE_COMPLETED_FILE_NAME), completed_state) @@ -387,7 +500,14 @@ def _load_schema(self, schema_path: str) -> Schema: stored_schema: DictStrAny = json.loads(self.storage.load(schema_path)) return Schema.from_dict(stored_schema) - def _move_job(self, load_id: str, source_folder: TJobState, dest_folder: TJobState, file_name: str, new_file_name: str = None) -> str: + def _move_job( + self, + load_id: str, + source_folder: TJobState, + dest_folder: TJobState, + file_name: str, + new_file_name: str = None, + ) -> str: # ensure we move file names, not paths assert file_name == FileStorage.get_file_name_from_file_path(file_name) load_path = self.get_normalized_package_path(load_id) @@ -419,10 +539,17 @@ def _read_job_file_info(self, state: TJobState, file: str, now: DateTime = None) pendulum.from_timestamp(st.st_mtime), self.job_elapsed_time_seconds(full_path, now.timestamp() if now else None), self.parse_job_file_name(file), - failed_message + failed_message, ) - def build_job_file_name(self, table_name: str, file_id: str, retry_count: int = 0, validate_components: bool = True, with_extension: bool = True) -> str: + def build_job_file_name( + self, + table_name: str, + file_id: str, + retry_count: int = 0, + validate_components: bool = True, + with_extension: bool = True, + ) -> str: if validate_components: FileStorage.validate_file_name_component(table_name) # FileStorage.validate_file_name_component(file_id) @@ -439,7 +566,14 @@ def is_package_partially_loaded(package_info: LoadPackageInfo) -> bool: pending_jobs: 
Sequence[TJobState] = ["new_jobs"] else: pending_jobs = ["completed_jobs", "failed_jobs"] - return sum(len(package_info.jobs[job_state]) for job_state in WORKING_FOLDERS if job_state not in pending_jobs) > 0 + return ( + sum( + len(package_info.jobs[job_state]) + for job_state in WORKING_FOLDERS + if job_state not in pending_jobs + ) + > 0 + ) @staticmethod def parse_job_file_name(file_name: str) -> ParsedLoadJobFileName: diff --git a/dlt/common/storages/normalize_storage.py b/dlt/common/storages/normalize_storage.py index 44e6fe2f1c..2d153bcb8f 100644 --- a/dlt/common/storages/normalize_storage.py +++ b/dlt/common/storages/normalize_storage.py @@ -10,6 +10,7 @@ from dlt.common.destination import TLoaderFileFormat, ALL_SUPPORTED_FILE_FORMATS from dlt.common.exceptions import TerminalValueError + class TParsedNormalizeFileName(NamedTuple): schema_name: str table_name: str @@ -18,13 +19,20 @@ class TParsedNormalizeFileName(NamedTuple): class NormalizeStorage(VersionedStorage): - STORAGE_VERSION: ClassVar[str] = "1.0.0" - EXTRACTED_FOLDER: ClassVar[str] = "extracted" # folder within the volume where extracted files to be normalized are stored + EXTRACTED_FOLDER: ClassVar[str] = ( + "extracted" # folder within the volume where extracted files to be normalized are stored + ) @with_config(spec=NormalizeStorageConfiguration, sections=(known_sections.NORMALIZE,)) - def __init__(self, is_owner: bool, config: NormalizeStorageConfiguration = config.value) -> None: - super().__init__(NormalizeStorage.STORAGE_VERSION, is_owner, FileStorage(config.normalize_volume_path, "t", makedirs=is_owner)) + def __init__( + self, is_owner: bool, config: NormalizeStorageConfiguration = config.value + ) -> None: + super().__init__( + NormalizeStorage.STORAGE_VERSION, + is_owner, + FileStorage(config.normalize_volume_path, "t", makedirs=is_owner), + ) self.config = config if is_owner: self.initialize_storage() diff --git a/dlt/common/storages/schema_storage.py b/dlt/common/storages/schema_storage.py index a9fee71531..a43b8a1f9b 100644 --- a/dlt/common/storages/schema_storage.py +++ b/dlt/common/storages/schema_storage.py @@ -4,21 +4,30 @@ from dlt.common import json, logger from dlt.common.configuration import with_config from dlt.common.configuration.accessors import config -from dlt.common.storages.configuration import SchemaStorageConfiguration, TSchemaFileFormat, SchemaFileExtensions +from dlt.common.storages.configuration import ( + SchemaStorageConfiguration, + TSchemaFileFormat, + SchemaFileExtensions, +) from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import Schema, verify_schema_hash from dlt.common.typing import DictStrAny -from dlt.common.storages.exceptions import InStorageSchemaModified, SchemaNotFoundError, UnexpectedSchemaName +from dlt.common.storages.exceptions import ( + InStorageSchemaModified, + SchemaNotFoundError, + UnexpectedSchemaName, +) class SchemaStorage(Mapping[str, Schema]): - SCHEMA_FILE_NAME = "schema.%s" NAMED_SCHEMA_FILE_PATTERN = f"%s.{SCHEMA_FILE_NAME}" @with_config(spec=SchemaStorageConfiguration, sections=("schema",)) - def __init__(self, config: SchemaStorageConfiguration = config.value, makedirs: bool = False) -> None: + def __init__( + self, config: SchemaStorageConfiguration = config.value, makedirs: bool = False + ) -> None: self.config = config self.storage = FileStorage(config.schema_volume_path, makedirs=makedirs) @@ -97,7 +106,11 @@ def _maybe_import_schema(self, name: str, storage_schema: DictStrAny = None) -> # if schema was imported, 
overwrite storage schema rv_schema._imported_version_hash = rv_schema.version_hash self._save_schema(rv_schema) - logger.info(f"Schema {name} not present in {self.storage.storage_path} and got imported with version {rv_schema.stored_version} and imported hash {rv_schema._imported_version_hash}") + logger.info( + f"Schema {name} not present in {self.storage.storage_path} and got imported" + f" with version {rv_schema.stored_version} and imported hash" + f" {rv_schema._imported_version_hash}" + ) else: # import schema when imported schema was modified from the last import sc = Schema.from_dict(storage_schema) @@ -108,14 +121,23 @@ def _maybe_import_schema(self, name: str, storage_schema: DictStrAny = None) -> rv_schema._imported_version_hash = rv_schema.version_hash # if schema was imported, overwrite storage schema self._save_schema(rv_schema) - logger.info(f"Schema {name} was present in {self.storage.storage_path} but is overwritten with imported schema version {rv_schema.stored_version} and imported hash {rv_schema._imported_version_hash}") + logger.info( + f"Schema {name} was present in {self.storage.storage_path} but is" + f" overwritten with imported schema version {rv_schema.stored_version} and" + f" imported hash {rv_schema._imported_version_hash}" + ) else: # use storage schema as nothing changed rv_schema = sc except FileNotFoundError: # no schema to import -> skip silently and return the original if storage_schema is None: - raise SchemaNotFoundError(name, self.config.schema_volume_path, self.config.import_schema_path, self.config.external_schema_format) + raise SchemaNotFoundError( + name, + self.config.schema_volume_path, + self.config.import_schema_path, + self.config.external_schema_format, + ) rv_schema = Schema.from_dict(storage_schema) assert rv_schema is not None @@ -124,20 +146,29 @@ def _maybe_import_schema(self, name: str, storage_schema: DictStrAny = None) -> def _load_import_schema(self, name: str) -> DictStrAny: import_storage = FileStorage(self.config.import_schema_path, makedirs=False) schema_file = self._file_name_in_store(name, self.config.external_schema_format) - return self._parse_schema_str(import_storage.load(schema_file), self.config.external_schema_format) + return self._parse_schema_str( + import_storage.load(schema_file), self.config.external_schema_format + ) def _export_schema(self, schema: Schema, export_path: str) -> None: if self.config.external_schema_format == "json": - exported_schema_s = schema.to_pretty_json(remove_defaults=self.config.external_schema_format_remove_defaults) + exported_schema_s = schema.to_pretty_json( + remove_defaults=self.config.external_schema_format_remove_defaults + ) elif self.config.external_schema_format == "yaml": - exported_schema_s = schema.to_pretty_yaml(remove_defaults=self.config.external_schema_format_remove_defaults) + exported_schema_s = schema.to_pretty_yaml( + remove_defaults=self.config.external_schema_format_remove_defaults + ) else: raise ValueError(self.config.external_schema_format) export_storage = FileStorage(export_path, makedirs=True) schema_file = self._file_name_in_store(schema.name, self.config.external_schema_format) export_storage.save(schema_file, exported_schema_s) - logger.info(f"Schema {schema.name} exported to {export_path} with version {schema.stored_version} as {self.config.external_schema_format}") + logger.info( + f"Schema {schema.name} exported to {export_path} with version" + f" {schema.stored_version} as {self.config.external_schema_format}" + ) def _save_schema(self, schema: 
Schema) -> str: # save a schema to schema store @@ -145,7 +176,9 @@ def _save_schema(self, schema: Schema) -> str: return self.storage.save(schema_file, schema.to_pretty_json(remove_defaults=False)) @staticmethod - def load_schema_file(path: str, name: str, extensions: Tuple[TSchemaFileFormat, ...]=SchemaFileExtensions) -> Schema: + def load_schema_file( + path: str, name: str, extensions: Tuple[TSchemaFileFormat, ...] = SchemaFileExtensions + ) -> Schema: storage = FileStorage(path) for extension in extensions: file = SchemaStorage._file_name_in_store(name, extension) diff --git a/dlt/common/storages/transactional_file.py b/dlt/common/storages/transactional_file.py index 9a10c812e2..e5ee220904 100644 --- a/dlt/common/storages/transactional_file.py +++ b/dlt/common/storages/transactional_file.py @@ -34,6 +34,7 @@ def lock_id(k: int = 4) -> str: class Heartbeat(Timer): """A thread designed to periodically execute a fn.""" + daemon = True def run(self) -> None: @@ -60,7 +61,9 @@ def __init__(self, path: str, fs: fsspec.AbstractFileSystem) -> None: parsed_path = Path(path) if not parsed_path.is_absolute(): - raise ValueError(f"{path} is not absolute. Please pass only absolute paths to TransactionalFile") + raise ValueError( + f"{path} is not absolute. Please pass only absolute paths to TransactionalFile" + ) self.path = path if proto == "file": # standardize path separator to POSIX. fsspec always uses POSIX. Windows may use either. @@ -102,7 +105,7 @@ def _sync_locks(self) -> t.List[str]: # Purge stale locks mtime = self.extract_mtime(lock) if now - mtime > timedelta(seconds=TransactionalFile.LOCK_TTL_SECONDS): - try: # Janitors can race, so we ignore errors + try: # Janitors can race, so we ignore errors self._fs.rm(name) except OSError: pass @@ -110,7 +113,10 @@ def _sync_locks(self) -> t.List[str]: # The name is timestamp + random suffix and is time sortable output.append(name) if not output: - raise RuntimeError(f"When syncing locks for path {self.path} and lock {self.lock_path} no lock file was found") + raise RuntimeError( + f"When syncing locks for path {self.path} and lock {self.lock_path} no lock file" + " was found" + ) return output def read(self) -> t.Optional[bytes]: @@ -136,7 +142,9 @@ def rollback(self) -> None: elif self._fs.isfile(self.path): self._fs.rm(self.path) - def acquire_lock(self, blocking: bool = True, timeout: float = -1, jitter_mean: float = 0) -> bool: + def acquire_lock( + self, blocking: bool = True, timeout: float = -1, jitter_mean: float = 0 + ) -> bool: """Acquires a lock on a path. Mimics the stdlib's `threading.Lock` interface. Acquire a lock, blocking or non-blocking. 
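
A minimal usage sketch for the file lock reformatted above, limited to the methods visible in this hunk (`acquire_lock` and `read`); the local path and filesystem setup are illustrative assumptions, not part of the patch:

    import fsspec
    from dlt.common.storages.transactional_file import TransactionalFile

    fs = fsspec.filesystem("file")  # local filesystem; any fsspec implementation works
    tf = TransactionalFile("/tmp/dlt_demo/state.json", fs)  # relative paths raise ValueError

    # mimics threading.Lock: blocking acquire with a timeout, returns a bool
    if tf.acquire_lock(blocking=True, timeout=10):
        content = tf.read()  # None if the file does not exist yet
        print(content)
    # writing and releasing use methods outside this hunk, so the sketch stops here
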
diff --git a/dlt/common/storages/versioned_storage.py b/dlt/common/storages/versioned_storage.py index c87f2a52b9..8e9a3eb88d 100644 --- a/dlt/common/storages/versioned_storage.py +++ b/dlt/common/storages/versioned_storage.py @@ -7,10 +7,11 @@ class VersionedStorage: - VERSION_FILE = ".version" - def __init__(self, version: Union[semver.VersionInfo, str], is_owner: bool, storage: FileStorage) -> None: + def __init__( + self, version: Union[semver.VersionInfo, str], is_owner: bool, storage: FileStorage + ) -> None: if isinstance(version, str): version = semver.VersionInfo.parse(version) self.storage = storage @@ -20,24 +21,34 @@ def __init__(self, version: Union[semver.VersionInfo, str], is_owner: bool, stor if existing_version != version: if existing_version > version: # version cannot be downgraded - raise NoMigrationPathException(storage.storage_path, existing_version, existing_version, version) + raise NoMigrationPathException( + storage.storage_path, existing_version, existing_version, version + ) if is_owner: # only owner can migrate storage self.migrate_storage(existing_version, version) # storage should be migrated to desired version migrated_version = self._load_version() if version != migrated_version: - raise NoMigrationPathException(storage.storage_path, existing_version, migrated_version, version) + raise NoMigrationPathException( + storage.storage_path, existing_version, migrated_version, version + ) else: # we cannot use storage and we must wait for owner to upgrade it - raise WrongStorageVersionException(storage.storage_path, existing_version, version) + raise WrongStorageVersionException( + storage.storage_path, existing_version, version + ) else: if is_owner: self._save_version(version) else: - raise WrongStorageVersionException(storage.storage_path, semver.VersionInfo.parse("0.0.0"), version) + raise WrongStorageVersionException( + storage.storage_path, semver.VersionInfo.parse("0.0.0"), version + ) - def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None: + def migrate_storage( + self, from_version: semver.VersionInfo, to_version: semver.VersionInfo + ) -> None: # migration example: # # semver lib supports comparing both to string and other semvers # if from_version == "1.0.0" and from_version < to_version: diff --git a/dlt/common/time.py b/dlt/common/time.py index f57ccce71d..05c4c617dd 100644 --- a/dlt/common/time.py +++ b/dlt/common/time.py @@ -13,11 +13,15 @@ DAY_DURATION_SEC: float = 24 * 60 * 60.0 -def timestamp_within(timestamp: float, min_exclusive: Optional[float], max_inclusive: Optional[float]) -> bool: +def timestamp_within( + timestamp: float, min_exclusive: Optional[float], max_inclusive: Optional[float] +) -> bool: """ check if timestamp within range uniformly treating none and range inclusiveness """ - return timestamp > (min_exclusive or PAST_TIMESTAMP) and timestamp <= (max_inclusive or FUTURE_TIMESTAMP) + return timestamp > (min_exclusive or PAST_TIMESTAMP) and timestamp <= ( + max_inclusive or FUTURE_TIMESTAMP + ) def timestamp_before(timestamp: float, max_inclusive: Optional[float]) -> bool: @@ -122,7 +126,9 @@ def ensure_pendulum_time(value: Union[str, datetime.time]) -> pendulum.Time: raise TypeError(f"Cannot coerce {value} to a pendulum.Time object.") -def _datetime_from_ts_or_iso(value: Union[int, float, str]) -> Union[pendulum.DateTime, pendulum.Date, pendulum.Time]: +def _datetime_from_ts_or_iso( + value: Union[int, float, str] +) -> Union[pendulum.DateTime, pendulum.Date, pendulum.Time]: if 
isinstance(value, (int, float)): return pendulum.from_timestamp(value) try: @@ -150,7 +156,8 @@ def to_seconds(td: Optional[TimedeltaSeconds]) -> Optional[float]: T = TypeVar("T", bound=Union[pendulum.DateTime, pendulum.Time]) + def reduce_pendulum_datetime_precision(value: T, microsecond_precision: int) -> T: if microsecond_precision >= 6: return value - return value.replace(microsecond=value.microsecond // 10**(6 - microsecond_precision) * 10**(6 - microsecond_precision)) # type: ignore + return value.replace(microsecond=value.microsecond // 10 ** (6 - microsecond_precision) * 10 ** (6 - microsecond_precision)) # type: ignore diff --git a/dlt/common/typing.py b/dlt/common/typing.py index 3b3a0d3353..e1a8860a13 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -3,7 +3,28 @@ import inspect import os from re import Pattern as _REPattern -from typing import Callable, Dict, Any, Final, Literal, List, Mapping, NewType, Optional, Tuple, Type, TypeVar, Generic, Protocol, TYPE_CHECKING, Union, runtime_checkable, get_args, get_origin, IO +from typing import ( + Callable, + Dict, + Any, + Final, + Literal, + List, + Mapping, + NewType, + Optional, + Tuple, + Type, + TypeVar, + Generic, + Protocol, + TYPE_CHECKING, + Union, + runtime_checkable, + get_args, + get_origin, + IO, +) from typing_extensions import TypeAlias, ParamSpec, Concatenate from dlt.common.pendulum import timedelta, pendulum @@ -11,10 +32,12 @@ if TYPE_CHECKING: from _typeshed import StrOrBytesPath from typing import _TypedDict + REPattern = _REPattern[str] else: StrOrBytesPath = Any from typing import _TypedDictMeta as _TypedDict + REPattern = _REPattern AnyType: TypeAlias = Any @@ -47,15 +70,16 @@ VARIANT_FIELD_FORMAT = "v_%s" TFileOrPath = Union[str, os.PathLike, IO[Any]] + @runtime_checkable class SupportsVariant(Protocol, Generic[TVariantBase]): """Defines variant type protocol that should be recognized by normalizers - Variant types behave like TVariantBase type (ie. Decimal) but also implement the protocol below that is used to extract the variant value from it. - See `Wei` type declaration which returns Decimal or str for values greater than supported by destination warehouse. + Variant types behave like TVariantBase type (ie. Decimal) but also implement the protocol below that is used to extract the variant value from it. + See `Wei` type declaration which returns Decimal or str for values greater than supported by destination warehouse. """ - def __call__(self) -> Union[TVariantBase, TVariantRV]: - ... + + def __call__(self) -> Union[TVariantBase, TVariantRV]: ... 
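
Because `SupportsVariant` above is a runtime-checkable protocol, any value type that adds a `__call__` returning its base value (or a variant of it) is recognized; a toy sketch, where `ToyWei` is a made-up stand-in for the real `Wei` type mentioned in the docstring:

    from decimal import Decimal
    from dlt.common.typing import SupportsVariant

    class ToyWei(Decimal):
        def __call__(self):
            # a real variant type may return an alternative representation here,
            # e.g. a string when the value exceeds what the destination supports
            return Decimal(self)

    assert isinstance(ToyWei("1"), SupportsVariant)
    assert not isinstance(Decimal("1"), SupportsVariant)  # no __call__, not a variant
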
class SupportsHumanize(Protocol): @@ -71,35 +95,44 @@ def asstr(self, verbosity: int = 0) -> str: def is_union_type(t: Type[Any]) -> bool: return get_origin(t) is Union + def is_optional_type(t: Type[Any]) -> bool: return get_origin(t) is Union and type(None) in get_args(t) + def is_final_type(t: Type[Any]) -> bool: return get_origin(t) is Final + def extract_union_types(t: Type[Any], no_none: bool = False) -> List[Any]: if no_none: return [arg for arg in get_args(t) if arg is not type(None)] # noqa: E721 return list(get_args(t)) + def is_literal_type(hint: Type[Any]) -> bool: return get_origin(hint) is Literal + def is_union(hint: Type[Any]) -> bool: return get_origin(hint) is Union + def is_newtype_type(t: Type[Any]) -> bool: return hasattr(t, "__supertype__") + def is_typeddict(t: Type[Any]) -> bool: return isinstance(t, _TypedDict) + def is_list_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Sequence) except TypeError: return False + def is_dict_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Mapping) @@ -131,10 +164,16 @@ def extract_inner_type(hint: Type[Any], preserve_new_types: bool = False) -> Typ def get_all_types_of_class_in_union(hint: Type[Any], cls: Type[TAny]) -> List[Type[TAny]]: # hint is an Union that contains classes, return all classes that are a subclass or superclass of cls - return [t for t in get_args(hint) if inspect.isclass(t) and (issubclass(t, cls) or issubclass(cls, t))] + return [ + t + for t in get_args(hint) + if inspect.isclass(t) and (issubclass(t, cls) or issubclass(cls, t)) + ] -def get_generic_type_argument_from_instance(instance: Any, sample_value: Optional[Any]) -> Type[Any]: +def get_generic_type_argument_from_instance( + instance: Any, sample_value: Optional[Any] +) -> Type[Any]: """Infers type argument of a Generic class from an `instance` of that class using optional `sample_value` of the argument type Inference depends on the presence of __orig_class__ attribute in instance, if not present - sample_Value will be used @@ -157,7 +196,10 @@ def get_generic_type_argument_from_instance(instance: Any, sample_value: Optiona TInputArgs = ParamSpec("TInputArgs") TReturnVal = TypeVar("TReturnVal") -def copy_sig(wrapper: Callable[TInputArgs, Any]) -> Callable[[Callable[..., TReturnVal]], Callable[TInputArgs, TReturnVal]]: + +def copy_sig( + wrapper: Callable[TInputArgs, Any] +) -> Callable[[Callable[..., TReturnVal]], Callable[TInputArgs, TReturnVal]]: """Copies docstring and signature from wrapper to func but keeps the func return value type""" def decorator(func: Callable[..., TReturnVal]) -> Callable[TInputArgs, TReturnVal]: diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 94c9144086..cdda239aca 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -10,7 +10,22 @@ from types import ModuleType import zlib -from typing import Any, ContextManager, Dict, Iterator, Optional, Sequence, Set, Tuple, TypeVar, Mapping, List, Union, Counter, Iterable +from typing import ( + Any, + ContextManager, + Dict, + Iterator, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + Mapping, + List, + Union, + Counter, + Iterable, +) from collections.abc import Mapping as C_Mapping from dlt.common.typing import AnyFun, StrAny, DictStrAny, StrStr, TAny, TFun @@ -25,9 +40,10 @@ # row counts TRowCount = Dict[str, int] + def chunks(seq: Sequence[T], n: int) -> Iterator[Sequence[T]]: for i in range(0, len(seq), n): - yield seq[i:i + n] + yield seq[i : i + n] def uniq_id(len_: int = 16) -> str: @@ -37,7 +53,7 
@@ def uniq_id(len_: int = 16) -> str: def uniq_id_base64(len_: int = 16) -> str: """Returns a base64 encoded crypto-grade string of random bytes with desired len_""" - return base64.b64encode(secrets.token_bytes(len_)).decode('ascii').rstrip("=") + return base64.b64encode(secrets.token_bytes(len_)).decode("ascii").rstrip("=") def many_uniq_ids_base64(n_ids: int, len_: int = 16) -> List[str]: @@ -46,34 +62,41 @@ def many_uniq_ids_base64(n_ids: int, len_: int = 16) -> List[str]: """ random_bytes = secrets.token_bytes(n_ids * len_) encode = base64.b64encode - return [encode(random_bytes[i:i+len_]).decode('ascii').rstrip("=") for i in range(0, n_ids * len_, len_)] + return [ + encode(random_bytes[i : i + len_]).decode("ascii").rstrip("=") + for i in range(0, n_ids * len_, len_) + ] def digest128(v: str, len_: int = 15) -> str: """Returns a base64 encoded shake128 hash of str `v` with digest of length `len_` (default: 15 bytes = 20 characters length)""" - return base64.b64encode(hashlib.shake_128(v.encode("utf-8")).digest(len_)).decode('ascii').rstrip("=") + return ( + base64.b64encode(hashlib.shake_128(v.encode("utf-8")).digest(len_)) + .decode("ascii") + .rstrip("=") + ) def digest128b(v: bytes, len_: int = 15) -> str: """Returns a base64 encoded shake128 hash of bytes `v` with digest of length `len_` (default: 15 bytes = 20 characters length)""" - enc_v = base64.b64encode(hashlib.shake_128(v).digest(len_)).decode('ascii') + enc_v = base64.b64encode(hashlib.shake_128(v).digest(len_)).decode("ascii") return enc_v.rstrip("=") def digest256(v: str) -> str: digest = hashlib.sha3_256(v.encode("utf-8")).digest() - return base64.b64encode(digest).decode('ascii') + return base64.b64encode(digest).decode("ascii") def str2bool(v: str) -> bool: if isinstance(v, bool): return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): + if v.lower() in ("yes", "true", "t", "y", "1"): return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): + elif v.lower() in ("no", "false", "f", "n", "0"): return False else: - raise ValueError('Boolean value expected.') + raise ValueError("Boolean value expected.") # def flatten_list_of_dicts(dicts: Sequence[StrAny]) -> StrAny: @@ -96,7 +119,7 @@ def flatten_list_of_str_or_dicts(seq: Sequence[Union[StrAny, str]]) -> DictStrAn o: DictStrAny = {} for e in seq: if isinstance(e, dict): - for k,v in e.items(): + for k, v in e.items(): if k in o: raise KeyError(f"Cannot flatten with duplicate key {k}") o[k] = v @@ -177,7 +200,9 @@ def concat_strings_with_limit(strings: List[str], separator: str, limit: int) -> sep_len = len(separator) for i in range(1, len(strings)): - if current_length + len(strings[i]) + sep_len > limit: # accounts for the length of separator + if ( + current_length + len(strings[i]) + sep_len > limit + ): # accounts for the length of separator yield separator.join(strings[start:i]) start = i current_length = len(strings[i]) @@ -187,7 +212,9 @@ def concat_strings_with_limit(strings: List[str], separator: str, limit: int) -> yield separator.join(strings[start:]) -def graph_edges_to_nodes(edges: Sequence[Tuple[TAny, TAny]], directed: bool = True) -> Dict[TAny, Set[TAny]]: +def graph_edges_to_nodes( + edges: Sequence[Tuple[TAny, TAny]], directed: bool = True +) -> Dict[TAny, Set[TAny]]: """Converts a directed graph represented as a sequence of edges to a graph represented as a mapping from nodes a set of connected nodes. Isolated nodes are represented as edges to itself. If `directed` is `False`, each edge is duplicated but going in opposite direction. 
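
A short, hedged sketch of how two of the utilities touched in this hunk behave, based only on the signatures and docstrings shown above (the sample inputs and expected outputs are illustrative):

    from dlt.common.utils import concat_strings_with_limit, graph_edges_to_nodes

    # pack strings into batches whose joined length stays within the limit
    batches = list(concat_strings_with_limit(["alpha", "beta", "gamma"], ", ", limit=12))
    # e.g. ["alpha, beta", "gamma"] -- each yielded batch respects the 12-char limit

    # adjacency map; with directed=False every edge is mirrored in the opposite direction
    nodes = graph_edges_to_nodes([("a", "b"), ("b", "c")], directed=False)
    # e.g. {"a": {"b"}, "b": {"a", "c"}, "c": {"b"}}
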
@@ -221,7 +248,6 @@ def dfs(node: TAny, current_component: Set[TAny]) -> None: for neighbor in undag[node]: dfs(neighbor, current_component) - for node in undag: if node not in visited: component: Set[TAny] = set() @@ -302,9 +328,10 @@ def is_interactive() -> bool: bool: True if interactive (e.g., REPL, IPython, Jupyter Notebook), False if running as a script. """ import __main__ as main + # When running as a script, the __main__ module has a __file__ attribute. # In an interactive environment, the __file__ attribute is absent. - return not hasattr(main, '__file__') + return not hasattr(main, "__file__") def dict_remove_nones_in_place(d: Dict[Any, Any]) -> Dict[Any, Any]: @@ -332,7 +359,6 @@ def custom_environ(env: StrStr) -> Iterator[None]: def with_custom_environ(f: TFun) -> TFun: - @wraps(f) def _wrap(*args: Any, **kwargs: Any) -> Any: saved_environ = os.environ.copy() @@ -405,11 +431,20 @@ def is_inner_callable(f: AnyFun) -> bool: def obfuscate_pseudo_secret(pseudo_secret: str, pseudo_key: bytes) -> str: - return base64.b64encode(bytes([_a ^ _b for _a, _b in zip(pseudo_secret.encode("utf-8"), pseudo_key*250)])).decode() + return base64.b64encode( + bytes([_a ^ _b for _a, _b in zip(pseudo_secret.encode("utf-8"), pseudo_key * 250)]) + ).decode() def reveal_pseudo_secret(obfuscated_secret: str, pseudo_key: bytes) -> str: - return bytes([_a ^ _b for _a, _b in zip(base64.b64decode(obfuscated_secret.encode("ascii"), validate=True), pseudo_key*250)]).decode("utf-8") + return bytes( + [ + _a ^ _b + for _a, _b in zip( + base64.b64decode(obfuscated_secret.encode("ascii"), validate=True), pseudo_key * 250 + ) + ] + ).decode("utf-8") def get_module_name(m: ModuleType) -> str: @@ -429,7 +464,7 @@ def derives_from_class_of_name(o: object, name: str) -> bool: def compressed_b64encode(value: bytes) -> str: """Compress and b64 encode the given bytestring""" - return base64.b64encode(zlib.compress(value, level=9)).decode('ascii') + return base64.b64encode(zlib.compress(value, level=9)).decode("ascii") def compressed_b64decode(value: str) -> bytes: diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 312371bbf1..d31b0c57c6 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -2,14 +2,31 @@ from typing import Callable, Any, Type, get_type_hints, get_args from dlt.common.exceptions import DictValidationException -from dlt.common.typing import StrAny, is_literal_type, is_optional_type, extract_union_types, is_union_type, is_typeddict, is_list_generic_type, is_dict_generic_type, _TypedDict, is_union +from dlt.common.typing import ( + StrAny, + is_literal_type, + is_optional_type, + extract_union_types, + is_union_type, + is_typeddict, + is_list_generic_type, + is_dict_generic_type, + _TypedDict, + is_union, +) TFilterFunc = Callable[[str], bool] TCustomValidator = Callable[[str, str, Any, Any], bool] -def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFilterFunc = None, validator_f: TCustomValidator = None) -> None: +def validate_dict( + spec: Type[_TypedDict], + doc: StrAny, + path: str, + filter_f: TFilterFunc = None, + validator_f: TCustomValidator = None, +) -> None: """Validate the `doc` dictionary based on the given typed dictionary specification `spec`. 
Args: @@ -44,11 +61,15 @@ def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFil # check missing props missing = set(required_props.keys()).difference(props.keys()) if len(missing): - raise DictValidationException(f"In {path}: following required fields are missing {missing}", path) + raise DictValidationException( + f"In {path}: following required fields are missing {missing}", path + ) # check unknown props unexpected = set(props.keys()).difference(allowed_props.keys()) if len(unexpected): - raise DictValidationException(f"In {path}: following fields are unexpected {unexpected}", path) + raise DictValidationException( + f"In {path}: following fields are unexpected {unexpected}", path + ) def verify_prop(pk: str, pv: Any, t: Any) -> None: # covers none in optional and union types @@ -69,41 +90,82 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: except DictValidationException: pass if not has_passed: - type_names = [str(get_args(ut)) if is_literal_type(ut) else ut.__name__ for ut in union_types] - raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__}. One of these types expected: {', '.join(type_names)}.", path, pk, pv) + type_names = [ + str(get_args(ut)) if is_literal_type(ut) else ut.__name__ + for ut in union_types + ] + raise DictValidationException( + f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__}." + f" One of these types expected: {', '.join(type_names)}.", + path, + pk, + pv, + ) elif is_literal_type(t): a_l = get_args(t) if pv not in a_l: - raise DictValidationException(f"In {path}: field {pk} value {pv} not in allowed {a_l}", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} not in allowed {a_l}", path, pk, pv + ) elif t in [int, bool, str, float]: if not isinstance(pv, t): - raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while {t.__name__} is expected", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while" + f" {t.__name__} is expected", + path, + pk, + pv, + ) elif is_typeddict(t): if not isinstance(pv, dict): - raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while dict is expected", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while" + " dict is expected", + path, + pk, + pv, + ) validate_dict(t, pv, path + "/" + pk, filter_f, validator_f) elif is_list_generic_type(t): if not isinstance(pv, list): - raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while list is expected", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while" + " list is expected", + path, + pk, + pv, + ) # get list element type from generic and process each list element l_t = get_args(t)[0] for i, l_v in enumerate(pv): verify_prop(pk + f"[{i}]", l_v, l_t) elif is_dict_generic_type(t): if not isinstance(pv, dict): - raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while dict is expected", path, pk, pv) + raise DictValidationException( + f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__} while" + " dict is expected", + path, + pk, + pv, + ) # get dict key and value type from generic and process each k: v of the dict _, d_v_t 
= get_args(t) for d_k, d_v in pv.items(): if not isinstance(d_k, str): - raise DictValidationException(f"In {path}: field {pk} key {d_k} must be a string", path, pk, d_k) + raise DictValidationException( + f"In {path}: field {pk} key {d_k} must be a string", path, pk, d_k + ) verify_prop(pk + f"[{d_k}]", d_v, d_v_t) elif t is Any: # pass everything with any type pass else: if not validator_f(path, pk, pv, t): - raise DictValidationException(f"In {path}: field {pk} has expected type {t.__name__} which lacks validator", path, pk) + raise DictValidationException( + f"In {path}: field {pk} has expected type {t.__name__} which lacks validator", + path, + pk, + ) # check allowed props for pk, pv in props.items(): @@ -111,6 +173,5 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: validate_dict_ignoring_xkeys = functools.partial( - validate_dict, - filter_f=lambda k: not k.startswith("x-") -) \ No newline at end of file + validate_dict, filter_f=lambda k: not k.startswith("x-") +) diff --git a/dlt/common/wei.py b/dlt/common/wei.py index 218e5eee3a..b6816bc6f3 100644 --- a/dlt/common/wei.py +++ b/dlt/common/wei.py @@ -11,8 +11,7 @@ WEI_SCALE_POW = 10**18 -class Wei(Decimal,SupportsVariant[Decimal]): - +class Wei(Decimal, SupportsVariant[Decimal]): ctx = default_context(decimal.getcontext().copy(), EVM_DECIMAL_PRECISION) @classmethod @@ -29,11 +28,13 @@ def from_int256(cls, value: int, decimals: int = 0) -> "Wei": def __call__(self) -> Union["Wei", TVariantRV]: # TODO: this should look into DestinationCapabilitiesContext to get maximum Decimal value. # this is BigQuery BIGDECIMAL max - if self > 578960446186580977117854925043439539266 or self < -578960446186580977117854925043439539267: - return ("str", str(self)) + if ( + self > 578960446186580977117854925043439539266 + or self < -578960446186580977117854925043439539267 + ): + return ("str", str(self)) else: return self - def __repr__(self) -> str: return f"Wei('{str(self)}')" diff --git a/dlt/destinations/exceptions.py b/dlt/destinations/exceptions.py index 5c20f081f1..cc4d4fd836 100644 --- a/dlt/destinations/exceptions.py +++ b/dlt/destinations/exceptions.py @@ -1,5 +1,10 @@ from typing import Sequence -from dlt.common.exceptions import DestinationTerminalException, DestinationTransientException, DestinationUndefinedEntity, DestinationException +from dlt.common.exceptions import ( + DestinationTerminalException, + DestinationTransientException, + DestinationUndefinedEntity, + DestinationException, +) from dlt.common.destination.reference import TLoadJobState @@ -25,32 +30,49 @@ def __init__(self, dbapi_exception: Exception) -> None: class DestinationConnectionError(DestinationTransientException): - def __init__(self, client_type: str, dataset_name: str, reason: str, inner_exc: Exception) -> None: + def __init__( + self, client_type: str, dataset_name: str, reason: str, inner_exc: Exception + ) -> None: self.client_type = client_type self.dataset_name = dataset_name self.inner_exc = inner_exc - super().__init__(f"Connection with {client_type} to dataset name {dataset_name} failed. Please check if you configured the credentials at all and provided the right credentials values. You can be also denied access or your internet connection may be down. The actual reason given is: {reason}") + super().__init__( + f"Connection with {client_type} to dataset name {dataset_name} failed. Please check if" + " you configured the credentials at all and provided the right credentials values. 
You" + " can be also denied access or your internet connection may be down. The actual reason" + f" given is: {reason}" + ) + class LoadClientNotConnected(DestinationTransientException): def __init__(self, client_type: str, dataset_name: str) -> None: self.client_type = client_type self.dataset_name = dataset_name - super().__init__(f"Connection with {client_type} to dataset {dataset_name} is closed. Open the connection with 'client.open_connection' or with the 'with client:' statement") + super().__init__( + f"Connection with {client_type} to dataset {dataset_name} is closed. Open the" + " connection with 'client.open_connection' or with the 'with client:' statement" + ) class DestinationSchemaWillNotUpdate(DestinationTerminalException): def __init__(self, table_name: str, columns: Sequence[str], msg: str) -> None: self.table_name = table_name self.columns = columns - super().__init__(f"Schema for table {table_name} column(s) {columns} will not update: {msg}") + super().__init__( + f"Schema for table {table_name} column(s) {columns} will not update: {msg}" + ) class DestinationSchemaTampered(DestinationTerminalException): def __init__(self, schema_name: str, version_hash: str, stored_version_hash: str) -> None: self.version_hash = version_hash self.stored_version_hash = stored_version_hash - super().__init__(f"Schema {schema_name} content was changed - by a loader or by destination code - from the moment it was retrieved by load package. " - f"Such schema cannot reliably be updated or saved. Current version hash: {version_hash} != stored version hash {stored_version_hash}") + super().__init__( + f"Schema {schema_name} content was changed - by a loader or by destination code - from" + " the moment it was retrieved by load package. Such schema cannot reliably be updated" + f" or saved. Current version hash: {version_hash} != stored version hash" + f" {stored_version_hash}" + ) class LoadJobNotExistsException(DestinationTerminalException): @@ -60,7 +82,9 @@ def __init__(self, job_id: str) -> None: class LoadJobTerminalException(DestinationTerminalException): def __init__(self, file_path: str, message: str) -> None: - super().__init__(f"Job with id/file name {file_path} encountered unrecoverable problem: {message}") + super().__init__( + f"Job with id/file name {file_path} encountered unrecoverable problem: {message}" + ) class LoadJobInvalidStateTransitionException(DestinationTerminalException): @@ -72,17 +96,28 @@ def __init__(self, from_state: TLoadJobState, to_state: TLoadJobState) -> None: class LoadJobFileTooBig(DestinationTerminalException): def __init__(self, file_name: str, max_size: int) -> None: - super().__init__(f"File {file_name} exceeds {max_size} and cannot be loaded. Split the file and try again.") + super().__init__( + f"File {file_name} exceeds {max_size} and cannot be loaded. Split the file and try" + " again." + ) class MergeDispositionException(DestinationTerminalException): - def __init__(self, dataset_name: str, staging_dataset_name: str, tables: Sequence[str], reason: str) -> None: + def __init__( + self, dataset_name: str, staging_dataset_name: str, tables: Sequence[str], reason: str + ) -> None: self.dataset_name = dataset_name self.staging_dataset_name = staging_dataset_name self.tables = tables self.reason = reason - msg = f"Merge sql job for dataset name {dataset_name}, staging dataset name {staging_dataset_name} COULD NOT BE GENERATED. Merge will not be performed. " - msg += f"Data for the following tables ({tables}) is loaded to staging dataset. 
You may need to write your own materialization. The reason is:\n" + msg = ( + f"Merge sql job for dataset name {dataset_name}, staging dataset name" + f" {staging_dataset_name} COULD NOT BE GENERATED. Merge will not be performed. " + ) + msg += ( + f"Data for the following tables ({tables}) is loaded to staging dataset. You may need" + " to write your own materialization. The reason is:\n" + ) msg += reason super().__init__(msg) diff --git a/dlt/destinations/impl/athena/athena.py b/dlt/destinations/impl/athena/athena.py index f675e7a496..4837f0dbdf 100644 --- a/dlt/destinations/impl/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -1,4 +1,17 @@ -from typing import Optional, ClassVar, Iterator, Any, AnyStr, Sequence, Tuple, List, Dict, Callable, Iterable, Type +from typing import ( + Optional, + ClassVar, + Iterator, + Any, + AnyStr, + Sequence, + Tuple, + List, + Dict, + Callable, + Iterable, + Type, +) from copy import deepcopy import re @@ -10,7 +23,12 @@ from pyathena import connect from pyathena.connection import Connection from pyathena.error import OperationalError, DatabaseError, ProgrammingError, IntegrityError, Error -from pyathena.formatter import DefaultParameterFormatter, _DEFAULT_FORMATTERS, Formatter, _format_date +from pyathena.formatter import ( + DefaultParameterFormatter, + _DEFAULT_FORMATTERS, + Formatter, + _format_date, +) from dlt.common import logger from dlt.common.utils import without_none @@ -26,9 +44,19 @@ from dlt.destinations.sql_jobs import SqlStagingCopyJob from dlt.destinations.typing import DBApi, DBTransaction -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, LoadJobTerminalException +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, + LoadJobTerminalException, +) from dlt.destinations.impl.athena import capabilities -from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error +from dlt.destinations.sql_client import ( + SqlClientBase, + DBApiCursorImpl, + raise_database_error, + raise_open_connection_error, +) from dlt.destinations.typing import DBApiCursor from dlt.destinations.job_client_impl import SqlJobClientWithStaging from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration @@ -46,13 +74,10 @@ class AthenaTypeMapper(TypeMapper): "timestamp": "timestamp", "bigint": "bigint", "binary": "binary", - "time": "string" + "time": "string", } - sct_to_dbt = { - "decimal": "decimal(%i,%i)", - "wei": "decimal(%i,%i)" - } + sct_to_dbt = {"decimal": "decimal(%i,%i)", "wei": "decimal(%i,%i)"} dbt_to_sct = { "varchar": "text", @@ -72,7 +97,9 @@ class AthenaTypeMapper(TypeMapper): def __init__(self, capabilities: DestinationCapabilitiesContext): super().__init__(capabilities) - def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_integer_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: if precision is None: return "bigint" if precision <= 8: @@ -83,7 +110,9 @@ def to_db_integer_type(self, precision: Optional[int], table_format: TTableForma return "int" return "bigint" - def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def from_db_type( + self, db_type: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: for key, val in self.dbt_to_sct.items(): if 
db_type.startswith(key): return without_none(dict(data_type=val, precision=precision, scale=scale)) # type: ignore[return-value] @@ -101,7 +130,6 @@ def _format_pendulum_datetime(formatter: Formatter, escaper: Callable[[str], str class DLTAthenaFormatter(DefaultParameterFormatter): - _INSTANCE: ClassVar["DLTAthenaFormatter"] = None def __new__(cls: Type["DLTAthenaFormatter"]) -> "DLTAthenaFormatter": @@ -109,7 +137,6 @@ def __new__(cls: Type["DLTAthenaFormatter"]) -> "DLTAthenaFormatter": return cls._INSTANCE return super().__new__(cls) - def __init__(self) -> None: if DLTAthenaFormatter._INSTANCE: return @@ -118,9 +145,7 @@ def __init__(self) -> None: formatters[datetime] = _format_pendulum_datetime formatters[Date] = _format_date - super(DefaultParameterFormatter, self).__init__( - mappings=formatters, default=None - ) + super(DefaultParameterFormatter, self).__init__(mappings=formatters, default=None) DLTAthenaFormatter._INSTANCE = self @@ -138,13 +163,14 @@ def exception(self) -> str: # this part of code should be never reached raise NotImplementedError() + class DoNothingFollowupJob(DoNothingJob, FollowupJob): """The second most lazy class of dlt""" + pass class AthenaSQLClient(SqlClientBase[Connection]): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() dbapi: ClassVar[DBApi] = pyathena @@ -161,7 +187,8 @@ def open_connection(self) -> Connection: schema_name=self.dataset_name, s3_staging_dir=self.config.query_result_bucket, work_group=self.config.athena_work_group, - **native_credentials) + **native_credentials, + ) return self._conn def close_connection(self) -> None: @@ -195,18 +222,24 @@ def drop_dataset(self) -> None: self.execute_sql(f"DROP DATABASE {self.fully_qualified_ddl_dataset_name()} CASCADE;") def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + return ( + self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + ) def drop_tables(self, *tables: str) -> None: if not tables: return - statements = [f"DROP TABLE IF EXISTS {self.make_qualified_ddl_table_name(table)};" for table in tables] + statements = [ + f"DROP TABLE IF EXISTS {self.make_qualified_ddl_table_name(table)};" for table in tables + ] self.execute_fragments(statements) @contextmanager @raise_database_error def begin_transaction(self) -> Iterator[DBTransaction]: - logger.warning("Athena does not support transactions! Each SQL statement is auto-committed separately.") + logger.warning( + "Athena does not support transactions! Each SQL statement is auto-committed separately." + ) yield self @raise_database_error @@ -235,7 +268,9 @@ def _make_database_exception(ex: Exception) -> Exception: return DatabaseTransientException(ex) return ex - def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as curr: if curr.description is None: return None @@ -244,13 +279,17 @@ def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequen return f @staticmethod - def _convert_to_old_pyformat(new_style_string: str, args: Tuple[Any, ...]) -> Tuple[str, Dict[str, Any]]: + def _convert_to_old_pyformat( + new_style_string: str, args: Tuple[Any, ...] 
+ ) -> Tuple[str, Dict[str, Any]]: # create a list of keys - keys = ["arg"+str(i) for i, _ in enumerate(args)] + keys = ["arg" + str(i) for i, _ in enumerate(args)] # create an old style string and replace placeholders - old_style_string, count = re.subn(r"%s", lambda _: "%(" + keys.pop(0) + ")s", new_style_string) + old_style_string, count = re.subn( + r"%s", lambda _: "%(" + keys.pop(0) + ")s", new_style_string + ) # create a dictionary mapping keys to args - mapping = dict(zip(["arg"+str(i) for i, _ in enumerate(args)], args)) + mapping = dict(zip(["arg" + str(i) for i, _ in enumerate(args)], args)) # raise if there is a mismatch between args and string if count != len(args): raise DatabaseTransientException(OperationalError()) @@ -285,19 +324,17 @@ def has_dataset(self) -> bool: class AthenaClient(SqlJobClientWithStaging, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: AthenaClientConfiguration) -> None: # verify if staging layout is valid for Athena # this will raise if the table prefix is not properly defined # we actually that {table_name} is first, no {schema_name} is allowed - self.table_prefix_layout = path_utils.get_table_prefix_layout(config.staging_config.layout, []) - - sql_client = AthenaSQLClient( - config.normalize_dataset_name(schema), - config + self.table_prefix_layout = path_utils.get_table_prefix_layout( + config.staging_config.layout, [] ) + + sql_client = AthenaSQLClient(config.normalize_dataset_name(schema), config) super().__init__(schema, config, sql_client) self.sql_client: AthenaSQLClient = sql_client # type: ignore self.config: AthenaClientConfiguration = config @@ -308,14 +345,19 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: truncate_tables = [] super().initialize_storage(truncate_tables) - def _from_db_type(self, hive_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, hive_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(hive_t, precision, scale) def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - return f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}" - - def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]: + return ( + f"{self.sql_client.escape_ddl_identifier(c['name'])} {self.type_mapper.to_db_type(c, table_format)}" + ) + def _get_table_update_sql( + self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool + ) -> List[str]: bucket = self.config.staging_config.bucket_url dataset = self.sql_client.dataset_name @@ -325,7 +367,9 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc # or if we are in iceberg mode, we create iceberg tables for all tables table = self.get_load_table(table_name, self.in_staging_mode) is_iceberg = self._is_iceberg_table(table) or table.get("write_disposition", None) == "skip" - columns = ", ".join([self._get_column_def_sql(c, table.get("table_format")) for c in new_columns]) + columns = ", ".join( + [self._get_column_def_sql(c, table.get("table_format")) for c in new_columns] + ) # this will fail if the table prefix is not properly defined table_prefix = self.table_prefix_layout.format(table_name=table_name) @@ -354,21 +398,32 @@ def start_file_load(self, table: TTableSchema, 
file_path: str, load_id: str) -> if table_schema_has_type(table, "time"): raise LoadJobTerminalException( file_path, - "Athena cannot load TIME columns from parquet tables. Please convert `datetime.time` objects in your data to `str` or `datetime.datetime`." + "Athena cannot load TIME columns from parquet tables. Please convert" + " `datetime.time` objects in your data to `str` or `datetime.datetime`.", ) job = super().start_file_load(table, file_path, load_id) if not job: - job = DoNothingFollowupJob(file_path) if self._is_iceberg_table(self.get_load_table(table["name"])) else DoNothingJob(file_path) + job = ( + DoNothingFollowupJob(file_path) + if self._is_iceberg_table(self.get_load_table(table["name"])) + else DoNothingJob(file_path) + ) return job def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: if self._is_iceberg_table(self.get_load_table(table_chain[0]["name"])): - return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": False})] + return [ + SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": False}) + ] return super()._create_append_followup_jobs(table_chain) - def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: if self._is_iceberg_table(self.get_load_table(table_chain[0]["name"])): - return [SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})] + return [ + SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True}) + ] return super()._create_replace_followup_jobs(table_chain) def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: @@ -388,18 +443,22 @@ def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: def should_truncate_table_before_load_on_staging_destination(self, table: TTableSchema) -> bool: # on athena we only truncate replace tables that are not iceberg table = self.get_load_table(table["name"]) - if table["write_disposition"] == "replace" and not self._is_iceberg_table(self.get_load_table(table["name"])): + if table["write_disposition"] == "replace" and not self._is_iceberg_table( + self.get_load_table(table["name"]) + ): return True return False - def should_load_data_to_staging_dataset_on_staging_destination(self, table: TTableSchema) -> bool: + def should_load_data_to_staging_dataset_on_staging_destination( + self, table: TTableSchema + ) -> bool: """iceberg table data goes into staging on staging destination""" return self._is_iceberg_table(self.get_load_table(table["name"])) def get_load_table(self, table_name: str, staging: bool = False) -> TTableSchema: table = super().get_load_table(table_name, staging) if self.config.force_iceberg: - table["table_format"] ="iceberg" + table["table_format"] = "iceberg" if staging and table.get("table_format", None) == "iceberg": table.pop("table_format") return table diff --git a/dlt/destinations/impl/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py index 5dd1341c34..a420c1a8d1 100644 --- a/dlt/destinations/impl/athena/configuration.py +++ b/dlt/destinations/impl/athena/configuration.py @@ -2,7 +2,7 @@ from dlt.common.configuration import configspec from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration -from dlt.common.configuration.specs import AwsCredentials +from dlt.common.configuration.specs import 
AwsCredentials @configspec diff --git a/dlt/destinations/impl/athena/factory.py b/dlt/destinations/impl/athena/factory.py index cc2b027695..930ed57fe7 100644 --- a/dlt/destinations/impl/athena/factory.py +++ b/dlt/destinations/impl/athena/factory.py @@ -10,7 +10,6 @@ class athena(Destination[AthenaClientConfiguration, "AthenaClient"]): - spec = AthenaClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: diff --git a/dlt/destinations/impl/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py index 440123e46d..bd4819462b 100644 --- a/dlt/destinations/impl/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -7,7 +7,13 @@ from dlt.common import json, logger from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import FollowupJob, NewLoadJob, TLoadJobState, LoadJob, SupportsStagingDestination +from dlt.common.destination.reference import ( + FollowupJob, + NewLoadJob, + TLoadJobState, + LoadJob, + SupportsStagingDestination, +) from dlt.common.data_types import TDataType from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns @@ -15,7 +21,12 @@ from dlt.common.schema.exceptions import UnknownTableException from dlt.destinations.job_client_impl import SqlJobClientWithStaging -from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException +from dlt.destinations.exceptions import ( + DestinationSchemaWillNotUpdate, + DestinationTransientException, + LoadJobNotExistsException, + LoadJobTerminalException, +) from dlt.destinations.impl.bigquery import capabilities from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration @@ -62,7 +73,9 @@ class BigQueryTypeMapper(TypeMapper): "TIME": "time", } - def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def from_db_type( + self, db_type: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: if db_type == "BIGNUMERIC": if precision is None: # biggest numeric possible return dict(data_type="wei") @@ -75,7 +88,7 @@ def __init__( file_name: str, bq_load_job: bigquery.LoadJob, http_timeout: float, - retry_deadline: float + retry_deadline: float, ) -> None: self.bq_load_job = bq_load_job self.default_retry = bigquery.DEFAULT_RETRY.with_deadline(retry_deadline) @@ -95,7 +108,10 @@ def state(self) -> TLoadJobState: # the job permanently failed for the reason above return "failed" elif reason in ["internalError"]: - logger.warning(f"Got reason {reason} for job {self.file_name}, job considered still running. ({self.bq_load_job.error_result})") + logger.warning( + f"Got reason {reason} for job {self.file_name}, job considered still" + f" running. 
({self.bq_load_job.error_result})" + ) # status of the job could not be obtained, job still running return "running" else: @@ -108,13 +124,15 @@ def job_id(self) -> str: return BigQueryLoadJob.get_job_id_from_file_path(super().job_id()) def exception(self) -> str: - exception: str = json.dumps({ - "error_result": self.bq_load_job.error_result, - "errors": self.bq_load_job.errors, - "job_start": self.bq_load_job.started, - "job_end": self.bq_load_job.ended, - "job_id": self.bq_load_job.job_id - }) + exception: str = json.dumps( + { + "error_result": self.bq_load_job.error_result, + "errors": self.bq_load_job.errors, + "job_start": self.bq_load_job.started, + "job_end": self.bq_load_job.ended, + "job_id": self.bq_load_job.job_id, + } + ) return exception @staticmethod @@ -123,19 +141,32 @@ def get_job_id_from_file_path(file_path: str) -> str: class BigQueryMergeJob(SqlMergeJob): - @classmethod - def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: str, key_clauses: Sequence[str], for_delete: bool) -> List[str]: + def gen_key_table_clauses( + cls, + root_table_name: str, + staging_root_table_name: str, + key_clauses: Sequence[str], + for_delete: bool, + ) -> List[str]: # generate several clauses: BigQuery does not support OR nor unions sql: List[str] = [] for clause in key_clauses: - sql.append(f"FROM {root_table_name} AS d WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} AS s WHERE {clause.format(d='d', s='s')})") + sql.append( + f"FROM {root_table_name} AS d WHERE EXISTS (SELECT 1 FROM" + f" {staging_root_table_name} AS s WHERE {clause.format(d='d', s='s')})" + ) return sql -class BigqueryStagingCopyJob(SqlStagingCopyJob): +class BigqueryStagingCopyJob(SqlStagingCopyJob): @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): @@ -147,8 +178,8 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient sql.append(f"CREATE TABLE {table_name} CLONE {staging_table_name};") return sql -class BigQueryClient(SqlJobClientWithStaging, SupportsStagingDestination): +class BigQueryClient(SqlJobClientWithStaging, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None: @@ -157,7 +188,7 @@ def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None: config.credentials, config.get_location(), config.http_timeout, - config.retry_deadline + config.retry_deadline, ) super().__init__(schema, config, sql_client) self.config: BigQueryClientConfiguration = config @@ -167,7 +198,9 @@ def __init__(self, schema: Schema, config: BigQueryClientConfiguration) -> None: def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [BigQueryMergeJob.from_table_chain(table_chain, self.sql_client)] - def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: if self.config.replace_strategy == "staging-optimized": return [BigqueryStagingCopyJob.from_table_chain(table_chain, self.sql_client)] 
return super()._create_replace_followup_jobs(table_chain) @@ -190,7 +223,7 @@ def restore_file_load(self, file_path: str) -> LoadJob: FileStorage.get_file_name_from_file_path(file_path), self._retrieve_load_job(file_path), self.config.http_timeout, - self.config.retry_deadline + self.config.retry_deadline, ) except api_core_exceptions.GoogleAPICallError as gace: reason = BigQuerySqlClient._get_reason_from_errors(gace) @@ -211,7 +244,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> FileStorage.get_file_name_from_file_path(file_path), self._create_load_job(table, file_path), self.config.http_timeout, - self.config.retry_deadline + self.config.retry_deadline, ) except api_core_exceptions.GoogleAPICallError as gace: reason = BigQuerySqlClient._get_reason_from_errors(gace) @@ -228,17 +261,31 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> raise DestinationTransientException(gace) return job - def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool, separate_alters: bool = False) -> List[str]: + def _get_table_update_sql( + self, + table_name: str, + new_columns: Sequence[TColumnSchema], + generate_alter: bool, + separate_alters: bool = False, + ) -> List[str]: sql = super()._get_table_update_sql(table_name, new_columns, generate_alter) canonical_name = self.sql_client.make_qualified_table_name(table_name) - cluster_list = [self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("cluster")] - partition_list = [self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("partition")] + cluster_list = [ + self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("cluster") + ] + partition_list = [ + self.capabilities.escape_identifier(c["name"]) + for c in new_columns + if c.get("partition") + ] # partition by must be added first if len(partition_list) > 0: if len(partition_list) > 1: - raise DestinationSchemaWillNotUpdate(canonical_name, partition_list, "Partition requested for more than one column") + raise DestinationSchemaWillNotUpdate( + canonical_name, partition_list, "Partition requested for more than one column" + ) else: sql[0] = sql[0] + f"\nPARTITION BY DATE({partition_list[0]})" if len(cluster_list) > 0: @@ -248,7 +295,9 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: name = self.capabilities.escape_identifier(c["name"]) - return f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}" + return ( + f"{name} {self.type_mapper.to_db_type(c, table_format)} {self._gen_not_null(c.get('nullable', True))}" + ) def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: schema_table: TTableSchemaColumns = {} @@ -256,7 +305,7 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns] table = self.sql_client.native_connection.get_table( self.sql_client.make_qualified_table_name(table_name, escape=False), retry=self.sql_client._default_retry, - timeout=self.config.http_timeout + timeout=self.config.http_timeout, ) partition_field = table.time_partitioning.field if table.time_partitioning else None for c in table.schema: @@ -269,7 +318,7 @@ def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns] "foreign_key": False, "cluster": c.name in (table.clustering_fields or []), 
"partition": c.name == partition_field, - **self._from_db_type(c.field_type, c.precision, c.scale) + **self._from_db_type(c.field_type, c.precision, c.scale), } schema_table[c.name] = schema_c return True, schema_table @@ -293,7 +342,10 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load if ext == "parquet": # if table contains complex types, we cannot load with parquet if table_schema_has_type(table, "complex"): - raise LoadJobTerminalException(file_path, "Bigquery cannot load into JSON data type from parquet. Use jsonl instead.") + raise LoadJobTerminalException( + file_path, + "Bigquery cannot load into JSON data type from parquet. Use jsonl instead.", + ) source_format = bigquery.SourceFormat.PARQUET # parquet needs NUMERIC type autodetection decimal_target_types = ["NUMERIC", "BIGNUMERIC"] @@ -306,29 +358,32 @@ def _create_load_job(self, table: TTableSchema, file_path: str) -> bigquery.Load source_format=source_format, decimal_target_types=decimal_target_types, ignore_unknown_values=False, - max_bad_records=0) + max_bad_records=0, + ) if bucket_path: return self.sql_client.native_connection.load_table_from_uri( - bucket_path, - self.sql_client.make_qualified_table_name(table_name, escape=False), - job_id=job_id, - job_config=job_config, - timeout=self.config.file_upload_timeout - ) + bucket_path, + self.sql_client.make_qualified_table_name(table_name, escape=False), + job_id=job_id, + job_config=job_config, + timeout=self.config.file_upload_timeout, + ) with open(file_path, "rb") as f: return self.sql_client.native_connection.load_table_from_file( - f, - self.sql_client.make_qualified_table_name(table_name, escape=False), - job_id=job_id, - job_config=job_config, - timeout=self.config.file_upload_timeout - ) + f, + self.sql_client.make_qualified_table_name(table_name, escape=False), + job_id=job_id, + job_config=job_config, + timeout=self.config.file_upload_timeout, + ) def _retrieve_load_job(self, file_path: str) -> bigquery.LoadJob: job_id = BigQueryLoadJob.get_job_id_from_file_path(file_path) return cast(bigquery.LoadJob, self.sql_client.native_connection.get_job(job_id)) - def _from_db_type(self, bq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, bq_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(bq_t, precision, scale) diff --git a/dlt/destinations/impl/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py index 146e137475..264553aa3a 100644 --- a/dlt/destinations/impl/bigquery/configuration.py +++ b/dlt/destinations/impl/bigquery/configuration.py @@ -16,7 +16,9 @@ class BigQueryClientConfiguration(DestinationClientDwhWithStagingConfiguration): http_timeout: float = 15.0 # connection timeout for http request to BigQuery api file_upload_timeout: float = 30 * 60.0 # a timeout for file upload when loading local files - retry_deadline: float = 60.0 # how long to retry the operation in case of error, the backoff 60s + retry_deadline: float = ( + 60.0 # how long to retry the operation in case of error, the backoff 60s + ) __config_gen_annotations__: ClassVar[List[str]] = ["location"] @@ -25,7 +27,10 @@ def get_location(self) -> str: return self.location # default was changed in credentials, emit deprecation message if self.credentials.location != "US": - warnings.warn("Setting BigQuery location in the credentials is deprecated. Please set the location directly in bigquery section ie. 
destinations.bigquery.location='EU'") + warnings.warn( + "Setting BigQuery location in the credentials is deprecated. Please set the" + " location directly in bigquery section ie. destinations.bigquery.location='EU'" + ) return self.credentials.location def fingerprint(self) -> str: @@ -35,6 +40,7 @@ def fingerprint(self) -> str: return "" if TYPE_CHECKING: + def __init__( self, destination_name: str = None, @@ -44,7 +50,5 @@ def __init__( location: str = "US", http_timeout: float = 15.0, file_upload_timeout: float = 30 * 60.0, - retry_deadline: float = 60.0 - ) -> None: - ... - + retry_deadline: float = 60.0, + ) -> None: ... diff --git a/dlt/destinations/impl/bigquery/factory.py b/dlt/destinations/impl/bigquery/factory.py index ce6ace3bf7..c2b965fccc 100644 --- a/dlt/destinations/impl/bigquery/factory.py +++ b/dlt/destinations/impl/bigquery/factory.py @@ -10,7 +10,6 @@ class bigquery(Destination[BigQueryClientConfiguration, "BigQueryClient"]): - spec = BigQueryClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: @@ -28,8 +27,4 @@ def __init__( location: t.Optional[str] = None, **kwargs: t.Any, ) -> None: - super().__init__( - credentials=credentials, - location=location, - **kwargs - ) + super().__init__(credentials=credentials, location=location, **kwargs) diff --git a/dlt/destinations/impl/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py index 4939add0da..cf5d2ecbd4 100644 --- a/dlt/destinations/impl/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -1,4 +1,3 @@ - from contextlib import contextmanager from typing import Any, AnyStr, ClassVar, Iterator, List, Optional, Sequence, Type @@ -14,19 +13,37 @@ from dlt.common.typing import StrAny from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation -from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) +from dlt.destinations.sql_client import ( + DBApiCursorImpl, + SqlClientBase, + raise_database_error, + raise_open_connection_error, +) from dlt.destinations.impl.bigquery import capabilities # terminal reasons as returned in BQ gRPC error response # https://cloud.google.com/bigquery/docs/error-messages -BQ_TERMINAL_REASONS = ["billingTierLimitExceeded", "duplicate", "invalid", "notFound", "notImplemented", "stopped", "tableUnavailable"] +BQ_TERMINAL_REASONS = [ + "billingTierLimitExceeded", + "duplicate", + "invalid", + "notFound", + "notImplemented", + "stopped", + "tableUnavailable", +] # invalidQuery is an transient error -> must be fixed by programmer class BigQueryDBApiCursorImpl(DBApiCursorImpl): """Use native BigQuery data frame support if available""" + native_cursor: BQDbApiCursor # type: ignore def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: @@ -43,7 +60,6 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: class BigQuerySqlClient(SqlClientBase[bigquery.Client], DBTransaction): - dbapi: ClassVar[DBApi] = bq_dbapi capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -53,7 +69,7 @@ def __init__( credentials: GcpServiceAccountCredentialsWithoutDefaults, location: str = "US", http_timeout: float = 15.0, - retry_deadline: float = 60.0 + retry_deadline: float = 60.0, 
) -> None: self._client: bigquery.Client = None self.credentials: GcpServiceAccountCredentialsWithoutDefaults = credentials @@ -62,16 +78,17 @@ def __init__( super().__init__(credentials.project_id, dataset_name) self._default_retry = bigquery.DEFAULT_RETRY.with_deadline(retry_deadline) - self._default_query = bigquery.QueryJobConfig(default_dataset=self.fully_qualified_dataset_name(escape=False)) + self._default_query = bigquery.QueryJobConfig( + default_dataset=self.fully_qualified_dataset_name(escape=False) + ) self._session_query: bigquery.QueryJobConfig = None - @raise_open_connection_error def open_connection(self) -> bigquery.Client: self._client = bigquery.Client( self.credentials.project_id, credentials=self.credentials.to_native_credentials(), - location=self.location + location=self.location, ) # patch the client query so our defaults are used @@ -81,7 +98,7 @@ def query_patch( query: str, retry: Any = self._default_retry, timeout: Any = self.http_timeout, - **kwargs: Any + **kwargs: Any, ) -> Any: return query_orig(query, retry=retry, timeout=timeout, **kwargs) @@ -105,8 +122,8 @@ def begin_transaction(self) -> Iterator[DBTransaction]: "BEGIN TRANSACTION;", job_config=bigquery.QueryJobConfig( create_session=True, - default_dataset=self.fully_qualified_dataset_name(escape=False) - ) + default_dataset=self.fully_qualified_dataset_name(escape=False), + ), ) self._session_query = bigquery.QueryJobConfig( create_session=False, @@ -115,7 +132,7 @@ def begin_transaction(self) -> Iterator[DBTransaction]: bigquery.query.ConnectionProperty( key="session_id", value=job.session_info.session_id ) - ] + ], ) try: job.result() @@ -124,7 +141,9 @@ def begin_transaction(self) -> Iterator[DBTransaction]: self._session_query = None raise else: - raise dbapi_exceptions.ProgrammingError("Nested transactions not supported on BigQuery") + raise dbapi_exceptions.ProgrammingError( + "Nested transactions not supported on BigQuery" + ) yield self self.commit_transaction() except Exception: @@ -150,7 +169,11 @@ def native_connection(self) -> bigquery.Client: def has_dataset(self) -> bool: try: - self._client.get_dataset(self.fully_qualified_dataset_name(escape=False), retry=self._default_retry, timeout=self.http_timeout) + self._client.get_dataset( + self.fully_qualified_dataset_name(escape=False), + retry=self._default_retry, + timeout=self.http_timeout, + ) return True except gcp_exceptions.NotFound: return False @@ -160,7 +183,7 @@ def create_dataset(self) -> None: self.fully_qualified_dataset_name(escape=False), exists_ok=False, retry=self._default_retry, - timeout=self.http_timeout + timeout=self.http_timeout, ) def drop_dataset(self) -> None: @@ -169,10 +192,12 @@ def drop_dataset(self) -> None: not_found_ok=True, delete_contents=True, retry=self._default_retry, - timeout=self.http_timeout + timeout=self.http_timeout, ) - def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as curr: if not curr.description: return None @@ -187,7 +212,7 @@ def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequen @contextmanager @raise_database_error - def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DBApiCursor]: + def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DBApiCursor]: conn: DbApiConnection = None curr: DBApiCursor = None db_args = 
args if args else kwargs if kwargs else None @@ -226,11 +251,17 @@ def _make_database_exception(cls, ex: Exception) -> Exception: return DatabaseUndefinedRelation(ex) if reason == "invalidQuery" and "was not found" in str(ex) and "Dataset" in str(ex): return DatabaseUndefinedRelation(ex) - if reason == "invalidQuery" and "Not found" in str(ex) and ("Dataset" in str(ex) or "Table" in str(ex)): + if ( + reason == "invalidQuery" + and "Not found" in str(ex) + and ("Dataset" in str(ex) or "Table" in str(ex)) + ): return DatabaseUndefinedRelation(ex) if reason == "accessDenied" and "Dataset" in str(ex) and "not exist" in str(ex): return DatabaseUndefinedRelation(ex) - if reason == "invalidQuery" and ("Unrecognized name" in str(ex) or "cannot be null" in str(ex)): + if reason == "invalidQuery" and ( + "Unrecognized name" in str(ex) or "cannot be null" in str(ex) + ): # unknown column, inserting NULL into required field return DatabaseTerminalException(ex) if reason in BQ_TERMINAL_REASONS: @@ -253,4 +284,7 @@ def is_dbapi_exception(ex: Exception) -> bool: class TransactionsNotImplementedError(NotImplementedError): def __init__(self) -> None: - super().__init__("BigQuery does not support transaction management. Instead you may wrap your SQL script in BEGIN TRANSACTION; ... COMMIT TRANSACTION;") + super().__init__( + "BigQuery does not support transaction management. Instead you may wrap your SQL script" + " in BEGIN TRANSACTION; ... COMMIT TRANSACTION;" + ) diff --git a/dlt/destinations/impl/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py index a5f77be8fd..63a975d9c3 100644 --- a/dlt/destinations/impl/duckdb/configuration.py +++ b/dlt/destinations/impl/duckdb/configuration.py @@ -7,7 +7,10 @@ from dlt.common.configuration import configspec from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.configuration.specs.exceptions import InvalidConnectionString -from dlt.common.destination.reference import DestinationClientDwhWithStagingConfiguration, DestinationClientStagingConfiguration +from dlt.common.destination.reference import ( + DestinationClientDwhWithStagingConfiguration, + DestinationClientStagingConfiguration, +) from dlt.common.typing import TSecretValue DUCK_DB_NAME = "%s.duckdb" @@ -59,6 +62,7 @@ def parse_native_representation(self, native_value: Any) -> None: try: # check if database was passed as explicit connection import duckdb + if isinstance(native_value, duckdb.DuckDBPyConnection): self._conn = native_value self._conn_owner = False @@ -134,7 +138,9 @@ def _path_in_pipeline(self, rel_path: str) -> str: if context.is_active(): # pipeline is active, get the working directory return os.path.join(context.pipeline().working_dir, rel_path) - raise RuntimeError("Attempting to use special duckdb database :pipeline: outside of pipeline context.") + raise RuntimeError( + "Attempting to use special duckdb database :pipeline: outside of pipeline context." + ) def _path_to_pipeline(self, abspath: str) -> None: from dlt.common.configuration.container import Container @@ -171,7 +177,11 @@ def _path_from_pipeline(self, default_path: str) -> Tuple[str, bool]: pipeline_path = pipeline.get_local_state_val(LOCAL_STATE_KEY) # make sure that path exists if not os.path.exists(pipeline_path): - logger.warning(f"Duckdb attached to pipeline {pipeline.pipeline_name} in path {os.path.relpath(pipeline_path)} was deleted. 
Attaching to duckdb database '{default_path}' in current folder.") + logger.warning( + f"Duckdb attached to pipeline {pipeline.pipeline_name} in path" + f" {os.path.relpath(pipeline_path)} was deleted. Attaching to duckdb" + f" database '{default_path}' in current folder." + ) else: return pipeline_path, False except KeyError: @@ -189,7 +199,9 @@ class DuckDbClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_name: Final[str] = "duckdb" # type: ignore credentials: DuckDbCredentials - create_indexes: bool = False # should unique indexes be created, this slows loading down massively + create_indexes: bool = ( + False # should unique indexes be created, this slows loading down massively + ) if TYPE_CHECKING: try: @@ -204,6 +216,5 @@ def __init__( dataset_name: str = None, default_schema_name: Optional[str] = None, create_indexes: bool = False, - staging_config: Optional[DestinationClientStagingConfiguration] = None - ) -> None: - ... + staging_config: Optional[DestinationClientStagingConfiguration] = None, + ) -> None: ... diff --git a/dlt/destinations/impl/duckdb/duck.py b/dlt/destinations/impl/duckdb/duck.py index 6e6ec359fe..735a4ce7e3 100644 --- a/dlt/destinations/impl/duckdb/duck.py +++ b/dlt/destinations/impl/duckdb/duck.py @@ -18,9 +18,7 @@ from dlt.destinations.type_mapping import TypeMapper -HINT_TO_POSTGRES_ATTR: Dict[TColumnHint, str] = { - "unique": "UNIQUE" -} +HINT_TO_POSTGRES_ATTR: Dict[TColumnHint, str] = {"unique": "UNIQUE"} # duckdb cannot load PARQUET to the same table in parallel. so serialize it per table PARQUET_TABLE_LOCK = threading.Lock() @@ -38,7 +36,7 @@ class DuckDbTypeMapper(TypeMapper): "timestamp": "TIMESTAMP WITH TIME ZONE", "bigint": "BIGINT", "binary": "BLOB", - "time": "TIME" + "time": "TIME", } sct_to_dbt = { @@ -69,7 +67,9 @@ class DuckDbTypeMapper(TypeMapper): "TIMESTAMP_NS": "timestamp", } - def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_integer_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: if precision is None: return "BIGINT" # Precision is number of bits @@ -83,7 +83,9 @@ def to_db_integer_type(self, precision: Optional[int], table_format: TTableForma return "BIGINT" return "HUGEINT" - def to_db_datetime_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_datetime_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: if precision is None or precision == 6: return super().to_db_datetime_type(precision, table_format) if precision == 0: @@ -92,9 +94,13 @@ def to_db_datetime_type(self, precision: Optional[int], table_format: TTableForm return "TIMESTAMP_MS" if precision == 9: return "TIMESTAMP_NS" - raise TerminalValueError(f"timestamp {precision} cannot be mapped into duckdb TIMESTAMP typ") + raise TerminalValueError( + f"timestamp {precision} cannot be mapped into duckdb TIMESTAMP typ" + ) - def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def from_db_type( + self, db_type: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: # duckdb provides the types with scale and precision db_type = db_type.split("(")[0].upper() if db_type == "DECIMAL": @@ -114,7 +120,9 @@ def __init__(self, table_name: str, file_path: str, sql_client: DuckDbSqlClient) # lock when creating a new lock with PARQUET_TABLE_LOCK: # create or get lock per table name - lock: threading.Lock = 
TABLES_LOCKS.setdefault(qualified_table_name, threading.Lock()) + lock: threading.Lock = TABLES_LOCKS.setdefault( + qualified_table_name, threading.Lock() + ) elif file_path.endswith("jsonl"): # NOTE: loading JSON does not work in practice on duckdb: the missing keys fail the load instead of being interpreted as NULL source_format = "JSON" # newline delimited, compression auto @@ -125,8 +133,10 @@ def __init__(self, table_name: str, file_path: str, sql_client: DuckDbSqlClient) with maybe_context(lock): with sql_client.begin_transaction(): - sql_client.execute_sql(f"COPY {qualified_table_name} FROM '{file_path}' ( FORMAT {source_format} {options});") - + sql_client.execute_sql( + f"COPY {qualified_table_name} FROM '{file_path}' ( FORMAT" + f" {source_format} {options});" + ) def state(self) -> TLoadJobState: return "completed" @@ -134,15 +144,12 @@ def state(self) -> TLoadJobState: def exception(self) -> str: raise NotImplementedError() -class DuckDbClient(InsertValuesJobClient): +class DuckDbClient(InsertValuesJobClient): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: DuckDbClientConfiguration) -> None: - sql_client = DuckDbSqlClient( - config.normalize_dataset_name(schema), - config.credentials - ) + sql_client = DuckDbSqlClient(config.normalize_dataset_name(schema), config.credentials) super().__init__(schema, config, sql_client) self.config: DuckDbClientConfiguration = config self.sql_client: DuckDbSqlClient = sql_client # type: ignore @@ -156,9 +163,17 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> return job def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - hints_str = " ".join(self.active_hints.get(h, "") for h in self.active_hints.keys() if c.get(h, False) is True) + hints_str = " ".join( + self.active_hints.get(h, "") + for h in self.active_hints.keys() + if c.get(h, False) is True + ) column_name = self.capabilities.escape_identifier(c["name"]) - return f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" + return ( + f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" + ) - def _from_db_type(self, pq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, pq_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) diff --git a/dlt/destinations/impl/duckdb/factory.py b/dlt/destinations/impl/duckdb/factory.py index 1b882c52a1..1e3edaea66 100644 --- a/dlt/destinations/impl/duckdb/factory.py +++ b/dlt/destinations/impl/duckdb/factory.py @@ -10,7 +10,6 @@ class duckdb(Destination[DuckDbClientConfiguration, "DuckDbClient"]): - spec = DuckDbClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: @@ -24,7 +23,9 @@ def client_class(self) -> t.Type["DuckDbClient"]: def __init__( self, - credentials: t.Union[DuckDbCredentials, t.Dict[str, t.Any], str, "DuckDBPyConnection"] = None, + credentials: t.Union[ + DuckDbCredentials, t.Dict[str, t.Any], str, "DuckDBPyConnection" + ] = None, create_indexes: bool = False, **kwargs: t.Any, ) -> None: diff --git a/dlt/destinations/impl/duckdb/sql_client.py b/dlt/destinations/impl/duckdb/sql_client.py index cb4e1678a2..2863d4943e 100644 --- a/dlt/destinations/impl/duckdb/sql_client.py +++ b/dlt/destinations/impl/duckdb/sql_client.py @@ -4,9 +4,18 @@ from 
typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame -from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error +from dlt.destinations.sql_client import ( + SqlClientBase, + DBApiCursorImpl, + raise_database_error, + raise_open_connection_error, +) from dlt.destinations.impl.duckdb import capabilities from dlt.destinations.impl.duckdb.configuration import DuckDbBaseCredentials @@ -14,6 +23,7 @@ class DuckDBDBApiCursorImpl(DBApiCursorImpl): """Use native BigQuery data frame support if available""" + native_cursor: duckdb.DuckDBPyConnection # type: ignore vector_size: ClassVar[int] = 2048 @@ -21,7 +31,9 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: if chunk_size is None: return self.native_cursor.df(**kwargs) else: - multiple = chunk_size // self.vector_size + (0 if self.vector_size % chunk_size == 0 else 1) + multiple = chunk_size // self.vector_size + ( + 0 if self.vector_size % chunk_size == 0 else 1 + ) df = self.native_cursor.fetch_df_chunk(multiple, **kwargs) if df.shape[0] == 0: return None @@ -30,7 +42,6 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> DataFrame: class DuckDbSqlClient(SqlClientBase[duckdb.DuckDBPyConnection], DBTransaction): - dbapi: ClassVar[DBApi] = duckdb capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -44,11 +55,11 @@ def open_connection(self) -> duckdb.DuckDBPyConnection: self._conn = self.credentials.borrow_conn(read_only=self.credentials.read_only) # TODO: apply config settings from credentials self._conn.execute("PRAGMA enable_checkpoint_on_shutdown;") - config={ + config = { "search_path": self.fully_qualified_dataset_name(), "TimeZone": "UTC", - "checkpoint_threshold": "1gb" - } + "checkpoint_threshold": "1gb", + } if config: for k, v in config.items(): try: @@ -91,7 +102,9 @@ def rollback_transaction(self) -> None: def native_connection(self) -> duckdb.DuckDBPyConnection: return self._conn - def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as curr: if curr.description is None: return None @@ -130,7 +143,9 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB # return None def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + return ( + self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + ) @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: @@ -144,7 +159,15 @@ def _make_database_exception(cls, ex: Exception) -> Exception: raise DatabaseUndefinedRelation(ex) # duckdb raises TypeError on malformed query parameters return DatabaseTransientException(duckdb.ProgrammingError(ex)) - elif isinstance(ex, (duckdb.OperationalError, duckdb.InternalError, duckdb.SyntaxException, duckdb.ParserException)): + elif isinstance( + ex, + ( + 
duckdb.OperationalError, + duckdb.InternalError, + duckdb.SyntaxException, + duckdb.ParserException, + ), + ): term = cls._maybe_make_terminal_exception_from_data_error(ex) if term: return term diff --git a/dlt/destinations/impl/dummy/__init__.py b/dlt/destinations/impl/dummy/__init__.py index 476523cb8f..81dea88a5f 100644 --- a/dlt/destinations/impl/dummy/__init__.py +++ b/dlt/destinations/impl/dummy/__init__.py @@ -5,7 +5,13 @@ from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration -@with_config(spec=DummyClientConfiguration, sections=(known_sections.DESTINATION, "dummy",)) +@with_config( + spec=DummyClientConfiguration, + sections=( + known_sections.DESTINATION, + "dummy", + ), +) def _configure(config: DummyClientConfiguration = config.value) -> DummyClientConfiguration: return config diff --git a/dlt/destinations/impl/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py index 1a8072300c..83089a0ae7 100644 --- a/dlt/destinations/impl/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -2,12 +2,14 @@ from dlt.common.configuration import configspec from dlt.common.destination import TLoaderFileFormat -from dlt.common.destination.reference import DestinationClientConfiguration, CredentialsConfiguration +from dlt.common.destination.reference import ( + DestinationClientConfiguration, + CredentialsConfiguration, +) @configspec class DummyClientCredentials(CredentialsConfiguration): - def __str__(self) -> str: return "/dev/null" @@ -28,6 +30,7 @@ class DummyClientConfiguration(DestinationClientConfiguration): credentials: DummyClientCredentials = None if TYPE_CHECKING: + def __init__( self, destination_name: str = None, @@ -40,5 +43,4 @@ def __init__( exception_prob: float = None, timeout: float = None, fail_in_init: bool = None, - ) -> None: - ... + ) -> None: ... 
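For context on the duck.py hunk above: DuckDbTypeMapper.to_db_integer_type is only reformatted here, not changed, and it selects the narrowest DuckDB integer type from a column's precision expressed in bits. A minimal standalone sketch of that selection follows; the intermediate 8/16/32-bit cut-offs are assumed (only the BIGINT and HUGEINT branches are visible in this hunk), and the function name and asserts are illustrative rather than dlt's TypeMapper API.

    # Illustrative sketch only, not dlt's TypeMapper API: map a column's integer
    # precision (in bits) to a DuckDB integer type, mirroring the thresholds that
    # DuckDbTypeMapper.to_db_integer_type in the hunk above appears to use.
    from typing import Optional

    def duckdb_integer_type(precision: Optional[int]) -> str:
        if precision is None:
            return "BIGINT"        # no precision hint: default to 64-bit
        if precision <= 8:         # assumed cut-off
            return "TINYINT"
        if precision <= 16:        # assumed cut-off
            return "SMALLINT"
        if precision <= 32:        # assumed cut-off
            return "INTEGER"
        if precision <= 64:
            return "BIGINT"
        return "HUGEINT"           # wider than 64 bits: DuckDB's 128-bit integer

    assert duckdb_integer_type(None) == "BIGINT"
    assert duckdb_integer_type(32) == "INTEGER"
    assert duckdb_integer_type(128) == "HUGEINT"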
diff --git a/dlt/destinations/impl/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py index 0bc061a7dd..300ce253b2 100644 --- a/dlt/destinations/impl/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -8,10 +8,20 @@ from dlt.common.schema.typing import TWriteDisposition from dlt.common.storages import FileStorage from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import FollowupJob, NewLoadJob, TLoadJobState, LoadJob, JobClientBase - -from dlt.destinations.exceptions import (LoadJobNotExistsException, LoadJobInvalidStateTransitionException, - DestinationTerminalException, DestinationTransientException) +from dlt.common.destination.reference import ( + FollowupJob, + NewLoadJob, + TLoadJobState, + LoadJob, + JobClientBase, +) + +from dlt.destinations.exceptions import ( + LoadJobNotExistsException, + LoadJobInvalidStateTransitionException, + DestinationTerminalException, + DestinationTransientException, +) from dlt.destinations.impl.dummy import capabilities from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration @@ -31,7 +41,6 @@ def __init__(self, file_name: str, config: DummyClientConfiguration) -> None: if s == "retry": raise DestinationTransientException(self._exception) - def state(self) -> TLoadJobState: # this should poll the server for a job status, here we simulate various outcomes if self._status == "running": @@ -90,10 +99,14 @@ def is_storage_initialized(self) -> bool: def drop_storage(self) -> None: pass - def update_stored_schema(self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None) -> Optional[TSchemaTables]: + def update_stored_schema( + self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None + ) -> Optional[TSchemaTables]: applied_update = super().update_stored_schema(only_tables, expected_update) if self.config.fail_schema_update: - raise DestinationTransientException("Raise on schema update due to fail_schema_update config flag") + raise DestinationTransientException( + "Raise on schema update due to fail_schema_update config flag" + ) return applied_update def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: @@ -115,7 +128,9 @@ def restore_file_load(self, file_path: str) -> LoadJob: raise LoadJobNotExistsException(job_id) return JOBS[job_id] - def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def create_table_chain_completed_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: """Creates a list of followup jobs that should be executed after a table chain is completed""" return [] @@ -125,11 +140,10 @@ def complete_load(self, load_id: str) -> None: def __enter__(self) -> "DummyClient": return self - def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType + ) -> None: pass def _create_job(self, job_id: str) -> LoadDummyJob: - return LoadDummyJob( - job_id, - config=self.config - ) + return LoadDummyJob(job_id, config=self.config) diff --git a/dlt/destinations/impl/dummy/factory.py b/dlt/destinations/impl/dummy/factory.py index 265c77b0f4..4bda39fc81 100644 --- a/dlt/destinations/impl/dummy/factory.py +++ b/dlt/destinations/impl/dummy/factory.py @@ -2,7 +2,10 @@ from dlt.common.destination import Destination, DestinationCapabilitiesContext -from 
dlt.destinations.impl.dummy.configuration import DummyClientConfiguration, DummyClientCredentials +from dlt.destinations.impl.dummy.configuration import ( + DummyClientConfiguration, + DummyClientCredentials, +) from dlt.destinations.impl.dummy import capabilities if t.TYPE_CHECKING: @@ -10,7 +13,6 @@ class dummy(Destination[DummyClientConfiguration, "DummyClient"]): - spec = DummyClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: diff --git a/dlt/destinations/impl/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py index 174dfafb1a..11a008e426 100644 --- a/dlt/destinations/impl/filesystem/configuration.py +++ b/dlt/destinations/impl/filesystem/configuration.py @@ -3,20 +3,24 @@ from typing import Final, Type, Optional, Any, TYPE_CHECKING from dlt.common.configuration import configspec, resolve_type -from dlt.common.destination.reference import CredentialsConfiguration, DestinationClientStagingConfiguration +from dlt.common.destination.reference import ( + CredentialsConfiguration, + DestinationClientStagingConfiguration, +) from dlt.common.storages import FilesystemConfiguration @configspec -class FilesystemDestinationClientConfiguration(FilesystemConfiguration, DestinationClientStagingConfiguration): # type: ignore[misc] +class FilesystemDestinationClientConfiguration(FilesystemConfiguration, DestinationClientStagingConfiguration): # type: ignore[misc] destination_name: Final[str] = "filesystem" # type: ignore - @resolve_type('credentials') + @resolve_type("credentials") def resolve_credentials_type(self) -> Type[CredentialsConfiguration]: # use known credentials or empty credentials for unknown protocol return self.PROTOCOL_CREDENTIALS.get(self.protocol) or Optional[CredentialsConfiguration] # type: ignore[return-value] if TYPE_CHECKING: + def __init__( self, destination_name: str = None, @@ -24,5 +28,4 @@ def __init__( dataset_name: str = None, default_schema_name: Optional[str] = None, bucket_url: str = None, - ) -> None: - ... + ) -> None: ... 
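The filesystem configuration hunk above keeps resolve_credentials_type as-is: it picks a credentials class from the bucket protocol via PROTOCOL_CREDENTIALS and falls back to a generic credentials type for unknown protocols. A simplified standalone sketch of that dispatch, with placeholder classes and a plain dict default instead of dlt's real credential specs:

    # Hypothetical sketch of protocol-to-credentials dispatch; the class names
    # and mapping below are placeholders, not dlt's actual credential specs.
    from urllib.parse import urlparse

    class GenericCredentials: ...
    class S3Credentials(GenericCredentials): ...
    class GcsCredentials(GenericCredentials): ...

    PROTOCOL_CREDENTIALS = {"s3": S3Credentials, "gs": GcsCredentials}

    def credentials_type_for(bucket_url: str) -> type:
        # local paths have no scheme; treat them as the "file" protocol
        protocol = urlparse(bucket_url).scheme or "file"
        return PROTOCOL_CREDENTIALS.get(protocol, GenericCredentials)

    assert credentials_type_for("s3://my-bucket/data") is S3Credentials
    assert credentials_type_for("/local/path") is GenericCredentials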
diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py index 4e2a716d79..e0d11bf0fa 100644 --- a/dlt/destinations/impl/filesystem/factory.py +++ b/dlt/destinations/impl/filesystem/factory.py @@ -10,7 +10,6 @@ class filesystem(Destination[FilesystemDestinationClientConfiguration, "FilesystemClient"]): - spec = FilesystemDestinationClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: diff --git a/dlt/destinations/impl/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py index fe349aac6b..f3127639fc 100644 --- a/dlt/destinations/impl/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -9,7 +9,14 @@ from dlt.common.schema import Schema, TSchemaTables, TTableSchema from dlt.common.storages import FileStorage, LoadStorage, fsspec_from_config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import NewLoadJob, TLoadJobState, LoadJob, JobClientBase, FollowupJob, WithStagingDataset +from dlt.common.destination.reference import ( + NewLoadJob, + TLoadJobState, + LoadJob, + JobClientBase, + FollowupJob, + WithStagingDataset, +) from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.impl.filesystem import capabilities @@ -20,38 +27,48 @@ class LoadFilesystemJob(LoadJob): def __init__( - self, - local_path: str, - dataset_path: str, - *, - config: FilesystemDestinationClientConfiguration, - schema_name: str, - load_id: str + self, + local_path: str, + dataset_path: str, + *, + config: FilesystemDestinationClientConfiguration, + schema_name: str, + load_id: str, ) -> None: file_name = FileStorage.get_file_name_from_file_path(local_path) self.config = config self.dataset_path = dataset_path - self.destination_file_name = LoadFilesystemJob.make_destination_filename(config.layout, file_name, schema_name, load_id) + self.destination_file_name = LoadFilesystemJob.make_destination_filename( + config.layout, file_name, schema_name, load_id + ) super().__init__(file_name) fs_client, _ = fsspec_from_config(config) - self.destination_file_name = LoadFilesystemJob.make_destination_filename(config.layout, file_name, schema_name, load_id) + self.destination_file_name = LoadFilesystemJob.make_destination_filename( + config.layout, file_name, schema_name, load_id + ) item = self.make_remote_path() logger.info("PUT file {item}") fs_client.put_file(local_path, item) @staticmethod - def make_destination_filename(layout: str, file_name: str, schema_name: str, load_id: str) -> str: + def make_destination_filename( + layout: str, file_name: str, schema_name: str, load_id: str + ) -> str: job_info = LoadStorage.parse_job_file_name(file_name) - return path_utils.create_path(layout, - schema_name=schema_name, - table_name=job_info.table_name, - load_id=load_id, - file_id=job_info.file_id, - ext=job_info.file_format) + return path_utils.create_path( + layout, + schema_name=schema_name, + table_name=job_info.table_name, + load_id=load_id, + file_id=job_info.file_id, + ext=job_info.file_format, + ) def make_remote_path(self) -> str: - return f"{self.config.protocol}://{posixpath.join(self.dataset_path, self.destination_file_name)}" + return ( + f"{self.config.protocol}://{posixpath.join(self.dataset_path, self.destination_file_name)}" + ) def state(self) -> TLoadJobState: return "completed" @@ -64,7 +81,9 @@ class FollowupFilesystemJob(FollowupJob, LoadFilesystemJob): def create_followup_jobs(self, next_state: str) -> List[NewLoadJob]: 
jobs = super().create_followup_jobs(next_state) if next_state == "completed": - ref_job = NewReferenceJob(file_name=self.file_name(), status="running", remote_path=self.make_remote_path()) + ref_job = NewReferenceJob( + file_name=self.file_name(), status="running", remote_path=self.make_remote_path() + ) jobs.append(ref_job) return jobs @@ -93,12 +112,13 @@ def drop_storage(self) -> None: def dataset_path(self) -> str: return posixpath.join(self.fs_path, self._dataset_path) - @contextmanager def with_staging_dataset(self) -> Iterator["FilesystemClient"]: current_dataset_path = self._dataset_path try: - self._dataset_path = self.schema.naming.normalize_table_identifier(current_dataset_path + "_staging") + self._dataset_path = self.schema.naming.normalize_table_identifier( + current_dataset_path + "_staging" + ) yield self finally: # restore previous dataset name @@ -113,7 +133,9 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # print(f"TRUNCATE {truncated_dirs}") truncate_prefixes: Set[str] = set() for table in truncate_tables: - table_prefix = self.table_prefix_layout.format(schema_name=self.schema.name, table_name=table) + table_prefix = self.table_prefix_layout.format( + schema_name=self.schema.name, table_name=table + ) truncate_prefixes.add(posixpath.join(self.dataset_path, table_prefix)) # print(f"TRUNCATE PREFIXES {truncate_prefixes}") @@ -135,9 +157,14 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: # print(f"DEL {item}") self.fs_client.rm(item) except FileNotFoundError: - logger.info(f"Directory or path to truncate tables {truncate_dir} does not exist but it should be created previously!") - - def update_stored_schema(self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None) -> TSchemaTables: + logger.info( + f"Directory or path to truncate tables {truncate_dir} does not exist but it" + " should be created previously!" 
+ ) + + def update_stored_schema( + self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None + ) -> TSchemaTables: # create destination dirs for all tables dirs_to_create = self._get_table_dirs(only_tables or self.schema.tables.keys()) for directory in dirs_to_create: @@ -148,7 +175,9 @@ def _get_table_dirs(self, table_names: Iterable[str]) -> Set[str]: """Gets unique directories where table data is stored.""" table_dirs: Set[str] = set() for table_name in table_names: - table_prefix = self.table_prefix_layout.format(schema_name=self.schema.name, table_name=table_name) + table_prefix = self.table_prefix_layout.format( + schema_name=self.schema.name, table_name=table_name + ) destination_dir = posixpath.join(self.dataset_path, table_prefix) # extract the path component table_dirs.add(os.path.dirname(destination_dir)) @@ -164,7 +193,7 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> self.dataset_path, config=self.config, schema_name=self.schema.name, - load_id=load_id + load_id=load_id, ) def restore_file_load(self, file_path: str) -> LoadJob: @@ -179,7 +208,9 @@ def complete_load(self, load_id: str) -> None: def __enter__(self) -> "FilesystemClient": return self - def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType + ) -> None: pass def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: diff --git a/dlt/destinations/impl/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py index a376f1a5aa..01cfa99efc 100644 --- a/dlt/destinations/impl/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -31,11 +31,13 @@ def _token_to_password(self) -> None: def borrow_conn(self, read_only: bool) -> Any: from duckdb import HTTPException, InvalidInputException + try: return super().borrow_conn(read_only) except (InvalidInputException, HTTPException) as ext_ex: - if 'Failed to download extension' in str(ext_ex) and "motherduck" in str(ext_ex): + if "Failed to download extension" in str(ext_ex) and "motherduck" in str(ext_ex): from importlib.metadata import version as pkg_version + raise MotherduckLocalVersionNotSupported(pkg_version("duckdb")) from ext_ex raise @@ -47,7 +49,10 @@ def parse_native_representation(self, native_value: Any) -> None: def on_resolved(self) -> None: self._token_to_password() if self.drivername == MOTHERDUCK_DRIVERNAME and not self.password: - raise ConfigurationValueError("Motherduck schema 'md' was specified without corresponding token or password. The required format of connection string is: md:///?token=") + raise ConfigurationValueError( + "Motherduck schema 'md' was specified without corresponding token or password. 
The" + " required format of connection string is: md:///?token=" + ) @configspec @@ -55,7 +60,9 @@ class MotherDuckClientConfiguration(DestinationClientDwhWithStagingConfiguration destination_name: Final[str] = "motherduck" # type: ignore credentials: MotherDuckCredentials - create_indexes: bool = False # should unique indexes be created, this slows loading down massively + create_indexes: bool = ( + False # should unique indexes be created, this slows loading down massively + ) def fingerprint(self) -> str: """Returns a fingerprint of user access token""" @@ -67,4 +74,7 @@ def fingerprint(self) -> str: class MotherduckLocalVersionNotSupported(DestinationTerminalException): def __init__(self, duckdb_version: str) -> None: self.duckdb_version = duckdb_version - super().__init__(f"Looks like your local duckdb version ({duckdb_version}) is not supported by Motherduck") + super().__init__( + f"Looks like your local duckdb version ({duckdb_version}) is not supported by" + " Motherduck" + ) diff --git a/dlt/destinations/impl/motherduck/factory.py b/dlt/destinations/impl/motherduck/factory.py index 17cf4a76b4..7986958912 100644 --- a/dlt/destinations/impl/motherduck/factory.py +++ b/dlt/destinations/impl/motherduck/factory.py @@ -1,7 +1,10 @@ import typing as t from dlt.common.destination import Destination, DestinationCapabilitiesContext -from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration +from dlt.destinations.impl.motherduck.configuration import ( + MotherDuckCredentials, + MotherDuckClientConfiguration, +) from dlt.destinations.impl.motherduck import capabilities if t.TYPE_CHECKING: @@ -10,7 +13,6 @@ class motherduck(Destination[MotherDuckClientConfiguration, "MotherDuckClient"]): - spec = MotherDuckClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: @@ -24,7 +26,9 @@ def client_class(self) -> t.Type["MotherDuckClient"]: def __init__( self, - credentials: t.Union[MotherDuckCredentials, str, t.Dict[str, t.Any], "DuckDBPyConnection"] = None, + credentials: t.Union[ + MotherDuckCredentials, str, t.Dict[str, t.Any], "DuckDBPyConnection" + ] = None, create_indexes: bool = False, **kwargs: t.Any, ) -> None: diff --git a/dlt/destinations/impl/motherduck/motherduck.py b/dlt/destinations/impl/motherduck/motherduck.py index 9822f2b7b6..c695d9715e 100644 --- a/dlt/destinations/impl/motherduck/motherduck.py +++ b/dlt/destinations/impl/motherduck/motherduck.py @@ -11,14 +11,10 @@ class MotherDuckClient(DuckDbClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: MotherDuckClientConfiguration) -> None: super().__init__(schema, config) # type: ignore - sql_client = MotherDuckSqlClient( - config.normalize_dataset_name(schema), - config.credentials - ) + sql_client = MotherDuckSqlClient(config.normalize_dataset_name(schema), config.credentials) self.config: MotherDuckClientConfiguration = config # type: ignore self.sql_client: MotherDuckSqlClient = sql_client diff --git a/dlt/destinations/impl/motherduck/sql_client.py b/dlt/destinations/impl/motherduck/sql_client.py index 672c377fd9..7990f90947 100644 --- a/dlt/destinations/impl/motherduck/sql_client.py +++ b/dlt/destinations/impl/motherduck/sql_client.py @@ -4,9 +4,18 @@ from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, 
DatabaseUndefinedRelation +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame -from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error +from dlt.destinations.sql_client import ( + SqlClientBase, + DBApiCursorImpl, + raise_database_error, + raise_open_connection_error, +) from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient, DuckDBDBApiCursorImpl from dlt.destinations.impl.motherduck import capabilities @@ -14,7 +23,6 @@ class MotherDuckSqlClient(DuckDbSqlClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, dataset_name: str, credentials: MotherDuckCredentials) -> None: @@ -22,6 +30,12 @@ def __init__(self, dataset_name: str, credentials: MotherDuckCredentials) -> Non self.database_name = credentials.database def fully_qualified_dataset_name(self, escape: bool = True) -> str: - database_name = self.capabilities.escape_identifier(self.database_name) if escape else self.database_name - dataset_name = self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + database_name = ( + self.capabilities.escape_identifier(self.database_name) + if escape + else self.database_name + ) + dataset_name = ( + self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + ) return f"{database_name}.{dataset_name}" diff --git a/dlt/destinations/impl/mssql/__init__.py b/dlt/destinations/impl/mssql/__init__.py index 40e971cacf..e9d9fe24fd 100644 --- a/dlt/destinations/impl/mssql/__init__.py +++ b/dlt/destinations/impl/mssql/__init__.py @@ -19,7 +19,7 @@ def capabilities() -> DestinationCapabilitiesContext: caps.max_column_identifier_length = 128 caps.max_query_length = 4 * 1024 * 64 * 1024 caps.is_max_query_length_in_bytes = True - caps.max_text_data_type_length = 2 ** 30 - 1 + caps.max_text_data_type_length = 2**30 - 1 caps.is_max_text_data_type_length_in_bytes = False caps.supports_ddl_transactions = True caps.max_rows_per_insert = 1000 diff --git a/dlt/destinations/impl/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py index 17f10ddff0..9d100b34e6 100644 --- a/dlt/destinations/impl/mssql/configuration.py +++ b/dlt/destinations/impl/mssql/configuration.py @@ -45,16 +45,17 @@ def _get_odbc_driver(self) -> str: if self.odbc_driver: return self.odbc_driver # Pick a default driver if available - supported_drivers = ['ODBC Driver 18 for SQL Server', 'ODBC Driver 17 for SQL Server'] + supported_drivers = ["ODBC Driver 18 for SQL Server", "ODBC Driver 17 for SQL Server"] import pyodbc + available_drivers = pyodbc.drivers() for driver in supported_drivers: if driver in available_drivers: return driver docs_url = "https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver16" raise SystemConfigurationException( - f"No supported ODBC driver found for MS SQL Server. " - f"See {docs_url} for information on how to install the '{supported_drivers[0]}' on your platform." + f"No supported ODBC driver found for MS SQL Server. See {docs_url} for information on" + f" how to install the '{supported_drivers[0]}' on your platform." 
) def to_odbc_dsn(self) -> str: @@ -70,7 +71,6 @@ def to_odbc_dsn(self) -> str: return ";".join([f"{k}={v}" for k, v in params.items()]) - @configspec class MsSqlClientConfiguration(DestinationClientDwhWithStagingConfiguration): destination_name: Final[str] = "mssql" # type: ignore diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py index c98531ca79..6c8d0dfe40 100644 --- a/dlt/destinations/impl/mssql/factory.py +++ b/dlt/destinations/impl/mssql/factory.py @@ -10,7 +10,6 @@ class mssql(Destination[MsSqlClientConfiguration, "MsSqlClient"]): - spec = MsSqlClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: diff --git a/dlt/destinations/impl/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py index 851122f20c..7561003fb4 100644 --- a/dlt/destinations/impl/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -19,9 +19,7 @@ from dlt.destinations.type_mapping import TypeMapper -HINT_TO_MSSQL_ATTR: Dict[TColumnHint, str] = { - "unique": "UNIQUE" -} +HINT_TO_MSSQL_ATTR: Dict[TColumnHint, str] = {"unique": "UNIQUE"} class MsSqlTypeMapper(TypeMapper): @@ -44,7 +42,7 @@ class MsSqlTypeMapper(TypeMapper): "binary": "varbinary(%i)", "decimal": "decimal(%i,%i)", "time": "time(%i)", - "wei": "decimal(%i,%i)" + "wei": "decimal(%i,%i)", } dbt_to_sct = { @@ -62,7 +60,9 @@ class MsSqlTypeMapper(TypeMapper): "int": "bigint", } - def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_integer_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: if precision is None: return "bigint" if precision <= 8: @@ -73,7 +73,9 @@ def to_db_integer_type(self, precision: Optional[int], table_format: TTableForma return "int" return "bigint" - def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def from_db_type( + self, db_type: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: if db_type == "numeric": if (precision, scale) == self.capabilities.wei_precision: return dict(data_type="wei") @@ -81,9 +83,13 @@ def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[i class MsSqlStagingCopyJob(SqlStagingCopyJob): - @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): @@ -92,7 +98,10 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient # drop destination table sql.append(f"DROP TABLE IF EXISTS {table_name};") # moving staging table to destination schema - sql.append(f"ALTER SCHEMA {sql_client.fully_qualified_dataset_name()} TRANSFER {staging_table_name};") + sql.append( + f"ALTER SCHEMA {sql_client.fully_qualified_dataset_name()} TRANSFER" + f" {staging_table_name};" + ) # recreate staging table sql.append(f"SELECT * INTO {staging_table_name} FROM {table_name} WHERE 1 = 0;") return sql @@ -100,13 +109,24 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient class MsSqlMergeJob(SqlMergeJob): @classmethod - def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: str, key_clauses: Sequence[str], for_delete: bool) -> List[str]: - """Generate 
sql clauses that may be used to select or delete rows in root table of destination dataset - """ + def gen_key_table_clauses( + cls, + root_table_name: str, + staging_root_table_name: str, + key_clauses: Sequence[str], + for_delete: bool, + ) -> List[str]: + """Generate sql clauses that may be used to select or delete rows in root table of destination dataset""" if for_delete: # MS SQL doesn't support alias in DELETE FROM - return [f"FROM {root_table_name} WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} WHERE {' OR '.join([c.format(d=root_table_name,s=staging_root_table_name) for c in key_clauses])})"] - return SqlMergeJob.gen_key_table_clauses(root_table_name, staging_root_table_name, key_clauses, for_delete) + return [ + f"FROM {root_table_name} WHERE EXISTS (SELECT 1 FROM" + f" {staging_root_table_name} WHERE" + f" {' OR '.join([c.format(d=root_table_name,s=staging_root_table_name) for c in key_clauses])})" + ] + return SqlMergeJob.gen_key_table_clauses( + root_table_name, staging_root_table_name, key_clauses, for_delete + ) @classmethod def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str: @@ -115,18 +135,14 @@ def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str: @classmethod def _new_temp_table_name(cls, name_prefix: str) -> str: name = SqlMergeJob._new_temp_table_name(name_prefix) - return '#' + name + return "#" + name class MsSqlClient(InsertValuesJobClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: MsSqlClientConfiguration) -> None: - sql_client = PyOdbcMsSqlClient( - config.normalize_dataset_name(schema), - config.credentials - ) + sql_client = PyOdbcMsSqlClient(config.normalize_dataset_name(schema), config.credentials) super().__init__(schema, config, sql_client) self.config: MsSqlClientConfiguration = config self.sql_client = sql_client @@ -136,9 +152,13 @@ def __init__(self, schema: Schema, config: MsSqlClientConfiguration) -> None: def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [MsSqlMergeJob.from_table_chain(table_chain, self.sql_client)] - def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None) -> List[str]: + def _make_add_column_sql( + self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None + ) -> List[str]: # Override because mssql requires multiple columns in a single ADD COLUMN clause - return ["ADD \n" + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns)] + return [ + "ADD \n" + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns) + ] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: sc_type = c["data_type"] @@ -148,14 +168,22 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non else: db_type = self.type_mapper.to_db_type(c) - hints_str = " ".join(self.active_hints.get(h, "") for h in self.active_hints.keys() if c.get(h, False) is True) + hints_str = " ".join( + self.active_hints.get(h, "") + for h in self.active_hints.keys() + if c.get(h, False) is True + ) column_name = self.capabilities.escape_identifier(c["name"]) return f"{column_name} {db_type} {hints_str} {self._gen_not_null(c['nullable'])}" - def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: if 
self.config.replace_strategy == "staging-optimized": return [MsSqlStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) - def _from_db_type(self, pq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, pq_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) diff --git a/dlt/destinations/impl/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py index 5372fa3626..427518feeb 100644 --- a/dlt/destinations/impl/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -9,9 +9,18 @@ from contextlib import contextmanager from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction -from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error +from dlt.destinations.sql_client import ( + DBApiCursorImpl, + SqlClientBase, + raise_database_error, + raise_open_connection_error, +) from dlt.destinations.impl.mssql.configuration import MsSqlCredentials from dlt.destinations.impl.mssql import capabilities @@ -21,12 +30,18 @@ def handle_datetimeoffset(dto_value: bytes) -> datetime: # ref: https://github.com/mkleehammer/pyodbc/issues/134#issuecomment-281739794 tup = struct.unpack("<6hI2h", dto_value) # e.g., (2017, 3, 16, 10, 35, 18, 500000000, -6, 0) return datetime( - tup[0], tup[1], tup[2], tup[3], tup[4], tup[5], tup[6] // 1000, timezone(timedelta(hours=tup[7], minutes=tup[8])) + tup[0], + tup[1], + tup[2], + tup[3], + tup[4], + tup[5], + tup[6] // 1000, + timezone(timedelta(hours=tup[7], minutes=tup[8])), ) class PyOdbcMsSqlClient(SqlClientBase[pyodbc.Connection], DBTransaction): - dbapi: ClassVar[DBApi] = pyodbc capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -79,13 +94,15 @@ def drop_dataset(self) -> None: # MS Sql doesn't support DROP ... 
CASCADE, drop tables in the schema first # Drop all views rows = self.execute_sql( - "SELECT table_name FROM information_schema.views WHERE table_schema = %s;", self.dataset_name + "SELECT table_name FROM information_schema.views WHERE table_schema = %s;", + self.dataset_name, ) view_names = [row[0] for row in rows] self._drop_views(*view_names) # Drop all tables rows = self.execute_sql( - "SELECT table_name FROM information_schema.tables WHERE table_schema = %s;", self.dataset_name + "SELECT table_name FROM information_schema.tables WHERE table_schema = %s;", + self.dataset_name, ) table_names = [row[0] for row in rows] self.drop_tables(*table_names) @@ -95,10 +112,14 @@ def drop_dataset(self) -> None: def _drop_views(self, *tables: str) -> None: if not tables: return - statements = [f"DROP VIEW IF EXISTS {self.make_qualified_table_name(table)};" for table in tables] + statements = [ + f"DROP VIEW IF EXISTS {self.make_qualified_table_name(table)};" for table in tables + ] self.execute_fragments(statements) - def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as curr: if curr.description is None: return None @@ -126,7 +147,9 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB raise outer def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + return ( + self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + ) @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: diff --git a/dlt/destinations/impl/postgres/__init__.py b/dlt/destinations/impl/postgres/__init__.py index 009174ecc9..43e6af1996 100644 --- a/dlt/destinations/impl/postgres/__init__.py +++ b/dlt/destinations/impl/postgres/__init__.py @@ -5,7 +5,6 @@ from dlt.common.wei import EVM_DECIMAL_PRECISION - def capabilities() -> DestinationCapabilitiesContext: # https://www.postgresql.org/docs/current/limits.html caps = DestinationCapabilitiesContext() @@ -16,7 +15,7 @@ def capabilities() -> DestinationCapabilitiesContext: caps.escape_identifier = escape_postgres_identifier caps.escape_literal = escape_postgres_literal caps.decimal_precision = (DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE) - caps.wei_precision = (2*EVM_DECIMAL_PRECISION, EVM_DECIMAL_PRECISION) + caps.wei_precision = (2 * EVM_DECIMAL_PRECISION, EVM_DECIMAL_PRECISION) caps.max_identifier_length = 63 caps.max_column_identifier_length = 63 caps.max_query_length = 32 * 1024 * 1024 @@ -26,5 +25,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = True return caps - - diff --git a/dlt/destinations/impl/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py index 4204ce1c38..09efdd79db 100644 --- a/dlt/destinations/impl/postgres/configuration.py +++ b/dlt/destinations/impl/postgres/configuration.py @@ -48,12 +48,12 @@ def fingerprint(self) -> str: return "" if TYPE_CHECKING: + def __init__( self, destination_name: str = None, credentials: PostgresCredentials = None, dataset_name: str = None, default_schema_name: str = None, - create_indexes: bool = True - ) -> None: - ... + create_indexes: bool = True, + ) -> None: ... 
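The handle_datetimeoffset converter in the pyodbc client above decodes SQL Server DATETIMEOFFSET values from the raw bytes pyodbc hands to an output converter, using the "<6hI2h" layout (year, month, day, hour, minute, second as shorts, nanoseconds as an unsigned int, then the timezone offset in hours and minutes). A small self-contained round trip; the sample bytes are packed here only for illustration, since in the client they come from the driver:

    import struct
    from datetime import datetime, timedelta, timezone

    # pack a sample DATETIMEOFFSET payload: 2017-03-16 10:35:18.5 -06:00
    raw = struct.pack("<6hI2h", 2017, 3, 16, 10, 35, 18, 500_000_000, -6, 0)

    tup = struct.unpack("<6hI2h", raw)
    value = datetime(
        tup[0], tup[1], tup[2], tup[3], tup[4], tup[5],
        tup[6] // 1000,                                     # nanoseconds -> microseconds
        timezone(timedelta(hours=tup[7], minutes=tup[8])),  # offset, e.g. -06:00
    )
    print(value)  # 2017-03-16 10:35:18.500000-06:00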
diff --git a/dlt/destinations/impl/postgres/factory.py b/dlt/destinations/impl/postgres/factory.py index 33971eb642..a1a3840aac 100644 --- a/dlt/destinations/impl/postgres/factory.py +++ b/dlt/destinations/impl/postgres/factory.py @@ -2,7 +2,10 @@ from dlt.common.destination import Destination, DestinationCapabilitiesContext -from dlt.destinations.impl.postgres.configuration import PostgresCredentials, PostgresClientConfiguration +from dlt.destinations.impl.postgres.configuration import ( + PostgresCredentials, + PostgresClientConfiguration, +) from dlt.destinations.impl.postgres import capabilities if t.TYPE_CHECKING: @@ -10,7 +13,6 @@ class postgres(Destination[PostgresClientConfiguration, "PostgresClient"]): - spec = PostgresClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: diff --git a/dlt/destinations/impl/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py index 03c42f4d75..f8fa3e341a 100644 --- a/dlt/destinations/impl/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -18,9 +18,8 @@ from dlt.destinations.type_mapping import TypeMapper -HINT_TO_POSTGRES_ATTR: Dict[TColumnHint, str] = { - "unique": "UNIQUE" -} +HINT_TO_POSTGRES_ATTR: Dict[TColumnHint, str] = {"unique": "UNIQUE"} + class PostgresTypeMapper(TypeMapper): sct_to_unbound_dbt = { @@ -40,7 +39,7 @@ class PostgresTypeMapper(TypeMapper): "timestamp": "timestamp (%i) with time zone", "decimal": "numeric(%i,%i)", "time": "time (%i) without time zone", - "wei": "numeric(%i,%i)" + "wei": "numeric(%i,%i)", } dbt_to_sct = { @@ -59,7 +58,9 @@ class PostgresTypeMapper(TypeMapper): "integer": "bigint", } - def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_integer_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: if precision is None: return "bigint" # Precision is number of bits @@ -69,7 +70,9 @@ def to_db_integer_type(self, precision: Optional[int], table_format: TTableForma return "integer" return "bigint" - def from_db_type(self, db_type: str, precision: Optional[int] = None, scale: Optional[int] = None) -> TColumnType: + def from_db_type( + self, db_type: str, precision: Optional[int] = None, scale: Optional[int] = None + ) -> TColumnType: if db_type == "numeric": if (precision, scale) == self.capabilities.wei_precision: return dict(data_type="wei") @@ -77,9 +80,13 @@ def from_db_type(self, db_type: str, precision: Optional[int] = None, scale: Opt class PostgresStagingCopyJob(SqlStagingCopyJob): - @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): @@ -88,21 +95,20 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient # drop destination table sql.append(f"DROP TABLE IF EXISTS {table_name};") # moving staging table to destination schema - sql.append(f"ALTER TABLE {staging_table_name} SET SCHEMA {sql_client.fully_qualified_dataset_name()};") + sql.append( + f"ALTER TABLE {staging_table_name} SET SCHEMA" + f" {sql_client.fully_qualified_dataset_name()};" + ) # recreate staging table sql.append(f"CREATE TABLE {staging_table_name} (like {table_name} including all);") return sql class 
PostgresClient(InsertValuesJobClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: PostgresClientConfiguration) -> None: - sql_client = Psycopg2SqlClient( - config.normalize_dataset_name(schema), - config.credentials - ) + sql_client = Psycopg2SqlClient(config.normalize_dataset_name(schema), config.credentials) super().__init__(schema, config, sql_client) self.config: PostgresClientConfiguration = config self.sql_client = sql_client @@ -110,14 +116,24 @@ def __init__(self, schema: Schema, config: PostgresClientConfiguration) -> None: self.type_mapper = PostgresTypeMapper(self.capabilities) def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - hints_str = " ".join(self.active_hints.get(h, "") for h in self.active_hints.keys() if c.get(h, False) is True) + hints_str = " ".join( + self.active_hints.get(h, "") + for h in self.active_hints.keys() + if c.get(h, False) is True + ) column_name = self.capabilities.escape_identifier(c["name"]) - return f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" + return ( + f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" + ) - def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: if self.config.replace_strategy == "staging-optimized": return [PostgresStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) - def _from_db_type(self, pq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, pq_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) diff --git a/dlt/destinations/impl/postgres/sql_client.py b/dlt/destinations/impl/postgres/sql_client.py index b6c4c1a1be..366ed243ef 100644 --- a/dlt/destinations/impl/postgres/sql_client.py +++ b/dlt/destinations/impl/postgres/sql_client.py @@ -12,16 +12,24 @@ from contextlib import contextmanager from typing import Any, AnyStr, ClassVar, Iterator, Optional, Sequence -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction -from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error +from dlt.destinations.sql_client import ( + DBApiCursorImpl, + SqlClientBase, + raise_database_error, + raise_open_connection_error, +) from dlt.destinations.impl.postgres.configuration import PostgresCredentials from dlt.destinations.impl.postgres import capabilities class Psycopg2SqlClient(SqlClientBase["psycopg2.connection"], DBTransaction): - dbapi: ClassVar[DBApi] = psycopg2 capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -32,9 +40,9 @@ def __init__(self, dataset_name: str, credentials: PostgresCredentials) -> None: def open_connection(self) -> "psycopg2.connection": self._conn = psycopg2.connect( - dsn=self.credentials.to_native_representation(), - options=f"-c search_path={self.fully_qualified_dataset_name()},public" - 
) + dsn=self.credentials.to_native_representation(), + options=f"-c search_path={self.fully_qualified_dataset_name()},public", + ) # we'll provide explicit transactions see _reset self._reset_connection() return self._conn @@ -70,7 +78,9 @@ def native_connection(self) -> "psycopg2.connection": return self._conn # @raise_database_error - def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as curr: if curr.description is None: return None @@ -95,13 +105,17 @@ def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> Iterator[DB self.open_connection() raise outer - def execute_fragments(self, fragments: Sequence[AnyStr], *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_fragments( + self, fragments: Sequence[AnyStr], *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: # compose the statements using psycopg2 library - composed = Composed(sql if isinstance(sql, Composable) else SQL(sql) for sql in fragments) + composed = Composed(sql if isinstance(sql, Composable) else SQL(sql) for sql in fragments) return self.execute_sql(composed, *args, **kwargs) def fully_qualified_dataset_name(self, escape: bool = True) -> str: - return self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + return ( + self.capabilities.escape_identifier(self.dataset_name) if escape else self.dataset_name + ) def _reset_connection(self) -> None: # self._conn.autocommit = True @@ -112,13 +126,23 @@ def _reset_connection(self) -> None: def _make_database_exception(cls, ex: Exception) -> Exception: if isinstance(ex, (psycopg2.errors.UndefinedTable, psycopg2.errors.InvalidSchemaName)): raise DatabaseUndefinedRelation(ex) - if isinstance(ex, (psycopg2.OperationalError, psycopg2.InternalError, psycopg2.errors.SyntaxError, psycopg2.errors.UndefinedFunction)): + if isinstance( + ex, + ( + psycopg2.OperationalError, + psycopg2.InternalError, + psycopg2.errors.SyntaxError, + psycopg2.errors.UndefinedFunction, + ), + ): term = cls._maybe_make_terminal_exception_from_data_error(ex) if term: return term else: return DatabaseTransientException(ex) - elif isinstance(ex, (psycopg2.DataError, psycopg2.ProgrammingError, psycopg2.IntegrityError)): + elif isinstance( + ex, (psycopg2.DataError, psycopg2.ProgrammingError, psycopg2.IntegrityError) + ): return DatabaseTerminalException(ex) elif isinstance(ex, TypeError): # psycopg2 raises TypeError on malformed query parameters @@ -129,7 +153,9 @@ def _make_database_exception(cls, ex: Exception) -> Exception: return ex @staticmethod - def _maybe_make_terminal_exception_from_data_error(pg_ex: psycopg2.DataError) -> Optional[Exception]: + def _maybe_make_terminal_exception_from_data_error( + pg_ex: psycopg2.DataError, + ) -> Optional[Exception]: return None @staticmethod diff --git a/dlt/destinations/impl/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py index dc252e3b31..59d1fa2f72 100644 --- a/dlt/destinations/impl/qdrant/configuration.py +++ b/dlt/destinations/impl/qdrant/configuration.py @@ -1,7 +1,10 @@ from typing import Optional, Final from dlt.common.configuration import configspec -from dlt.common.configuration.specs.base_configuration import BaseConfiguration, CredentialsConfiguration +from dlt.common.configuration.specs.base_configuration import ( + BaseConfiguration, + 
CredentialsConfiguration, +) from dlt.common.destination.reference import DestinationClientDwhConfiguration diff --git a/dlt/destinations/impl/qdrant/factory.py b/dlt/destinations/impl/qdrant/factory.py index 316b5ae434..4bd6105afd 100644 --- a/dlt/destinations/impl/qdrant/factory.py +++ b/dlt/destinations/impl/qdrant/factory.py @@ -10,7 +10,6 @@ class qdrant(Destination[QdrantClientConfiguration, "QdrantClient"]): - spec = QdrantClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: diff --git a/dlt/destinations/impl/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py index f37a1f6cd8..243cbd6c5b 100644 --- a/dlt/destinations/impl/qdrant/qdrant_adapter.py +++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py @@ -5,6 +5,7 @@ VECTORIZE_HINT = "x-qdrant-embed" + def qdrant_adapter( data: Any, embed: TColumnNames = None, @@ -47,8 +48,7 @@ def qdrant_adapter( embed = [embed] if not isinstance(embed, list): raise ValueError( - "embed must be a list of column names or a single " - "column name as a string" + "embed must be a list of column names or a single column name as a string" ) for column_name in embed: @@ -58,8 +58,7 @@ def qdrant_adapter( } if not column_hints: - raise ValueError( - "A value for 'embed' must be specified.") + raise ValueError("A value for 'embed' must be specified.") else: resource.apply_hints(columns=column_hints) diff --git a/dlt/destinations/impl/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py index 029530d624..2df3023d86 100644 --- a/dlt/destinations/impl/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -19,6 +19,7 @@ from qdrant_client.qdrant_fastembed import uuid from qdrant_client.http.exceptions import UnexpectedResponse + class LoadQdrantJob(LoadJob): def __init__( self, @@ -32,8 +33,7 @@ def __init__( super().__init__(file_name) self.db_client = db_client self.collection_name = collection_name - self.embedding_fields = get_columns_names_with_prop( - table_schema, VECTORIZE_HINT) + self.embedding_fields = get_columns_names_with_prop(table_schema, VECTORIZE_HINT) self.unique_identifiers = self._list_unique_identifiers(table_schema) self.config = client_config @@ -42,17 +42,24 @@ def __init__( for line in f: data = json.loads(line) - point_id = self._generate_uuid( - data, self.unique_identifiers, self.collection_name) if self.unique_identifiers else uuid.uuid4() + point_id = ( + self._generate_uuid(data, self.unique_identifiers, self.collection_name) + if self.unique_identifiers + else uuid.uuid4() + ) embedding_doc = self._get_embedding_doc(data) payloads.append(data) ids.append(point_id) docs.append(embedding_doc) - embedding_model = db_client._get_or_init_model( - db_client.embedding_model_name) - embeddings = list(embedding_model.embed( - docs, batch_size=self.config.embedding_batch_size, parallel=self.config.embedding_parallelism)) + embedding_model = db_client._get_or_init_model(db_client.embedding_model_name) + embeddings = list( + embedding_model.embed( + docs, + batch_size=self.config.embedding_batch_size, + parallel=self.config.embedding_parallelism, + ) + ) vector_name = db_client.get_vector_field_name() embeddings = [{vector_name: embedding.tolist()} for embedding in embeddings] assert len(embeddings) == len(payloads) == len(ids) @@ -81,13 +88,14 @@ def _list_unique_identifiers(self, table_schema: TTableSchema) -> Sequence[str]: Sequence[str]: A list of unique column identifiers. 
""" if table_schema.get("write_disposition") == "merge": - primary_keys = get_columns_names_with_prop( - table_schema, "primary_key") + primary_keys = get_columns_names_with_prop(table_schema, "primary_key") if primary_keys: return primary_keys return get_columns_names_with_prop(table_schema, "unique") - def _upload_data(self, ids: Iterable[Any], vectors: Iterable[Any], payloads: Iterable[Any]) -> None: + def _upload_data( + self, ids: Iterable[Any], vectors: Iterable[Any], payloads: Iterable[Any] + ) -> None: """Uploads data to a Qdrant instance in a batch. Supports retries and parallelism. Args: @@ -96,7 +104,14 @@ def _upload_data(self, ids: Iterable[Any], vectors: Iterable[Any], payloads: Ite payloads (Iterable[Any]): Payloads to be uploaded to the collection """ self.db_client.upload_collection( - self.collection_name, ids=ids, payload=payloads, vectors=vectors, parallel=self.config.upload_parallelism, batch_size=self.config.upload_batch_size, max_retries=self.config.upload_max_retries) + self.collection_name, + ids=ids, + payload=payloads, + vectors=vectors, + parallel=self.config.upload_parallelism, + batch_size=self.config.upload_batch_size, + max_retries=self.config.upload_max_retries, + ) def _generate_uuid( self, data: Dict[str, Any], unique_identifiers: Sequence[str], collection_name: str @@ -120,12 +135,19 @@ def state(self) -> TLoadJobState: def exception(self) -> str: raise NotImplementedError() + class QdrantClient(JobClientBase, WithStateSync): - """Qdrant Destination Handler - """ + """Qdrant Destination Handler""" + capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() state_properties: ClassVar[List[str]] = [ - "version", "engine_version", "pipeline_name", "state", "created_at", "_dlt_load_id"] + "version", + "engine_version", + "pipeline_name", + "state", + "created_at", + "_dlt_load_id", + ] def __init__(self, schema: Schema, config: QdrantClientConfiguration) -> None: super().__init__(schema, config) @@ -167,11 +189,13 @@ def _make_qualified_collection_name(self, table_name: str) -> str: str: The dataset name and table name concatenated with a separator if dataset name is present. """ dataset_separator = self.config.dataset_separator - return f"{self.dataset_name}{dataset_separator}{table_name}" if self.dataset_name else table_name + return ( + f"{self.dataset_name}{dataset_separator}{table_name}" + if self.dataset_name + else table_name + ) - def _create_collection( - self, full_collection_name: str - ) -> None: + def _create_collection(self, full_collection_name: str) -> None: """Creates a collection in Qdrant. Args: @@ -188,7 +212,8 @@ def _create_collection( vectors_config = self.db_client.get_fastembed_vector_params() self.db_client.create_collection( - collection_name=full_collection_name, vectors_config=vectors_config) + collection_name=full_collection_name, vectors_config=vectors_config + ) def _create_point(self, obj: Dict[str, Any], collection_name: str) -> None: """Inserts a point into a Qdrant collection without a vector. @@ -197,12 +222,16 @@ def _create_point(self, obj: Dict[str, Any], collection_name: str) -> None: obj (Dict[str, Any]): The arbitrary data to be inserted as payload. collection_name (str): The name of the collection to insert the point into. 
""" - self.db_client.upsert(collection_name, points=[ - models.PointStruct( - id=str(uuid.uuid4()), - payload=obj, - vector={}, - )]) + self.db_client.upsert( + collection_name, + points=[ + models.PointStruct( + id=str(uuid.uuid4()), + payload=obj, + vector={}, + ) + ], + ) def drop_storage(self) -> None: """Drop the dataset from the Qdrant instance. @@ -213,8 +242,7 @@ def drop_storage(self) -> None: If dataset name was not provided, it deletes all the tables in the current schema """ collections = self.db_client.get_collections().collections - collection_name_list = [collection.name - for collection in collections] + collection_name_list = [collection.name for collection in collections] if self.dataset_name: prefix = f"{self.dataset_name}{self.config.dataset_separator}" @@ -234,8 +262,7 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: self._create_sentinel_collection() elif truncate_tables: for table_name in truncate_tables: - qualified_table_name = self._make_qualified_collection_name( - table_name=table_name) + qualified_table_name = self._make_qualified_collection_name(table_name=table_name) if self._collection_exists(qualified_table_name): continue @@ -257,48 +284,64 @@ def update_stored_schema( self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None ) -> Optional[TSchemaTables]: applied_update: TSchemaTables = {} - schema_info = self.get_stored_schema_by_hash( - self.schema.stored_version_hash) + schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) if schema_info is None: logger.info( f"Schema with hash {self.schema.stored_version_hash} " - f"not found in the storage. upgrading" + "not found in the storage. upgrading" ) self._execute_schema_update(only_tables) else: logger.info( f"Schema with hash {self.schema.stored_version_hash} " f"inserted at {schema_info.inserted_at} found " - f"in storage, no upgrade required" + "in storage, no upgrade required" ) return applied_update def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: """Loads compressed state from destination storage - By finding a load id that was completed + By finding a load id that was completed """ limit = 10 offset = None while True: try: scroll_table_name = self._make_qualified_collection_name( - self.schema.state_table_name) - state_records, offset = self.db_client.scroll(scroll_table_name, with_payload=self.state_properties, scroll_filter=models.Filter(must=[ - models.FieldCondition( - key="pipeline_name", match=models.MatchValue(value=pipeline_name)) - ]), limit=limit, offset=offset) + self.schema.state_table_name + ) + state_records, offset = self.db_client.scroll( + scroll_table_name, + with_payload=self.state_properties, + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="pipeline_name", match=models.MatchValue(value=pipeline_name) + ) + ] + ), + limit=limit, + offset=offset, + ) if len(state_records) == 0: return None for state_record in state_records: state = state_record.payload load_id = state["_dlt_load_id"] scroll_table_name = self._make_qualified_collection_name( - self.schema.loads_table_name) - load_records = self.db_client.count(scroll_table_name, exact=True, count_filter=models.Filter( - must=[models.FieldCondition( - key="load_id", match=models.MatchValue(value=load_id) - )] - )) + self.schema.loads_table_name + ) + load_records = self.db_client.count( + scroll_table_name, + exact=True, + count_filter=models.Filter( + must=[ + models.FieldCondition( + key="load_id", 
match=models.MatchValue(value=load_id) + ) + ] + ), + ) if load_records.count > 0: state["dlt_load_id"] = state.pop("_dlt_load_id") return StateInfo(**state) @@ -308,14 +351,20 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: def get_stored_schema(self) -> Optional[StorageSchemaInfo]: """Retrieves newest schema from destination storage""" try: - scroll_table_name = self._make_qualified_collection_name( - self.schema.version_table_name) - response = self.db_client.scroll(scroll_table_name, with_payload=True, scroll_filter=models.Filter( - must=[models.FieldCondition( - key="schema_name", - match=models.MatchValue(value=self.schema.name), - )] - ), limit=1) + scroll_table_name = self._make_qualified_collection_name(self.schema.version_table_name) + response = self.db_client.scroll( + scroll_table_name, + with_payload=True, + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="schema_name", + match=models.MatchValue(value=self.schema.name), + ) + ] + ), + limit=1, + ) record = response[0][0].payload return StorageSchemaInfo(**record) except Exception: @@ -323,23 +372,25 @@ def get_stored_schema(self) -> Optional[StorageSchemaInfo]: def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: try: - scroll_table_name = self._make_qualified_collection_name( - self.schema.version_table_name) - response = self.db_client.scroll(scroll_table_name, with_payload=True, scroll_filter=models.Filter( - must=[ - models.FieldCondition( - key="version_hash", match=models.MatchValue(value=schema_hash)) - ] - - ), limit=1) + scroll_table_name = self._make_qualified_collection_name(self.schema.version_table_name) + response = self.db_client.scroll( + scroll_table_name, + with_payload=True, + scroll_filter=models.Filter( + must=[ + models.FieldCondition( + key="version_hash", match=models.MatchValue(value=schema_hash) + ) + ] + ), + limit=1, + ) record = response[0][0].payload return StorageSchemaInfo(**record) except Exception: return None - def start_file_load( - self, table: TTableSchema, file_path: str, load_id: str - ) -> LoadJob: + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: return LoadQdrantJob( table, file_path, @@ -358,8 +409,7 @@ def complete_load(self, load_id: str) -> None: "status": 0, "inserted_at": str(pendulum.now()), } - loads_table_name = self._make_qualified_collection_name( - self.schema.loads_table_name) + loads_table_name = self._make_qualified_collection_name(self.schema.loads_table_name) self._create_point(properties, loads_table_name) def __enter__(self) -> "QdrantClient": @@ -383,8 +433,7 @@ def _update_schema_in_storage(self, schema: Schema) -> None: "inserted_at": str(pendulum.now()), "schema": schema_str, } - version_table_name = self._make_qualified_collection_name( - self.schema.version_table_name) + version_table_name = self._make_qualified_collection_name(self.schema.version_table_name) self._create_point(properties, version_table_name) def _execute_schema_update(self, only_tables: Iterable[str]) -> None: @@ -399,8 +448,11 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: def _collection_exists(self, table_name: str, qualify_table_name: bool = True) -> bool: try: - table_name = self._make_qualified_collection_name( - table_name) if qualify_table_name else table_name + table_name = ( + self._make_qualified_collection_name(table_name) + if qualify_table_name + else table_name + ) self.db_client.get_collection(table_name) return True except 
UnexpectedResponse as e: diff --git a/dlt/destinations/impl/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py index 7018445773..94b9219a2f 100644 --- a/dlt/destinations/impl/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -4,7 +4,10 @@ from dlt.common.configuration import configspec from dlt.common.utils import digest128 -from dlt.destinations.impl.postgres.configuration import PostgresCredentials, PostgresClientConfiguration +from dlt.destinations.impl.postgres.configuration import ( + PostgresCredentials, + PostgresClientConfiguration, +) @configspec diff --git a/dlt/destinations/impl/redshift/factory.py b/dlt/destinations/impl/redshift/factory.py index 7648b35851..e5af9f7122 100644 --- a/dlt/destinations/impl/redshift/factory.py +++ b/dlt/destinations/impl/redshift/factory.py @@ -2,7 +2,10 @@ from dlt.common.destination import Destination, DestinationCapabilitiesContext -from dlt.destinations.impl.redshift.configuration import RedshiftCredentials, RedshiftClientConfiguration +from dlt.destinations.impl.redshift.configuration import ( + RedshiftCredentials, + RedshiftClientConfiguration, +) from dlt.destinations.impl.redshift import capabilities if t.TYPE_CHECKING: @@ -10,7 +13,6 @@ class redshift(Destination[RedshiftClientConfiguration, "RedshiftClient"]): - spec = RedshiftClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: @@ -41,5 +43,8 @@ def __init__( **kwargs: Additional arguments passed to the destination config """ super().__init__( - credentials=credentials, create_indexes=create_indexes, staging_iam_role=staging_iam_role, **kwargs + credentials=credentials, + create_indexes=create_indexes, + staging_iam_role=staging_iam_role, + **kwargs, ) diff --git a/dlt/destinations/impl/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py index 2124807bc1..eaa1968133 100644 --- a/dlt/destinations/impl/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -4,17 +4,24 @@ from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.common.schema.utils import table_schema_has_type, table_schema_has_type_with_precision + if platform.python_implementation() == "PyPy": import psycopg2cffi as psycopg2 + # from psycopg2cffi.sql import SQL, Composed else: import psycopg2 + # from psycopg2.sql import SQL, Composed from typing import ClassVar, Dict, List, Optional, Sequence, Any from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import NewLoadJob, CredentialsConfiguration, SupportsStagingDestination +from dlt.common.destination.reference import ( + NewLoadJob, + CredentialsConfiguration, + SupportsStagingDestination, +) from dlt.common.data_types import TDataType from dlt.common.schema import TColumnSchema, TColumnHint, Schema from dlt.common.schema.typing import TTableSchema, TColumnType, TTableFormat @@ -36,7 +43,7 @@ "cluster": "DISTKEY", # it is better to not enforce constraints in redshift # "primary_key": "PRIMARY KEY", - "sort": "SORTKEY" + "sort": "SORTKEY", } @@ -50,7 +57,7 @@ class RedshiftTypeMapper(TypeMapper): "timestamp": "timestamp with time zone", "bigint": "bigint", "binary": "varbinary", - "time": "time without time zone" + "time": "time without time zone", } sct_to_dbt = { @@ -76,7 +83,9 @@ class RedshiftTypeMapper(TypeMapper): "integer": "bigint", } - def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_integer_type( + self, precision: 
Optional[int], table_format: TTableFormat = None + ) -> str: if precision is None: return "bigint" if precision <= 16: @@ -85,7 +94,9 @@ def to_db_integer_type(self, precision: Optional[int], table_format: TTableForma return "integer" return "bigint" - def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def from_db_type( + self, db_type: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: if db_type == "numeric": if (precision, scale) == self.capabilities.wei_precision: return dict(data_type="wei") @@ -93,11 +104,12 @@ def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[i class RedshiftSqlClient(Psycopg2SqlClient): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @staticmethod - def _maybe_make_terminal_exception_from_data_error(pg_ex: psycopg2.DataError) -> Optional[Exception]: + def _maybe_make_terminal_exception_from_data_error( + pg_ex: psycopg2.DataError, + ) -> Optional[Exception]: if "Cannot insert a NULL value into column" in pg_ex.pgerror: # NULL violations is internal error, probably a redshift thing return DatabaseTerminalException(pg_ex) @@ -107,26 +119,33 @@ def _maybe_make_terminal_exception_from_data_error(pg_ex: psycopg2.DataError) -> return DatabaseTerminalException(pg_ex) return None -class RedshiftCopyFileLoadJob(CopyRemoteFileLoadJob): - def __init__(self, table: TTableSchema, - file_path: str, - sql_client: SqlClientBase[Any], - staging_credentials: Optional[CredentialsConfiguration] = None, - staging_iam_role: str = None) -> None: +class RedshiftCopyFileLoadJob(CopyRemoteFileLoadJob): + def __init__( + self, + table: TTableSchema, + file_path: str, + sql_client: SqlClientBase[Any], + staging_credentials: Optional[CredentialsConfiguration] = None, + staging_iam_role: str = None, + ) -> None: self._staging_iam_role = staging_iam_role super().__init__(table, file_path, sql_client, staging_credentials) def execute(self, table: TTableSchema, bucket_path: str) -> None: - # we assume s3 credentials where provided for the staging credentials = "" if self._staging_iam_role: credentials = f"IAM_ROLE '{self._staging_iam_role}'" - elif self._staging_credentials and isinstance(self._staging_credentials, AwsCredentialsWithoutDefaults): + elif self._staging_credentials and isinstance( + self._staging_credentials, AwsCredentialsWithoutDefaults + ): aws_access_key = self._staging_credentials.aws_access_key_id aws_secret_key = self._staging_credentials.aws_secret_access_key - credentials = f"CREDENTIALS 'aws_access_key_id={aws_access_key};aws_secret_access_key={aws_secret_key}'" + credentials = ( + "CREDENTIALS" + f" 'aws_access_key_id={aws_access_key};aws_secret_access_key={aws_secret_key}'" + ) table_name = table["name"] # get format @@ -137,11 +156,17 @@ def execute(self, table: TTableSchema, bucket_path: str) -> None: if table_schema_has_type(table, "time"): raise LoadJobTerminalException( self.file_name(), - f"Redshift cannot load TIME columns from {ext} files. Switch to direct INSERT file format or convert `datetime.time` objects in your data to `str` or `datetime.datetime`" + f"Redshift cannot load TIME columns from {ext} files. Switch to direct INSERT file" + " format or convert `datetime.time` objects in your data to `str` or" + " `datetime.datetime`", ) if ext == "jsonl": if table_schema_has_type(table, "binary"): - raise LoadJobTerminalException(self.file_name(), "Redshift cannot load VARBYTE columns from json files. 
Switch to parquet to load binaries.") + raise LoadJobTerminalException( + self.file_name(), + "Redshift cannot load VARBYTE columns from json files. Switch to parquet to" + " load binaries.", + ) file_type = "FORMAT AS JSON 'auto'" dateformat = "dateformat 'auto' timeformat 'auto'" compression = "GZIP" @@ -149,7 +174,8 @@ def execute(self, table: TTableSchema, bucket_path: str) -> None: if table_schema_has_type_with_precision(table, "binary"): raise LoadJobTerminalException( self.file_name(), - f"Redshift cannot load fixed width VARBYTE columns from {ext} files. Switch to direct INSERT file format or use binary columns without precision." + f"Redshift cannot load fixed width VARBYTE columns from {ext} files. Switch to" + " direct INSERT file format or use binary columns without precision.", ) file_type = "PARQUET" # if table contains complex types then SUPER field will be used. @@ -174,28 +200,36 @@ def exception(self) -> str: # this part of code should be never reached raise NotImplementedError() -class RedshiftMergeJob(SqlMergeJob): +class RedshiftMergeJob(SqlMergeJob): @classmethod - def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: str, key_clauses: Sequence[str], for_delete: bool) -> List[str]: + def gen_key_table_clauses( + cls, + root_table_name: str, + staging_root_table_name: str, + key_clauses: Sequence[str], + for_delete: bool, + ) -> List[str]: """Generate sql clauses that may be used to select or delete rows in root table of destination dataset - A list of clauses may be returned for engines that do not support OR in subqueries. Like BigQuery + A list of clauses may be returned for engines that do not support OR in subqueries. Like BigQuery """ if for_delete: - return [f"FROM {root_table_name} WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} WHERE {' OR '.join([c.format(d=root_table_name,s=staging_root_table_name) for c in key_clauses])})"] - return SqlMergeJob.gen_key_table_clauses(root_table_name, staging_root_table_name, key_clauses, for_delete) + return [ + f"FROM {root_table_name} WHERE EXISTS (SELECT 1 FROM" + f" {staging_root_table_name} WHERE" + f" {' OR '.join([c.format(d=root_table_name,s=staging_root_table_name) for c in key_clauses])})" + ] + return SqlMergeJob.gen_key_table_clauses( + root_table_name, staging_root_table_name, key_clauses, for_delete + ) class RedshiftClient(InsertValuesJobClient, SupportsStagingDestination): - capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: RedshiftClientConfiguration) -> None: - sql_client = RedshiftSqlClient ( - config.normalize_dataset_name(schema), - config.credentials - ) + sql_client = RedshiftSqlClient(config.normalize_dataset_name(schema), config.credentials) super().__init__(schema, config, sql_client) self.sql_client = sql_client self.config: RedshiftClientConfiguration = config @@ -205,17 +239,33 @@ def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> Li return [RedshiftMergeJob.from_table_chain(table_chain, self.sql_client)] def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: - hints_str = " ".join(HINT_TO_REDSHIFT_ATTR.get(h, "") for h in HINT_TO_REDSHIFT_ATTR.keys() if c.get(h, False) is True) + hints_str = " ".join( + HINT_TO_REDSHIFT_ATTR.get(h, "") + for h in HINT_TO_REDSHIFT_ATTR.keys() + if c.get(h, False) is True + ) column_name = self.capabilities.escape_identifier(c["name"]) - return f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} 
{self._gen_not_null(c.get('nullable', True))}" + return ( + f"{column_name} {self.type_mapper.to_db_type(c)} {hints_str} {self._gen_not_null(c.get('nullable', True))}" + ) def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: """Starts SqlLoadJob for files ending with .sql or returns None to let derived classes to handle their specific jobs""" job = super().start_file_load(table, file_path, load_id) if not job: - assert NewReferenceJob.is_reference_job(file_path), "Redshift must use staging to load files" - job = RedshiftCopyFileLoadJob(table, file_path, self.sql_client, staging_credentials=self.config.staging_config.credentials, staging_iam_role=self.config.staging_iam_role) + assert NewReferenceJob.is_reference_job( + file_path + ), "Redshift must use staging to load files" + job = RedshiftCopyFileLoadJob( + table, + file_path, + self.sql_client, + staging_credentials=self.config.staging_config.credentials, + staging_iam_role=self.config.staging_iam_role, + ) return job - def _from_db_type(self, pq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, pq_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(pq_t, precision, scale) diff --git a/dlt/destinations/impl/snowflake/__init__.py b/dlt/destinations/impl/snowflake/__init__.py index 12e118eeab..d6bebd3fdd 100644 --- a/dlt/destinations/impl/snowflake/__init__.py +++ b/dlt/destinations/impl/snowflake/__init__.py @@ -4,7 +4,6 @@ from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() caps.preferred_loader_file_format = "jsonl" diff --git a/dlt/destinations/impl/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py index 4d9aaa7b54..6ad57fe929 100644 --- a/dlt/destinations/impl/snowflake/configuration.py +++ b/dlt/destinations/impl/snowflake/configuration.py @@ -16,8 +16,7 @@ def _read_private_key(private_key: str, password: Optional[str] = None) -> bytes: - """Load an encrypted or unencrypted private key from string. 
- """ + """Load an encrypted or unencrypted private key from string.""" try: from cryptography.hazmat.backends import default_backend from cryptography.hazmat.primitives.asymmetric import rsa @@ -25,7 +24,10 @@ def _read_private_key(private_key: str, password: Optional[str] = None) -> bytes from cryptography.hazmat.primitives import serialization from cryptography.hazmat.primitives.asymmetric.types import PrivateKeyTypes except ModuleNotFoundError as e: - raise MissingDependencyException("SnowflakeCredentials with private key", dependencies=[f"{version.DLT_PKG_NAME}[snowflake]"]) from e + raise MissingDependencyException( + "SnowflakeCredentials with private key", + dependencies=[f"{version.DLT_PKG_NAME}[snowflake]"], + ) from e try: # load key from base64-encoded DER key @@ -45,7 +47,7 @@ def _read_private_key(private_key: str, password: Optional[str] = None) -> bytes return pkey.private_bytes( encoding=serialization.Encoding.DER, format=serialization.PrivateFormat.PKCS8, - encryption_algorithm=serialization.NoEncryption() + encryption_algorithm=serialization.NoEncryption(), ) @@ -65,24 +67,35 @@ class SnowflakeCredentials(ConnectionStringCredentials): def parse_native_representation(self, native_value: Any) -> None: super().parse_native_representation(native_value) - self.warehouse = self.query.get('warehouse') - self.role = self.query.get('role') - self.private_key = self.query.get('private_key') # type: ignore - self.private_key_passphrase = self.query.get('private_key_passphrase') # type: ignore + self.warehouse = self.query.get("warehouse") + self.role = self.query.get("role") + self.private_key = self.query.get("private_key") # type: ignore + self.private_key_passphrase = self.query.get("private_key_passphrase") # type: ignore if not self.is_partial() and (self.password or self.private_key): self.resolve() def on_resolved(self) -> None: if not self.password and not self.private_key: - raise ConfigurationValueError("Please specify password or private_key. SnowflakeCredentials supports password and private key authentication and one of those must be specified.") + raise ConfigurationValueError( + "Please specify password or private_key. SnowflakeCredentials supports password and" + " private key authentication and one of those must be specified." 
+ ) def to_url(self) -> URL: query = dict(self.query or {}) - if self.warehouse and 'warehouse' not in query: - query['warehouse'] = self.warehouse - if self.role and 'role' not in query: - query['role'] = self.role - return URL.create(self.drivername, self.username, self.password, self.host, self.port, self.database, query) + if self.warehouse and "warehouse" not in query: + query["warehouse"] = self.warehouse + if self.role and "role" not in query: + query["role"] = self.role + return URL.create( + self.drivername, + self.username, + self.password, + self.host, + self.port, + self.database, + query, + ) def to_connector_params(self) -> Dict[str, Any]: private_key: Optional[bytes] = None diff --git a/dlt/destinations/impl/snowflake/factory.py b/dlt/destinations/impl/snowflake/factory.py index 1201f406b0..c36e0756a6 100644 --- a/dlt/destinations/impl/snowflake/factory.py +++ b/dlt/destinations/impl/snowflake/factory.py @@ -1,6 +1,9 @@ import typing as t -from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials, SnowflakeClientConfiguration +from dlt.destinations.impl.snowflake.configuration import ( + SnowflakeCredentials, + SnowflakeClientConfiguration, +) from dlt.destinations.impl.snowflake import capabilities from dlt.common.destination import Destination, DestinationCapabilitiesContext @@ -9,7 +12,6 @@ class snowflake(Destination[SnowflakeClientConfiguration, "SnowflakeClient"]): - spec = SnowflakeClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: @@ -38,4 +40,9 @@ def __init__( stage_name: Name of an existing stage to use for loading data. Default uses implicit stage per table keep_staged_files: Whether to delete or keep staged files after loading """ - super().__init__(credentials=credentials, stage_name=stage_name, keep_staged_files=keep_staged_files, **kwargs) + super().__init__( + credentials=credentials, + stage_name=stage_name, + keep_staged_files=keep_staged_files, + **kwargs, + ) diff --git a/dlt/destinations/impl/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py index ead3e810d2..67df78c138 100644 --- a/dlt/destinations/impl/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -2,8 +2,18 @@ from urllib.parse import urlparse, urlunparse from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import FollowupJob, NewLoadJob, TLoadJobState, LoadJob, CredentialsConfiguration, SupportsStagingDestination -from dlt.common.configuration.specs import AwsCredentialsWithoutDefaults, AzureCredentialsWithoutDefaults +from dlt.common.destination.reference import ( + FollowupJob, + NewLoadJob, + TLoadJobState, + LoadJob, + CredentialsConfiguration, + SupportsStagingDestination, +) +from dlt.common.configuration.specs import ( + AwsCredentialsWithoutDefaults, + AzureCredentialsWithoutDefaults, +) from dlt.common.data_types import TDataType from dlt.common.storages.file_storage import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns @@ -54,23 +64,31 @@ class SnowflakeTypeMapper(TypeMapper): "TIMESTAMP_TZ": "timestamp", "BINARY": "binary", "VARIANT": "complex", - "TIME": "time" + "TIME": "time", } - def from_db_type(self, db_type: str, precision: Optional[int] = None, scale: Optional[int] = None) -> TColumnType: + def from_db_type( + self, db_type: str, precision: Optional[int] = None, scale: Optional[int] = None + ) -> TColumnType: if db_type == "NUMBER": if precision == self.BIGINT_PRECISION and scale == 0: - return 
dict(data_type='bigint') + return dict(data_type="bigint") elif (precision, scale) == self.capabilities.wei_precision: - return dict(data_type='wei') - return dict(data_type='decimal', precision=precision, scale=scale) + return dict(data_type="wei") + return dict(data_type="decimal", precision=precision, scale=scale) return super().from_db_type(db_type, precision, scale) class SnowflakeLoadJob(LoadJob, FollowupJob): def __init__( - self, file_path: str, table_name: str, load_id: str, client: SnowflakeSqlClient, - stage_name: Optional[str] = None, keep_staged_files: bool = True, staging_credentials: Optional[CredentialsConfiguration] = None + self, + file_path: str, + table_name: str, + load_id: str, + client: SnowflakeSqlClient, + stage_name: Optional[str] = None, + keep_staged_files: bool = True, + staging_credentials: Optional[CredentialsConfiguration] = None, ) -> None: file_name = FileStorage.get_file_name_from_file_path(file_path) super().__init__(file_name) @@ -78,8 +96,14 @@ def __init__( qualified_table_name = client.make_qualified_table_name(table_name) # extract and prepare some vars - bucket_path = NewReferenceJob.resolve_reference(file_path) if NewReferenceJob.is_reference_job(file_path) else "" - file_name = FileStorage.get_file_name_from_file_path(bucket_path) if bucket_path else file_name + bucket_path = ( + NewReferenceJob.resolve_reference(file_path) + if NewReferenceJob.is_reference_job(file_path) + else "" + ) + file_name = ( + FileStorage.get_file_name_from_file_path(bucket_path) if bucket_path else file_name + ) from_clause = "" credentials_clause = "" files_clause = "" @@ -93,10 +117,18 @@ def __init__( from_clause = f"FROM '@{stage_name}'" files_clause = f"FILES = ('{bucket_url.path.lstrip('/')}')" # referencing an staged files via a bucket URL requires explicit AWS credentials - elif bucket_scheme == "s3" and staging_credentials and isinstance(staging_credentials, AwsCredentialsWithoutDefaults): + elif ( + bucket_scheme == "s3" + and staging_credentials + and isinstance(staging_credentials, AwsCredentialsWithoutDefaults) + ): credentials_clause = f"""CREDENTIALS=(AWS_KEY_ID='{staging_credentials.aws_access_key_id}' AWS_SECRET_KEY='{staging_credentials.aws_secret_access_key}')""" from_clause = f"FROM '{bucket_path}'" - elif bucket_scheme in ["az", "abfs"] and staging_credentials and isinstance(staging_credentials, AzureCredentialsWithoutDefaults): + elif ( + bucket_scheme in ["az", "abfs"] + and staging_credentials + and isinstance(staging_credentials, AzureCredentialsWithoutDefaults) + ): # Explicit azure credentials are needed to load from bucket without a named stage credentials_clause = f"CREDENTIALS=(AZURE_SAS_TOKEN='?{staging_credentials.azure_storage_sas_token}')" # Converts an az:/// to azure://.blob.core.windows.net// @@ -106,7 +138,7 @@ def __init__( bucket_url._replace( scheme="azure", netloc=f"{staging_credentials.azure_storage_account_name}.blob.core.windows.net", - path=_path + path=_path, ) ) from_clause = f"FROM '{bucket_path}'" @@ -115,14 +147,19 @@ def __init__( bucket_path = bucket_path.replace("gs://", "gcs://") if not stage_name: # when loading from bucket stage must be given - raise LoadJobTerminalException(file_path, f"Cannot load from bucket path {bucket_path} without a stage name. See https://dlthub.com/docs/dlt-ecosystem/destinations/snowflake for instructions on setting up the `stage_name`") + raise LoadJobTerminalException( + file_path, + f"Cannot load from bucket path {bucket_path} without a stage name. 
See" + " https://dlthub.com/docs/dlt-ecosystem/destinations/snowflake for" + " instructions on setting up the `stage_name`", + ) from_clause = f"FROM @{stage_name}/" files_clause = f"FILES = ('{urlparse(bucket_path).path.lstrip('/')}')" else: # this means we have a local file if not stage_name: # Use implicit table stage by default: "SCHEMA_NAME"."%TABLE_NAME" - stage_name = client.make_qualified_table_name('%'+table_name) + stage_name = client.make_qualified_table_name("%" + table_name) stage_file_path = f'@{stage_name}/"{load_id}"/{file_name}' from_clause = f"FROM {stage_file_path}" @@ -134,19 +171,19 @@ def __init__( with client.begin_transaction(): # PUT and COPY in one tx if local file, otherwise only copy if not bucket_path: - client.execute_sql(f'PUT file://{file_path} @{stage_name}/"{load_id}" OVERWRITE = TRUE, AUTO_COMPRESS = FALSE') - client.execute_sql( - f"""COPY INTO {qualified_table_name} + client.execute_sql( + f'PUT file://{file_path} @{stage_name}/"{load_id}" OVERWRITE = TRUE,' + " AUTO_COMPRESS = FALSE" + ) + client.execute_sql(f"""COPY INTO {qualified_table_name} {from_clause} {files_clause} {credentials_clause} FILE_FORMAT = {source_format} MATCH_BY_COLUMN_NAME='CASE_INSENSITIVE' - """ - ) + """) if stage_file_path and not keep_staged_files: - client.execute_sql(f'REMOVE {stage_file_path}') - + client.execute_sql(f"REMOVE {stage_file_path}") def state(self) -> TLoadJobState: return "completed" @@ -154,10 +191,15 @@ def state(self) -> TLoadJobState: def exception(self) -> str: raise NotImplementedError() -class SnowflakeStagingCopyJob(SqlStagingCopyJob): +class SnowflakeStagingCopyJob(SqlStagingCopyJob): @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): @@ -173,10 +215,7 @@ class SnowflakeClient(SqlJobClientWithStaging, SupportsStagingDestination): capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() def __init__(self, schema: Schema, config: SnowflakeClientConfiguration) -> None: - sql_client = SnowflakeSqlClient( - config.normalize_dataset_name(schema), - config.credentials - ) + sql_client = SnowflakeSqlClient(config.normalize_dataset_name(schema), config.credentials) super().__init__(schema, config, sql_client) self.config: SnowflakeClientConfiguration = config self.sql_client: SnowflakeSqlClient = sql_client # type: ignore @@ -188,43 +227,64 @@ def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> if not job: job = SnowflakeLoadJob( file_path, - table['name'], + table["name"], load_id, self.sql_client, stage_name=self.config.stage_name, keep_staged_files=self.config.keep_staged_files, - staging_credentials=self.config.staging_config.credentials if self.config.staging_config else None + staging_credentials=( + self.config.staging_config.credentials if self.config.staging_config else None + ), ) return job def restore_file_load(self, file_path: str) -> LoadJob: return EmptyLoadJob.from_file_path(file_path, "completed") - def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None) -> List[str]: + def _make_add_column_sql( + self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None + ) -> List[str]: # 
Override because snowflake requires multiple columns in a single ADD COLUMN clause - return ["ADD COLUMN\n" + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns)] - - def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + return [ + "ADD COLUMN\n" + + ",\n".join(self._get_column_def_sql(c, table_format) for c in new_columns) + ] + + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: if self.config.replace_strategy == "staging-optimized": return [SnowflakeStagingCopyJob.from_table_chain(table_chain, self.sql_client)] return super()._create_replace_followup_jobs(table_chain) - def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool, separate_alters: bool = False) -> List[str]: + def _get_table_update_sql( + self, + table_name: str, + new_columns: Sequence[TColumnSchema], + generate_alter: bool, + separate_alters: bool = False, + ) -> List[str]: sql = super()._get_table_update_sql(table_name, new_columns, generate_alter) - cluster_list = [self.capabilities.escape_identifier(c['name']) for c in new_columns if c.get('cluster')] + cluster_list = [ + self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get("cluster") + ] if cluster_list: sql[0] = sql[0] + "\nCLUSTER BY (" + ",".join(cluster_list) + ")" return sql - def _from_db_type(self, bq_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, bq_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(bq_t, precision, scale) def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = None) -> str: name = self.capabilities.escape_identifier(c["name"]) - return f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" + return ( + f"{name} {self.type_mapper.to_db_type(c)} {self._gen_not_null(c.get('nullable', True))}" + ) def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: table_name = table_name.upper() # All snowflake tables are uppercased in information schema diff --git a/dlt/destinations/impl/snowflake/sql_client.py b/dlt/destinations/impl/snowflake/sql_client.py index 139a5ebb7a..ba932277df 100644 --- a/dlt/destinations/impl/snowflake/sql_client.py +++ b/dlt/destinations/impl/snowflake/sql_client.py @@ -4,12 +4,22 @@ import snowflake.connector as snowflake_lib from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation -from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) +from dlt.destinations.sql_client import ( + DBApiCursorImpl, + SqlClientBase, + raise_database_error, + raise_open_connection_error, +) from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials from dlt.destinations.impl.snowflake import capabilities + class SnowflakeCursorImpl(DBApiCursorImpl): native_cursor: snowflake_lib.cursor.SnowflakeCursor # type: ignore[assignment] @@ -20,7 +30,6 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: class 
SnowflakeSqlClient(SqlClientBase[snowflake_lib.SnowflakeConnection], DBTransaction): - dbapi: ClassVar[DBApi] = snowflake_lib capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() @@ -36,8 +45,7 @@ def open_connection(self) -> snowflake_lib.SnowflakeConnection: if "timezone" not in conn_params: conn_params["timezone"] = "UTC" self._conn = snowflake_lib.connect( - schema=self.fully_qualified_dataset_name(), - **conn_params + schema=self.fully_qualified_dataset_name(), **conn_params ) return self._conn @@ -77,7 +85,9 @@ def drop_tables(self, *tables: str) -> None: with suppress(DatabaseUndefinedRelation): super().drop_tables(*tables) - def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: with self.execute_query(sql, *args, **kwargs) as curr: if curr.description is None: return None @@ -115,7 +125,7 @@ def _reset_connection(self) -> None: @classmethod def _make_database_exception(cls, ex: Exception) -> Exception: if isinstance(ex, snowflake_lib.errors.ProgrammingError): - if ex.sqlstate == 'P0000' and ex.errno == 100132: + if ex.sqlstate == "P0000" and ex.errno == 100132: # Error in a multi statement execution. These don't show the original error codes msg = str(ex) if "NULL result in a non-nullable column" in msg: @@ -124,11 +134,11 @@ def _make_database_exception(cls, ex: Exception) -> Exception: return DatabaseUndefinedRelation(ex) else: return DatabaseTransientException(ex) - if ex.sqlstate in {'42S02', '02000'}: + if ex.sqlstate in {"42S02", "02000"}: return DatabaseUndefinedRelation(ex) - elif ex.sqlstate == '22023': # Adding non-nullable no-default column + elif ex.sqlstate == "22023": # Adding non-nullable no-default column return DatabaseTerminalException(ex) - elif ex.sqlstate == '42000' and ex.errno == 904: # Invalid identifier + elif ex.sqlstate == "42000" and ex.errno == 904: # Invalid identifier return DatabaseTerminalException(ex) elif ex.sqlstate == "22000": return DatabaseTerminalException(ex) @@ -152,7 +162,9 @@ def _make_database_exception(cls, ex: Exception) -> Exception: return ex @staticmethod - def _maybe_make_terminal_exception_from_data_error(snowflake_ex: snowflake_lib.DatabaseError) -> Optional[Exception]: + def _maybe_make_terminal_exception_from_data_error( + snowflake_ex: snowflake_lib.DatabaseError, + ) -> Optional[Exception]: return None @staticmethod diff --git a/dlt/destinations/impl/weaviate/ci_naming.py b/dlt/destinations/impl/weaviate/ci_naming.py index 3b1c068133..cc8936f42d 100644 --- a/dlt/destinations/impl/weaviate/ci_naming.py +++ b/dlt/destinations/impl/weaviate/ci_naming.py @@ -1,5 +1,6 @@ from .naming import NamingConvention as WeaviateNamingConvention + class NamingConvention(WeaviateNamingConvention): def _lowercase_property(self, identifier: str) -> str: """Lowercase the whole property to become case insensitive""" diff --git a/dlt/destinations/impl/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py index 054e8bef25..ad6397b395 100644 --- a/dlt/destinations/impl/weaviate/configuration.py +++ b/dlt/destinations/impl/weaviate/configuration.py @@ -34,20 +34,22 @@ class WeaviateClientConfiguration(DestinationClientDwhConfiguration): batch_retries: int = 5 conn_timeout: float = 10.0 - read_timeout: float = 3*60.0 + read_timeout: float = 3 * 60.0 startup_period: int = 5 dataset_separator: str = "_" credentials: WeaviateCredentials vectorizer: str = 
"text2vec-openai" - module_config: Dict[str, Dict[str, str]] = field(default_factory=lambda: { - "text2vec-openai": { - "model": "ada", - "modelVersion": "002", - "type": "text", + module_config: Dict[str, Dict[str, str]] = field( + default_factory=lambda: { + "text2vec-openai": { + "model": "ada", + "modelVersion": "002", + "type": "text", + } } - }) + ) def fingerprint(self) -> str: """Returns a fingerprint of host part of a connection string""" diff --git a/dlt/destinations/impl/weaviate/exceptions.py b/dlt/destinations/impl/weaviate/exceptions.py index adec0fee1e..bff1b4cacc 100644 --- a/dlt/destinations/impl/weaviate/exceptions.py +++ b/dlt/destinations/impl/weaviate/exceptions.py @@ -7,6 +7,10 @@ class WeaviateBatchError(DestinationException): class PropertyNameConflict(DestinationTerminalException): def __init__(self) -> None: - super().__init__("Your data contains items with identical property names when compared case insensitive. Weaviate cannot handle such data." - " Please clean up your data before loading or change to case insensitive naming convention." - " See https://dlthub.com/docs/dlt-ecosystem/destinations/weaviate#names-normalization for details.") + super().__init__( + "Your data contains items with identical property names when compared case insensitive." + " Weaviate cannot handle such data. Please clean up your data before loading or change" + " to case insensitive naming convention. See" + " https://dlthub.com/docs/dlt-ecosystem/destinations/weaviate#names-normalization for" + " details." + ) diff --git a/dlt/destinations/impl/weaviate/factory.py b/dlt/destinations/impl/weaviate/factory.py index b29d02b1a7..2e63e437cf 100644 --- a/dlt/destinations/impl/weaviate/factory.py +++ b/dlt/destinations/impl/weaviate/factory.py @@ -2,7 +2,10 @@ from dlt.common.destination import Destination, DestinationCapabilitiesContext -from dlt.destinations.impl.weaviate.configuration import WeaviateCredentials, WeaviateClientConfiguration +from dlt.destinations.impl.weaviate.configuration import ( + WeaviateCredentials, + WeaviateClientConfiguration, +) from dlt.destinations.impl.weaviate import capabilities if t.TYPE_CHECKING: @@ -10,7 +13,6 @@ class weaviate(Destination[WeaviateClientConfiguration, "WeaviateClient"]): - spec = WeaviateClientConfiguration def capabilities(self) -> DestinationCapabilitiesContext: @@ -40,8 +42,5 @@ def __init__( **kwargs: Additional arguments forwarded to the destination config """ super().__init__( - credentials=credentials, - vectorizer=vectorizer, - module_config=module_config, - **kwargs + credentials=credentials, vectorizer=vectorizer, module_config=module_config, **kwargs ) diff --git a/dlt/destinations/impl/weaviate/naming.py b/dlt/destinations/impl/weaviate/naming.py index cf01983b90..f5c94c872f 100644 --- a/dlt/destinations/impl/weaviate/naming.py +++ b/dlt/destinations/impl/weaviate/naming.py @@ -7,11 +7,7 @@ class NamingConvention(SnakeCaseNamingConvention): """Normalizes identifiers according to Weaviate documentation: https://weaviate.io/developers/weaviate/config-refs/schema#class""" - RESERVED_PROPERTIES = { - "id": "__id", - "_id": "___id", - "_additional": "__additional" - } + RESERVED_PROPERTIES = {"id": "__id", "_id": "___id", "_additional": "__additional"} _RE_UNDERSCORES = re.compile("([^_])__+") _STARTS_DIGIT = re.compile("^[0-9]") _STARTS_NON_LETTER = re.compile("^[0-9_]") @@ -19,7 +15,7 @@ class NamingConvention(SnakeCaseNamingConvention): def normalize_identifier(self, identifier: str) -> str: """Normalizes Weaviate property name 
by removing not allowed characters, replacing them by _ and contracting multiple _ into single one - and lowercasing the first character. + and lowercasing the first character. """ identifier = BaseNamingConvention.normalize_identifier(self, identifier) @@ -34,12 +30,15 @@ def normalize_identifier(self, identifier: str) -> str: def normalize_table_identifier(self, identifier: str) -> str: """Creates Weaviate class name. Runs property normalization and then creates capitalized case name by splitting on _ - https://weaviate.io/developers/weaviate/configuration/schema-configuration#create-a-class + https://weaviate.io/developers/weaviate/configuration/schema-configuration#create-a-class """ identifier = BaseNamingConvention.normalize_identifier(self, identifier) norm_identifier = self._base_normalize(identifier) # norm_identifier = norm_identifier.strip("_") - norm_identifier = "".join(s[1:2].upper() + s[2:] if s and s[0] == "_" else s for s in self._SPLIT_UNDERSCORE_NON_CAP.split(norm_identifier)) + norm_identifier = "".join( + s[1:2].upper() + s[2:] if s and s[0] == "_" else s + for s in self._SPLIT_UNDERSCORE_NON_CAP.split(norm_identifier) + ) norm_identifier = norm_identifier[0].upper() + norm_identifier[1:] if self._STARTS_NON_LETTER.match(norm_identifier): norm_identifier = "C" + norm_identifier diff --git a/dlt/destinations/impl/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py index bbb3f1c9da..2d5161d9e9 100644 --- a/dlt/destinations/impl/weaviate/weaviate_adapter.py +++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py @@ -69,8 +69,7 @@ def weaviate_adapter( vectorize = [vectorize] if not isinstance(vectorize, list): raise ValueError( - "vectorize must be a list of column names or a single " - "column name as a string" + "vectorize must be a list of column names or a single column name as a string" ) # create weaviate-specific vectorize hints for column_name in vectorize: @@ -83,7 +82,10 @@ def weaviate_adapter( for column_name, method in tokenization.items(): if method not in TOKENIZATION_METHODS: allowed_methods = ", ".join(TOKENIZATION_METHODS) - raise ValueError(f"Tokenization type {method} for column {column_name} is invalid. Allowed methods are: {allowed_methods}") + raise ValueError( + f"Tokenization type {method} for column {column_name} is invalid. 
Allowed" + f" methods are: {allowed_methods}" + ) if column_name in column_hints: column_hints[column_name][TOKENIZATION_HINT] = method # type: ignore else: diff --git a/dlt/destinations/impl/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py index 099cdc7368..eb096d0a26 100644 --- a/dlt/destinations/impl/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -31,13 +31,7 @@ from dlt.common.schema.typing import TColumnSchema, TColumnType from dlt.common.schema.utils import get_columns_names_with_prop from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import ( - TLoadJobState, - LoadJob, - JobClientBase, - WithStateSync - -) +from dlt.common.destination.reference import TLoadJobState, LoadJob, JobClientBase, WithStateSync from dlt.common.data_types import TDataType from dlt.common.storages import FileStorage @@ -55,7 +49,7 @@ "vectorizer": "none", "vectorIndexConfig": { "skip": True, - } + }, } @@ -105,7 +99,9 @@ def _wrap(self: JobClientBase, *args: Any, **kwargs: Any) -> Any: if status_ex.status_code == 403: raise DestinationTerminalException(status_ex) if status_ex.status_code == 422: - if "conflict for property" in str(status_ex) or "none vectorizer module" in str(status_ex): + if "conflict for property" in str(status_ex) or "none vectorizer module" in str( + status_ex + ): raise PropertyNameConflict() raise DestinationTerminalException(status_ex) # looks like there are no more terminal exception @@ -133,9 +129,7 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: ) if "conflict for property" in message: raise PropertyNameConflict() - raise DestinationTransientException( - f"Batch failed {errors} AND WILL BE RETRIED" - ) + raise DestinationTransientException(f"Batch failed {errors} AND WILL BE RETRIED") except Exception: raise DestinationTransientException("Batch failed AND WILL BE RETRIED") @@ -194,9 +188,7 @@ def check_batch_result(results: List[StrAny]) -> None: weaviate_error_retries=weaviate.WeaviateErrorRetryConf( self.client_config.batch_retries ), - consistency_level=weaviate.ConsistencyLevel[ - self.client_config.batch_consistency - ], + consistency_level=weaviate.ConsistencyLevel[self.client_config.batch_consistency], num_workers=self.client_config.batch_workers, callback=check_batch_result, ) as batch: @@ -210,9 +202,7 @@ def check_batch_result(results: List[StrAny]) -> None: if key in data: data[key] = str(ensure_pendulum_datetime(data[key])) if self.unique_identifiers: - uuid = self.generate_uuid( - data, self.unique_identifiers, self.class_name - ) + uuid = self.generate_uuid(data, self.unique_identifiers, self.class_name) else: uuid = None @@ -242,7 +232,14 @@ class WeaviateClient(JobClientBase, WithStateSync): """Weaviate client implementation.""" capabilities: ClassVar[DestinationCapabilitiesContext] = capabilities() - state_properties: ClassVar[List[str]] = ["version", "engine_version", "pipeline_name", "state", "created_at", "_dlt_load_id"] + state_properties: ClassVar[List[str]] = [ + "version", + "engine_version", + "pipeline_name", + "state", + "created_at", + "_dlt_load_id", + ] def __init__(self, schema: Schema, config: WeaviateClientConfiguration) -> None: super().__init__(schema, config) @@ -266,7 +263,11 @@ def sentinel_class(self) -> str: @staticmethod def create_db_client(config: WeaviateClientConfiguration) -> weaviate.Client: - auth_client_secret: weaviate.AuthApiKey = weaviate.AuthApiKey(api_key=config.credentials.api_key) if 
config.credentials.api_key else None + auth_client_secret: weaviate.AuthApiKey = ( + weaviate.AuthApiKey(api_key=config.credentials.api_key) + if config.credentials.api_key + else None + ) return weaviate.Client( url=config.credentials.url, timeout_config=(config.conn_timeout, config.read_timeout), @@ -314,9 +315,7 @@ def create_class( self.db_client.schema.create_class(updated_schema) - def create_class_property( - self, class_name: str, prop_schema: Dict[str, Any] - ) -> None: + def create_class_property(self, class_name: str, prop_schema: Dict[str, Any]) -> None: """Create a Weaviate class property. Args: @@ -434,14 +433,14 @@ def update_stored_schema( if schema_info is None: logger.info( f"Schema with hash {self.schema.stored_version_hash} " - f"not found in the storage. upgrading" + "not found in the storage. upgrading" ) self._execute_schema_update(only_tables) else: logger.info( f"Schema with hash {self.schema.stored_version_hash} " f"inserted at {schema_info.inserted_at} found " - f"in storage, no upgrade required" + "in storage, no upgrade required" ) return applied_update @@ -450,12 +449,8 @@ def _execute_schema_update(self, only_tables: Iterable[str]) -> None: for table_name in only_tables or self.schema.tables: exists, existing_columns = self.get_storage_table(table_name) # TODO: detect columns where vectorization was added or removed and modify it. currently we ignore change of hints - new_columns = self.schema.get_new_table_columns( - table_name, existing_columns - ) - logger.info( - f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}" - ) + new_columns = self.schema.get_new_table_columns(table_name, existing_columns) + logger.info(f"Found {len(new_columns)} updates for {table_name} in {self.schema.name}") if len(new_columns) > 0: if exists: for column in new_columns: @@ -493,26 +488,33 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: stepsize = 10 offset = 0 while True: - state_records = self.get_records(self.schema.state_table_name, - sort={ - "path": ["created_at"], - "order": "desc" - }, where={ + state_records = self.get_records( + self.schema.state_table_name, + sort={"path": ["created_at"], "order": "desc"}, + where={ "path": ["pipeline_name"], "operator": "Equal", "valueString": pipeline_name, - }, limit=stepsize, offset=offset, properties=self.state_properties) + }, + limit=stepsize, + offset=offset, + properties=self.state_properties, + ) offset += stepsize if len(state_records) == 0: return None for state in state_records: load_id = state["_dlt_load_id"] - load_records = self.get_records(self.schema.loads_table_name, - where={ + load_records = self.get_records( + self.schema.loads_table_name, + where={ "path": ["load_id"], "operator": "Equal", "valueString": load_id, - }, limit=1, properties=["load_id", "status"]) + }, + limit=1, + properties=["load_id", "status"], + ) # if there is a load for this state which was successful, return the state if len(load_records): state["dlt_load_id"] = state.pop("_dlt_load_id") @@ -532,33 +534,45 @@ def get_stored_state(self, pipeline_name: str) -> Optional[StateInfo]: def get_stored_schema(self) -> Optional[StorageSchemaInfo]: """Retrieves newest schema from destination storage""" try: - record = self.get_records(self.schema.version_table_name, sort={ - "path": ["inserted_at"], - "order": "desc" - }, where={ + record = self.get_records( + self.schema.version_table_name, + sort={"path": ["inserted_at"], "order": "desc"}, + where={ "path": ["schema_name"], "operator": "Equal", 
"valueString": self.schema.name, }, - limit=1)[0] + limit=1, + )[0] return StorageSchemaInfo(**record) except IndexError: return None def get_stored_schema_by_hash(self, schema_hash: str) -> Optional[StorageSchemaInfo]: try: - record = self.get_records(self.schema.version_table_name, where={ + record = self.get_records( + self.schema.version_table_name, + where={ "path": ["version_hash"], "operator": "Equal", "valueString": schema_hash, - }, limit=1)[0] + }, + limit=1, + )[0] return StorageSchemaInfo(**record) except IndexError: return None @wrap_weaviate_error - def get_records(self, table_name: str, where: Dict[str, Any] = None, sort: Dict[str, Any] = None, limit: int = 0, offset: int = 0, properties: List[str] = None) -> List[Dict[str, Any]]: - + def get_records( + self, + table_name: str, + where: Dict[str, Any] = None, + sort: Dict[str, Any] = None, + limit: int = 0, + offset: int = 0, + properties: List[str] = None, + ) -> List[Dict[str, Any]]: # fail if schema does not exist? self.get_class_schema(table_name) @@ -578,7 +592,7 @@ def get_records(self, table_name: str, where: Dict[str, Any] = None, sort: Dict[ response = query.do() full_class_name = self.make_qualified_class_name(table_name) records = response["data"]["Get"][full_class_name] - return cast(List[Dict[str, Any]],records) + return cast(List[Dict[str, Any]], records) def make_weaviate_class_schema(self, table_name: str) -> Dict[str, Any]: """Creates a Weaviate class schema from a table schema.""" @@ -631,9 +645,7 @@ def _make_property_schema(self, column_name: str, column: TColumnSchema) -> Dict **extra_kv, } - def start_file_load( - self, table: TTableSchema, file_path: str, load_id: str - ) -> LoadJob: + def start_file_load(self, table: TTableSchema, file_path: str, load_id: str) -> LoadJob: return LoadWeaviateJob( self.schema, table, @@ -656,7 +668,6 @@ def complete_load(self, load_id: str) -> None: } self.create_object(properties, self.schema.loads_table_name) - def __enter__(self) -> "WeaviateClient": return self @@ -680,5 +691,7 @@ def _update_schema_in_storage(self, schema: Schema) -> None: } self.create_object(properties, self.schema.version_table_name) - def _from_db_type(self, wt_t: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, wt_t: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: return self.type_mapper.from_db_type(wt_t, precision, scale) diff --git a/dlt/destinations/insert_job_client.py b/dlt/destinations/insert_job_client.py index d5759db6c2..678ba43bcc 100644 --- a/dlt/destinations/insert_job_client.py +++ b/dlt/destinations/insert_job_client.py @@ -18,7 +18,9 @@ def __init__(self, table_name: str, file_path: str, sql_client: SqlClientBase[An self._sql_client = sql_client # insert file content immediately with self._sql_client.begin_transaction(): - for fragments in self._insert(sql_client.make_qualified_table_name(table_name), file_path): + for fragments in self._insert( + sql_client.make_qualified_table_name(table_name), file_path + ): self._sql_client.execute_fragments(fragments) def state(self) -> TLoadJobState: @@ -90,7 +92,6 @@ def _insert(self, qualified_table_name: str, file_path: str) -> Iterator[List[st class InsertValuesJobClient(SqlJobClientWithStaging): - def restore_file_load(self, file_path: str) -> LoadJob: """Returns a completed SqlLoadJob or InsertValuesJob diff --git a/dlt/destinations/job_client_impl.py b/dlt/destinations/job_client_impl.py index 7dabf278c2..ac68cfea8a 100644 --- 
a/dlt/destinations/job_client_impl.py +++ b/dlt/destinations/job_client_impl.py @@ -6,18 +6,56 @@ from copy import copy import datetime # noqa: 251 from types import TracebackType -from typing import Any, ClassVar, List, NamedTuple, Optional, Sequence, Tuple, Type, Iterable, Iterator, ContextManager, cast +from typing import ( + Any, + ClassVar, + List, + NamedTuple, + Optional, + Sequence, + Tuple, + Type, + Iterable, + Iterator, + ContextManager, + cast, +) import zlib import re from dlt.common import json, pendulum, logger from dlt.common.data_types import TDataType -from dlt.common.schema.typing import COLUMN_HINTS, TColumnType, TColumnSchemaBase, TTableSchema, TWriteDisposition, TTableFormat +from dlt.common.schema.typing import ( + COLUMN_HINTS, + TColumnType, + TColumnSchemaBase, + TTableSchema, + TWriteDisposition, + TTableFormat, +) from dlt.common.storages import FileStorage from dlt.common.schema import TColumnSchema, Schema, TTableSchemaColumns, TSchemaTables -from dlt.common.destination.reference import StateInfo, StorageSchemaInfo,WithStateSync, DestinationClientConfiguration, DestinationClientDwhConfiguration, DestinationClientDwhWithStagingConfiguration, NewLoadJob, WithStagingDataset, TLoadJobState, LoadJob, JobClientBase, FollowupJob, CredentialsConfiguration +from dlt.common.destination.reference import ( + StateInfo, + StorageSchemaInfo, + WithStateSync, + DestinationClientConfiguration, + DestinationClientDwhConfiguration, + DestinationClientDwhWithStagingConfiguration, + NewLoadJob, + WithStagingDataset, + TLoadJobState, + LoadJob, + JobClientBase, + FollowupJob, + CredentialsConfiguration, +) from dlt.common.utils import concat_strings_with_limit -from dlt.destinations.exceptions import DatabaseUndefinedRelation, DestinationSchemaTampered, DestinationSchemaWillNotUpdate +from dlt.destinations.exceptions import ( + DatabaseUndefinedRelation, + DestinationSchemaTampered, + DestinationSchemaWillNotUpdate, +) from dlt.destinations.job_impl import EmptyLoadJobWithoutFollowup, NewReferenceJob from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME @@ -26,11 +64,8 @@ from dlt.destinations.sql_client import SqlClientBase # this should suffice for now -DDL_COMMANDS = [ - "ALTER", - "CREATE", - "DROP" -] +DDL_COMMANDS = ["ALTER", "CREATE", "DROP"] + class SqlLoadJob(LoadJob): """A job executing sql statement, without followup trait""" @@ -42,7 +77,10 @@ def __init__(self, file_path: str, sql_client: SqlClientBase[Any]) -> None: sql = f.read() # if we detect ddl transactions, only execute transaction if supported by client - if not self._string_containts_ddl_queries(sql) or sql_client.capabilities.supports_ddl_transactions: + if ( + not self._string_containts_ddl_queries(sql) + or sql_client.capabilities.supports_ddl_transactions + ): # with sql_client.begin_transaction(): sql_client.execute_sql(sql) else: @@ -68,7 +106,13 @@ def is_sql_job(file_path: str) -> bool: class CopyRemoteFileLoadJob(LoadJob, FollowupJob): - def __init__(self, table: TTableSchema, file_path: str, sql_client: SqlClientBase[Any], staging_credentials: Optional[CredentialsConfiguration] = None) -> None: + def __init__( + self, + table: TTableSchema, + file_path: str, + sql_client: SqlClientBase[Any], + staging_credentials: Optional[CredentialsConfiguration] = None, + ) -> None: super().__init__(FileStorage.get_file_name_from_file_path(file_path)) self._sql_client = sql_client self._staging_credentials = 
staging_credentials @@ -85,13 +129,35 @@ def state(self) -> TLoadJobState: class SqlJobClientBase(JobClientBase, WithStateSync): - - _VERSION_TABLE_SCHEMA_COLUMNS: ClassVar[Tuple[str, ...]] = ('version_hash', 'schema_name', 'version', 'engine_version', 'inserted_at', 'schema') - _STATE_TABLE_COLUMNS: ClassVar[Tuple[str, ...]] = ('version', 'engine_version', 'pipeline_name', 'state', 'created_at', '_dlt_load_id') - - def __init__(self, schema: Schema, config: DestinationClientConfiguration, sql_client: SqlClientBase[TNativeConn]) -> None: - self.version_table_schema_columns = ", ".join(sql_client.escape_column_name(col) for col in self._VERSION_TABLE_SCHEMA_COLUMNS) - self.state_table_columns = ", ".join(sql_client.escape_column_name(col) for col in self._STATE_TABLE_COLUMNS) + _VERSION_TABLE_SCHEMA_COLUMNS: ClassVar[Tuple[str, ...]] = ( + "version_hash", + "schema_name", + "version", + "engine_version", + "inserted_at", + "schema", + ) + _STATE_TABLE_COLUMNS: ClassVar[Tuple[str, ...]] = ( + "version", + "engine_version", + "pipeline_name", + "state", + "created_at", + "_dlt_load_id", + ) + + def __init__( + self, + schema: Schema, + config: DestinationClientConfiguration, + sql_client: SqlClientBase[TNativeConn], + ) -> None: + self.version_table_schema_columns = ", ".join( + sql_client.escape_column_name(col) for col in self._VERSION_TABLE_SCHEMA_COLUMNS + ) + self.state_table_columns = ", ".join( + sql_client.escape_column_name(col) for col in self._STATE_TABLE_COLUMNS + ) super().__init__(schema, config) self.sql_client = sql_client @@ -112,17 +178,25 @@ def initialize_storage(self, truncate_tables: Iterable[str] = None) -> None: def is_storage_initialized(self) -> bool: return self.sql_client.has_dataset() - def update_stored_schema(self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None) -> Optional[TSchemaTables]: + def update_stored_schema( + self, only_tables: Iterable[str] = None, expected_update: TSchemaTables = None + ) -> Optional[TSchemaTables]: super().update_stored_schema(only_tables, expected_update) applied_update: TSchemaTables = {} schema_info = self.get_stored_schema_by_hash(self.schema.stored_version_hash) if schema_info is None: - logger.info(f"Schema with hash {self.schema.stored_version_hash} not found in the storage. upgrading") + logger.info( + f"Schema with hash {self.schema.stored_version_hash} not found in the storage." 
+ " upgrading" + ) with self.maybe_ddl_transaction(): applied_update = self._execute_schema_update_sql(only_tables) else: - logger.info(f"Schema with hash {self.schema.stored_version_hash} inserted at {schema_info.inserted_at} found in storage, no upgrade required") + logger.info( + f"Schema with hash {self.schema.stored_version_hash} inserted at" + f" {schema_info.inserted_at} found in storage, no upgrade required" + ) return applied_update def drop_tables(self, *tables: str, replace_schema: bool = True) -> None: @@ -141,7 +215,10 @@ def maybe_ddl_transaction(self) -> Iterator[None]: yield def should_truncate_table_before_load(self, table: TTableSchema) -> bool: - return table["write_disposition"] == "replace" and self.config.replace_strategy == "truncate-and-insert" + return ( + table["write_disposition"] == "replace" + and self.config.replace_strategy == "truncate-and-insert" + ) def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [] @@ -149,13 +226,19 @@ def _create_append_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> L def _create_merge_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: return [SqlMergeJob.from_table_chain(table_chain, self.sql_client)] - def _create_replace_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def _create_replace_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: jobs: List[NewLoadJob] = [] if self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]: - jobs.append(SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True})) + jobs.append( + SqlStagingCopyJob.from_table_chain(table_chain, self.sql_client, {"replace": True}) + ) return jobs - def create_table_chain_completed_followup_jobs(self, table_chain: Sequence[TTableSchema]) -> List[NewLoadJob]: + def create_table_chain_completed_followup_jobs( + self, table_chain: Sequence[TTableSchema] + ) -> List[NewLoadJob]: """Creates a list of followup jobs for merge write disposition and staging replace strategies""" jobs = super().create_table_chain_completed_followup_jobs(table_chain) write_disposition = table_chain[0]["write_disposition"] @@ -194,19 +277,25 @@ def complete_load(self, load_id: str) -> None: name = self.sql_client.make_qualified_table_name(self.schema.loads_table_name) now_ts = pendulum.now() self.sql_client.execute_sql( - f"INSERT INTO {name}(load_id, schema_name, status, inserted_at, schema_version_hash) VALUES(%s, %s, %s, %s, %s);", - load_id, self.schema.name, 0, now_ts, self.schema.version_hash + f"INSERT INTO {name}(load_id, schema_name, status, inserted_at, schema_version_hash)" + " VALUES(%s, %s, %s, %s, %s);", + load_id, + self.schema.name, + 0, + now_ts, + self.schema.version_hash, ) def __enter__(self) -> "SqlJobClientBase": self.sql_client.open_connection() return self - def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType + ) -> None: self.sql_client.close_connection() def get_storage_table(self, table_name: str) -> Tuple[bool, TTableSchemaColumns]: - def _null_to_bool(v: str) -> bool: if v == "NO": return False @@ -217,7 +306,9 @@ def _null_to_bool(v: str) -> bool: fields = ["column_name", "data_type", "is_nullable"] if self.capabilities.schema_supports_numeric_precision: fields += ["numeric_precision", "numeric_scale"] - db_params 
= self.sql_client.make_qualified_table_name(table_name, escape=False).split(".", 3) + db_params = self.sql_client.make_qualified_table_name(table_name, escape=False).split( + ".", 3 + ) query = f""" SELECT {",".join(fields)} FROM INFORMATION_SCHEMA.COLUMNS @@ -234,29 +325,40 @@ def _null_to_bool(v: str) -> bool: return False, schema_table # TODO: pull more data to infer indexes, PK and uniques attributes/constraints for c in rows: - numeric_precision = c[3] if self.capabilities.schema_supports_numeric_precision else None + numeric_precision = ( + c[3] if self.capabilities.schema_supports_numeric_precision else None + ) numeric_scale = c[4] if self.capabilities.schema_supports_numeric_precision else None schema_c: TColumnSchemaBase = { "name": c[0], "nullable": _null_to_bool(c[2]), - **self._from_db_type(c[1], numeric_precision, numeric_scale) + **self._from_db_type(c[1], numeric_precision, numeric_scale), } schema_table[c[0]] = schema_c # type: ignore return True, schema_table @abstractmethod - def _from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: + def _from_db_type( + self, db_type: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: pass def get_stored_schema(self) -> StorageSchemaInfo: name = self.sql_client.make_qualified_table_name(self.schema.version_table_name) - query = f"SELECT {self.version_table_schema_columns} FROM {name} WHERE schema_name = %s ORDER BY inserted_at DESC;" + query = ( + f"SELECT {self.version_table_schema_columns} FROM {name} WHERE schema_name = %s ORDER" + " BY inserted_at DESC;" + ) return self._row_to_schema_info(query, self.schema.name) def get_stored_state(self, pipeline_name: str) -> StateInfo: state_table = self.sql_client.make_qualified_table_name(self.schema.state_table_name) loads_table = self.sql_client.make_qualified_table_name(self.schema.loads_table_name) - query = f"SELECT {self.state_table_columns} FROM {state_table} AS s JOIN {loads_table} AS l ON l.load_id = s._dlt_load_id WHERE pipeline_name = %s AND l.status = 0 ORDER BY created_at DESC" + query = ( + f"SELECT {self.state_table_columns} FROM {state_table} AS s JOIN {loads_table} AS l ON" + " l.load_id = s._dlt_load_id WHERE pipeline_name = %s AND l.status = 0 ORDER BY" + " created_at DESC" + ) with self.sql_client.execute_query(query, pipeline_name) as cur: row = cur.fetchone() if not row: @@ -281,12 +383,16 @@ def _execute_schema_update_sql(self, only_tables: Iterable[str]) -> TSchemaTable sql_scripts, schema_update = self._build_schema_update_sql(only_tables) # stay within max query size when doing DDL. some db backends use bytes not characters so decrease limit by half # assuming that most of the characters in DDL encode into single bytes - for sql_fragment in concat_strings_with_limit(sql_scripts, "\n", self.capabilities.max_query_length // 2): + for sql_fragment in concat_strings_with_limit( + sql_scripts, "\n", self.capabilities.max_query_length // 2 + ): self.sql_client.execute_sql(sql_fragment) self._update_schema_in_storage(self.schema) return schema_update - def _build_schema_update_sql(self, only_tables: Iterable[str]) -> Tuple[List[str], TSchemaTables]: + def _build_schema_update_sql( + self, only_tables: Iterable[str] + ) -> Tuple[List[str], TSchemaTables]: """Generates CREATE/ALTER sql for tables that differ between the destination and in client's Schema. This method compares all or `only_tables` defined in self.schema to the respective tables in the destination. It detects only new tables and new columns. 
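Note on the hunk above: the generated CREATE/ALTER scripts are batched so that each executed chunk stays under the destination's query-size limit (halved to leave headroom for multi-byte characters). The following is a minimal standalone sketch of that batching idea, not dlt's concat_strings_with_limit itself; the function and variable names are illustrative only.

from typing import Iterator, List, Sequence

def batch_sql(fragments: Sequence[str], sep: str, max_len: int) -> Iterator[str]:
    """Yield joined chunks of `fragments`, each staying roughly under `max_len` characters."""
    current: List[str] = []
    current_len = 0
    for frag in fragments:
        frag_len = len(frag) + len(sep)
        # start a new chunk once adding this fragment would exceed the limit
        if current and current_len + frag_len > max_len:
            yield sep.join(current)
            current, current_len = [], 0
        current.append(frag)
        current_len += frag_len
    if current:
        yield sep.join(current)

# usage sketch, assuming a 1 MB query limit halved as in the hunk above:
# for chunk in batch_sql(sql_scripts, "\n", 1_000_000 // 2):
#     sql_client.execute_sql(chunk)
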
@@ -318,11 +424,15 @@ def _build_schema_update_sql(self, only_tables: Iterable[str]) -> Tuple[List[str return sql_updates, schema_update - def _make_add_column_sql(self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None) -> List[str]: + def _make_add_column_sql( + self, new_columns: Sequence[TColumnSchema], table_format: TTableFormat = None + ) -> List[str]: """Make one or more ADD COLUMN sql clauses to be joined in ALTER TABLE statement(s)""" return [f"ADD COLUMN {self._get_column_def_sql(c, table_format)}" for c in new_columns] - def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool) -> List[str]: + def _get_table_update_sql( + self, table_name: str, new_columns: Sequence[TColumnSchema], generate_alter: bool + ) -> List[str]: # build sql canonical_name = self.sql_client.make_qualified_table_name(table_name) table = self.get_load_table(table_name) @@ -342,20 +452,32 @@ def _get_table_update_sql(self, table_name: str, new_columns: Sequence[TColumnSc sql_result.append(sql_base + column_sql.join(add_column_statements)) else: # build ALTER as separate statement for each column (redshift limitation) - sql_result.extend([sql_base + col_statement for col_statement in add_column_statements]) + sql_result.extend( + [sql_base + col_statement for col_statement in add_column_statements] + ) # scan columns to get hints if generate_alter: # no hints may be specified on added columns for hint in COLUMN_HINTS: if any(c.get(hint, False) is True for c in new_columns): - hint_columns = [self.capabilities.escape_identifier(c["name"]) for c in new_columns if c.get(hint, False)] + hint_columns = [ + self.capabilities.escape_identifier(c["name"]) + for c in new_columns + if c.get(hint, False) + ] if hint == "not_null": - logger.warning(f"Column(s) {hint_columns} with NOT NULL are being added to existing table {canonical_name}." - " If there's data in the table the operation will fail.") + logger.warning( + f"Column(s) {hint_columns} with NOT NULL are being added to existing" + f" table {canonical_name}. If there's data in the table the operation" + " will fail." + ) else: - logger.warning(f"Column(s) {hint_columns} with hint {hint} are being added to existing table {canonical_name}." - " Several hint types may not be added to existing tables.") + logger.warning( + f"Column(s) {hint_columns} with hint {hint} are being added to existing" + f" table {canonical_name}. Several hint types may not be added to" + " existing tables." + ) return sql_result @abstractmethod @@ -366,14 +488,16 @@ def _get_column_def_sql(self, c: TColumnSchema, table_format: TTableFormat = Non def _gen_not_null(v: bool) -> str: return "NOT NULL" if not v else "" - def _create_table_update(self, table_name: str, storage_columns: TTableSchemaColumns) -> Sequence[TColumnSchema]: + def _create_table_update( + self, table_name: str, storage_columns: TTableSchemaColumns + ) -> Sequence[TColumnSchema]: # compare table with stored schema and produce delta updates = self.schema.get_new_table_columns(table_name, storage_columns) logger.info(f"Found {len(updates)} updates for {table_name} in {self.schema.name}") return updates def _row_to_schema_info(self, query: str, *args: Any) -> StorageSchemaInfo: - row: Tuple[Any,...] = None + row: Tuple[Any, ...] 
= None # if there's no dataset/schema return none info with contextlib.suppress(DatabaseUndefinedRelation): with self.sql_client.execute_query(query, *args) as cur: @@ -401,9 +525,7 @@ def _replace_schema_in_storage(self, schema: Schema) -> None: Save the given schema in storage and remove all previous versions with the same name """ name = self.sql_client.make_qualified_table_name(self.schema.version_table_name) - self.sql_client.execute_sql( - f"DELETE FROM {name} WHERE schema_name = %s;", schema.name - ) + self.sql_client.execute_sql(f"DELETE FROM {name} WHERE schema_name = %s;", schema.name) self._update_schema_in_storage(schema) def _update_schema_in_storage(self, schema: Schema) -> None: @@ -425,16 +547,22 @@ def _commit_schema_update(self, schema: Schema, schema_str: str) -> None: name = self.sql_client.make_qualified_table_name(self.schema.version_table_name) # values = schema.version_hash, schema.name, schema.version, schema.ENGINE_VERSION, str(now_ts), schema_str self.sql_client.execute_sql( - f"INSERT INTO {name}({self.version_table_schema_columns}) VALUES (%s, %s, %s, %s, %s, %s);", schema.stored_version_hash, schema.name, schema.version, schema.ENGINE_VERSION, now_ts, schema_str + f"INSERT INTO {name}({self.version_table_schema_columns}) VALUES (%s, %s, %s, %s, %s," + " %s);", + schema.stored_version_hash, + schema.name, + schema.version, + schema.ENGINE_VERSION, + now_ts, + schema_str, ) class SqlJobClientWithStaging(SqlJobClientBase, WithStagingDataset): - in_staging_mode: bool = False @contextlib.contextmanager - def with_staging_dataset(self)-> Iterator["SqlJobClientBase"]: + def with_staging_dataset(self) -> Iterator["SqlJobClientBase"]: try: with self.sql_client.with_staging_dataset(True): self.in_staging_mode = True @@ -445,7 +573,8 @@ def with_staging_dataset(self)-> Iterator["SqlJobClientBase"]: def should_load_data_to_staging_dataset(self, table: TTableSchema) -> bool: if table["write_disposition"] == "merge": return True - elif table["write_disposition"] == "replace" and (self.config.replace_strategy in ["insert-from-staging", "staging-optimized"]): + elif table["write_disposition"] == "replace" and ( + self.config.replace_strategy in ["insert-from-staging", "staging-optimized"] + ): return True return False - diff --git a/dlt/destinations/job_impl.py b/dlt/destinations/job_impl.py index fb3ba48b6d..7a6b98544c 100644 --- a/dlt/destinations/job_impl.py +++ b/dlt/destinations/job_impl.py @@ -6,6 +6,7 @@ from dlt.common.destination.reference import NewLoadJob, FollowupJob, TLoadJobState, LoadJob from dlt.common.storages.load_storage import ParsedLoadJobFileName + class EmptyLoadJobWithoutFollowup(LoadJob): def __init__(self, file_name: str, status: TLoadJobState, exception: str = None) -> None: self._status = status @@ -13,7 +14,9 @@ def __init__(self, file_name: str, status: TLoadJobState, exception: str = None) super().__init__(file_name) @classmethod - def from_file_path(cls, file_path: str, status: TLoadJobState, message: str = None) -> "EmptyLoadJobWithoutFollowup": + def from_file_path( + cls, file_path: str, status: TLoadJobState, message: str = None + ) -> "EmptyLoadJobWithoutFollowup": return cls(FileStorage.get_file_name_from_file_path(file_path), status, exception=message) def state(self) -> TLoadJobState: @@ -38,9 +41,11 @@ def new_file_path(self) -> str: """Path to a newly created temporary job file""" return self._new_file_path -class NewReferenceJob(NewLoadJobImpl): - def __init__(self, file_name: str, status: TLoadJobState, exception: str = None, 
remote_path: str = None) -> None: +class NewReferenceJob(NewLoadJobImpl): + def __init__( + self, file_name: str, status: TLoadJobState, exception: str = None, remote_path: str = None + ) -> None: file_name = os.path.splitext(file_name)[0] + ".reference" super().__init__(file_name, status, exception) self._remote_path = remote_path diff --git a/dlt/destinations/path_utils.py b/dlt/destinations/path_utils.py index a6cf634452..047cb274e0 100644 --- a/dlt/destinations/path_utils.py +++ b/dlt/destinations/path_utils.py @@ -7,18 +7,9 @@ from dlt.destinations.exceptions import InvalidFilesystemLayout, CantExtractTablePrefix # TODO: ensure layout only has supported placeholders -SUPPORTED_PLACEHOLDERS = { - "schema_name", - "table_name", - "load_id", - "file_id", - "ext", - "curr_date" -} +SUPPORTED_PLACEHOLDERS = {"schema_name", "table_name", "load_id", "file_id", "ext", "curr_date"} -SUPPORTED_TABLE_NAME_PREFIX_PLACEHOLDERS = ( - "schema_name", -) +SUPPORTED_TABLE_NAME_PREFIX_PLACEHOLDERS = ("schema_name",) def check_layout(layout: str) -> List[str]: @@ -28,11 +19,14 @@ def check_layout(layout: str) -> List[str]: raise InvalidFilesystemLayout(invalid_placeholders) return placeholders + def get_placeholders(layout: str) -> List[str]: - return re.findall(r'\{(.*?)\}', layout) + return re.findall(r"\{(.*?)\}", layout) -def create_path(layout: str, schema_name: str, table_name: str, load_id: str, file_id: str, ext: str) -> str: +def create_path( + layout: str, schema_name: str, table_name: str, load_id: str, file_id: str, ext: str +) -> str: """create a filepath from the layout and our default params""" placeholders = check_layout(layout) path = layout.format( @@ -41,7 +35,7 @@ def create_path(layout: str, schema_name: str, table_name: str, load_id: str, fi load_id=load_id, file_id=file_id, ext=ext, - curr_date=str(pendulum.today()) + curr_date=str(pendulum.today()), ) # if extension is not defined, we append it at the end if "ext" not in placeholders: @@ -51,11 +45,11 @@ def create_path(layout: str, schema_name: str, table_name: str, load_id: str, fi def get_table_prefix_layout( layout: str, - supported_prefix_placeholders: Sequence[str] = SUPPORTED_TABLE_NAME_PREFIX_PLACEHOLDERS + supported_prefix_placeholders: Sequence[str] = SUPPORTED_TABLE_NAME_PREFIX_PLACEHOLDERS, ) -> str: """get layout fragment that defines positions of the table, cutting other placeholders - allowed `supported_prefix_placeholders` that may appear before table. + allowed `supported_prefix_placeholders` that may appear before table. """ placeholders = get_placeholders(layout) @@ -67,14 +61,20 @@ def get_table_prefix_layout( # fail if any other prefix is defined before table_name if [p for p in placeholders[:table_name_index] if p not in supported_prefix_placeholders]: if len(supported_prefix_placeholders) == 0: - details = "No other placeholders are allowed before {table_name} but you have %s present. " % placeholders[:table_name_index] + details = ( + "No other placeholders are allowed before {table_name} but you have %s present. " + % placeholders[:table_name_index] + ) else: - details = "Only %s are allowed before {table_name} but you have %s present. " % (supported_prefix_placeholders, placeholders[:table_name_index]) + details = "Only %s are allowed before {table_name} but you have %s present. 
" % ( + supported_prefix_placeholders, + placeholders[:table_name_index], + ) raise CantExtractTablePrefix(layout, details) # we include the char after the table_name here, this should be a separator not a new placeholder # this is to prevent selecting tables that have the same starting name - prefix = layout[:layout.index("{table_name}") + 13] + prefix = layout[: layout.index("{table_name}") + 13] if prefix[-1] == "{": raise CantExtractTablePrefix(layout, "A separator is required after a {table_name}. ") diff --git a/dlt/destinations/sql_client.py b/dlt/destinations/sql_client.py index 68af420085..1e5f7031a5 100644 --- a/dlt/destinations/sql_client.py +++ b/dlt/destinations/sql_client.py @@ -3,7 +3,19 @@ from functools import wraps import inspect from types import TracebackType -from typing import Any, ClassVar, ContextManager, Generic, Iterator, Optional, Sequence, Tuple, Type, AnyStr, List +from typing import ( + Any, + ClassVar, + ContextManager, + Generic, + Iterator, + Optional, + Sequence, + Tuple, + Type, + AnyStr, + List, +) from dlt.common.typing import TFun from dlt.common.destination import DestinationCapabilitiesContext @@ -13,7 +25,6 @@ class SqlClientBase(ABC, Generic[TNativeConn]): - dbapi: ClassVar[DBApi] = None capabilities: ClassVar[DestinationCapabilitiesContext] = None @@ -45,7 +56,9 @@ def __enter__(self) -> "SqlClientBase[TNativeConn]": self.open_connection() return self - def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType) -> None: + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: TracebackType + ) -> None: self.close_connection() @property @@ -78,20 +91,27 @@ def truncate_tables(self, *tables: str) -> None: def drop_tables(self, *tables: str) -> None: if not tables: return - statements = [f"DROP TABLE IF EXISTS {self.make_qualified_table_name(table)};" for table in tables] + statements = [ + f"DROP TABLE IF EXISTS {self.make_qualified_table_name(table)};" for table in tables + ] self.execute_fragments(statements) @abstractmethod - def execute_sql(self, sql: AnyStr, *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: + def execute_sql( + self, sql: AnyStr, *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: pass @abstractmethod - def execute_query(self, query: AnyStr, *args: Any, **kwargs: Any) -> ContextManager[DBApiCursor]: + def execute_query( + self, query: AnyStr, *args: Any, **kwargs: Any + ) -> ContextManager[DBApiCursor]: pass - def execute_fragments(self, fragments: Sequence[AnyStr], *args: Any, **kwargs: Any) -> Optional[Sequence[Sequence[Any]]]: - """Executes several SQL fragments as efficiently as possible to prevent data copying. Default implementation just joins the strings and executes them together. - """ + def execute_fragments( + self, fragments: Sequence[AnyStr], *args: Any, **kwargs: Any + ) -> Optional[Sequence[Sequence[Any]]]: + """Executes several SQL fragments as efficiently as possible to prevent data copying. 
Default implementation just joins the strings and executes them together.""" return self.execute_sql("".join(fragments), *args, **kwargs) # type: ignore @abstractmethod @@ -109,7 +129,9 @@ def escape_column_name(self, column_name: str, escape: bool = True) -> str: return column_name @contextmanager - def with_alternative_dataset_name(self, dataset_name: str) -> Iterator["SqlClientBase[TNativeConn]"]: + def with_alternative_dataset_name( + self, dataset_name: str + ) -> Iterator["SqlClientBase[TNativeConn]"]: """Sets the `dataset_name` as the default dataset during the lifetime of the context. Does not modify any search paths in the existing connection.""" current_dataset_name = self.dataset_name try: @@ -119,7 +141,9 @@ def with_alternative_dataset_name(self, dataset_name: str) -> Iterator["SqlClien # restore previous dataset name self.dataset_name = current_dataset_name - def with_staging_dataset(self, staging: bool = False)-> ContextManager["SqlClientBase[TNativeConn]"]: + def with_staging_dataset( + self, staging: bool = False + ) -> ContextManager["SqlClientBase[TNativeConn]"]: dataset_name = self.dataset_name if staging: dataset_name = SqlClientBase.make_staging_dataset_name(dataset_name) @@ -127,7 +151,7 @@ def with_staging_dataset(self, staging: bool = False)-> ContextManager["SqlClien def _ensure_native_conn(self) -> None: if not self.native_connection: - raise LoadClientNotConnected(type(self).__name__ , self.dataset_name) + raise LoadClientNotConnected(type(self).__name__, self.dataset_name) @staticmethod @abstractmethod @@ -156,6 +180,7 @@ def _truncate_table_sql(self, qualified_table_name: str) -> str: class DBApiCursorImpl(DBApiCursor): """A DBApi Cursor wrapper with dataframes reading functionality""" + def __init__(self, curr: DBApiCursor) -> None: self.native_cursor = curr @@ -187,7 +212,6 @@ def df(self, chunk_size: int = None, **kwargs: Any) -> Optional[DataFrame]: def raise_database_error(f: TFun) -> TFun: - @wraps(f) def _wrap_gen(self: SqlClientBase[Any], *args: Any, **kwargs: Any) -> Any: try: @@ -211,7 +235,6 @@ def _wrap(self: SqlClientBase[Any], *args: Any, **kwargs: Any) -> Any: def raise_open_connection_error(f: TFun) -> TFun: - @wraps(f) def _wrap(self: SqlClientBase[Any], *args: Any, **kwargs: Any) -> Any: try: diff --git a/dlt/destinations/sql_jobs.py b/dlt/destinations/sql_jobs.py index 4e8393ed74..5ed9e9ce2f 100644 --- a/dlt/destinations/sql_jobs.py +++ b/dlt/destinations/sql_jobs.py @@ -11,19 +11,26 @@ from dlt.destinations.job_impl import NewLoadJobImpl from dlt.destinations.sql_client import SqlClientBase + class SqlJobParams(TypedDict): replace: Optional[bool] -DEFAULTS: SqlJobParams = { - "replace": False -} + +DEFAULTS: SqlJobParams = {"replace": False} + class SqlBaseJob(NewLoadJobImpl): """Sql base job for jobs that rely on the whole tablechain""" + failed_text: str = "" @classmethod - def from_table_chain(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> NewLoadJobImpl: + def from_table_chain( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> NewLoadJobImpl: """Generates a list of sql statements, that will be executed by the sql client when the job is executed in the loader. The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). 
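Note on the hunk that follows: it reformats the step where multi-line statements are flattened so that each SQL statement occupies exactly one line of the job file, which lets clients that execute one statement at a time (i.e. Snowflake) split on newlines. A minimal standalone illustration of that step, using made-up statements:

statements = [
    "INSERT INTO issues(id, title)\nSELECT id, title\nFROM staging_issues;",
    "DELETE FROM issues\nWHERE id IS NULL;",
]
# same idea as `" ".join(stmt.splitlines())` in from_table_chain
one_per_line = [" ".join(stmt.splitlines()) for stmt in statements]
job_file_body = "\n".join(one_per_line)
# job_file_body now holds one statement per line:
# INSERT INTO issues(id, title) SELECT id, title FROM staging_issues;
# DELETE FROM issues WHERE id IS NULL;
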
@@ -34,44 +41,74 @@ def from_table_chain(cls, table_chain: Sequence[TTableSchema], sql_client: SqlCl try: # Remove line breaks from multiline statements and write one SQL statement per line in output file # to support clients that need to execute one statement at a time (i.e. snowflake) - sql = [' '.join(stmt.splitlines()) for stmt in cls.generate_sql(table_chain, sql_client, params)] + sql = [ + " ".join(stmt.splitlines()) + for stmt in cls.generate_sql(table_chain, sql_client, params) + ] job = cls(file_info.job_id(), "running") job._save_text_file("\n".join(sql)) except Exception: # return failed job - tables_str = yaml.dump(table_chain, allow_unicode=True, default_flow_style=False, sort_keys=False) + tables_str = yaml.dump( + table_chain, allow_unicode=True, default_flow_style=False, sort_keys=False + ) job = cls(file_info.job_id(), "failed", pretty_format_exception()) job._save_text_file("\n".join([cls.failed_text, tables_str])) return job @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: pass class SqlStagingCopyJob(SqlBaseJob): """Generates a list of sql statements that copy the data from staging dataset into destination dataset.""" + failed_text: str = "Tried to generate a staging copy sql job for the following tables:" @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: sql: List[str] = [] for table in table_chain: with sql_client.with_staging_dataset(staging=True): staging_table_name = sql_client.make_qualified_table_name(table["name"]) table_name = sql_client.make_qualified_table_name(table["name"]) - columns = ", ".join(map(sql_client.capabilities.escape_identifier, get_columns_names_with_prop(table, "name"))) + columns = ", ".join( + map( + sql_client.capabilities.escape_identifier, + get_columns_names_with_prop(table, "name"), + ) + ) if params["replace"]: sql.append(sql_client._truncate_table_sql(table_name)) - sql.append(f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name};") + sql.append( + f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name};" + ) return sql + class SqlMergeJob(SqlBaseJob): """Generates a list of sql statements that merge the data from staging dataset into destination dataset.""" + failed_text: str = "Tried to generate a merge sql job for the following tables:" @classmethod - def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any], params: Optional[SqlJobParams] = None) -> List[str]: + def generate_sql( + cls, + table_chain: Sequence[TTableSchema], + sql_client: SqlClientBase[Any], + params: Optional[SqlJobParams] = None, + ) -> List[str]: """Generates a list of sql statements that merge the data in staging dataset with the data in destination dataset. The `table_chain` contains a list schemas of a tables with parent-child relationship, ordered by the ancestry (the root of the tree is first on the list). 
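Note on the merge-generation hunks below: the key clauses are built as templates with {d}/{s} placeholders that are later formatted with the destination and staging aliases. A small standalone sketch of that pattern, using made-up table and column names (the real code escapes identifiers via the destination capabilities):

primary_keys = ['"id"', '"region"']  # already-escaped identifiers
clause = " AND ".join("%s.%s = %s.%s" % ("{d}", c, "{s}", c) for c in primary_keys)
# clause == '{d}."id" = {s}."id" AND {d}."region" = {s}."region"'
delete_from = (
    "FROM my_dataset.issues as d WHERE EXISTS "
    f"(SELECT 1 FROM staging.issues as s WHERE {clause.format(d='d', s='s')})"
)
# "DELETE " + delete_from + ";" removes the destination rows that have a match in staging
print("DELETE " + delete_from + ";")
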
@@ -84,29 +121,46 @@ def generate_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClient return cls.gen_merge_sql(table_chain, sql_client) @classmethod - def _gen_key_table_clauses(cls, primary_keys: Sequence[str], merge_keys: Sequence[str])-> List[str]: + def _gen_key_table_clauses( + cls, primary_keys: Sequence[str], merge_keys: Sequence[str] + ) -> List[str]: """Generate sql clauses to select rows to delete via merge and primary key. Return select all clause if no keys defined.""" clauses: List[str] = [] if primary_keys or merge_keys: if primary_keys: - clauses.append(" AND ".join(["%s.%s = %s.%s" % ("{d}", c, "{s}", c) for c in primary_keys])) + clauses.append( + " AND ".join(["%s.%s = %s.%s" % ("{d}", c, "{s}", c) for c in primary_keys]) + ) if merge_keys: - clauses.append(" AND ".join(["%s.%s = %s.%s" % ("{d}", c, "{s}", c) for c in merge_keys])) + clauses.append( + " AND ".join(["%s.%s = %s.%s" % ("{d}", c, "{s}", c) for c in merge_keys]) + ) return clauses or ["1=1"] @classmethod - def gen_key_table_clauses(cls, root_table_name: str, staging_root_table_name: str, key_clauses: Sequence[str], for_delete: bool) -> List[str]: + def gen_key_table_clauses( + cls, + root_table_name: str, + staging_root_table_name: str, + key_clauses: Sequence[str], + for_delete: bool, + ) -> List[str]: """Generate sql clauses that may be used to select or delete rows in root table of destination dataset - A list of clauses may be returned for engines that do not support OR in subqueries. Like BigQuery + A list of clauses may be returned for engines that do not support OR in subqueries. Like BigQuery """ - return [f"FROM {root_table_name} as d WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} as s WHERE {' OR '.join([c.format(d='d',s='s') for c in key_clauses])})"] + return [ + f"FROM {root_table_name} as d WHERE EXISTS (SELECT 1 FROM {staging_root_table_name} as" + f" s WHERE {' OR '.join([c.format(d='d',s='s') for c in key_clauses])})" + ] @classmethod - def gen_delete_temp_table_sql(cls, unique_column: str, key_table_clauses: Sequence[str]) -> Tuple[List[str], str]: + def gen_delete_temp_table_sql( + cls, unique_column: str, key_table_clauses: Sequence[str] + ) -> Tuple[List[str], str]: """Generate sql that creates delete temp table and inserts `unique_column` from root table for all records to delete. May return several statements. - Returns temp table name for cases where special names are required like SQLServer. + Returns temp table name for cases where special names are required like SQLServer. 
""" sql: List[str] = [] temp_table_name = cls._new_temp_table_name("delete") @@ -117,7 +171,9 @@ def gen_delete_temp_table_sql(cls, unique_column: str, key_table_clauses: Sequen return sql, temp_table_name @classmethod - def gen_insert_temp_table_sql(cls, staging_root_table_name: str, primary_keys: Sequence[str], unique_column: str) -> Tuple[List[str], str]: + def gen_insert_temp_table_sql( + cls, staging_root_table_name: str, primary_keys: Sequence[str], unique_column: str + ) -> Tuple[List[str], str]: temp_table_name = cls._new_temp_table_name("insert") select_statement = f""" SELECT {unique_column} @@ -146,7 +202,9 @@ def _to_temp_table(cls, select_sql: str, temp_table_name: str) -> str: return f"CREATE TEMP TABLE {temp_table_name} AS {select_sql};" @classmethod - def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any]) -> List[str]: + def gen_merge_sql( + cls, table_chain: Sequence[TTableSchema], sql_client: SqlClientBase[Any] + ) -> List[str]: sql: List[str] = [] root_table = table_chain[0] @@ -155,22 +213,35 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien with sql_client.with_staging_dataset(staging=True): staging_root_table_name = sql_client.make_qualified_table_name(root_table["name"]) # get merge and primary keys from top level - primary_keys = list(map(sql_client.capabilities.escape_identifier, get_columns_names_with_prop(root_table, "primary_key"))) - merge_keys = list(map(sql_client.capabilities.escape_identifier, get_columns_names_with_prop(root_table, "merge_key"))) + primary_keys = list( + map( + sql_client.capabilities.escape_identifier, + get_columns_names_with_prop(root_table, "primary_key"), + ) + ) + merge_keys = list( + map( + sql_client.capabilities.escape_identifier, + get_columns_names_with_prop(root_table, "merge_key"), + ) + ) key_clauses = cls._gen_key_table_clauses(primary_keys, merge_keys) unique_column: str = None root_key_column: str = None insert_temp_table_name: str = None - if len(table_chain) == 1: - key_table_clauses = cls.gen_key_table_clauses(root_table_name, staging_root_table_name, key_clauses, for_delete=True) + key_table_clauses = cls.gen_key_table_clauses( + root_table_name, staging_root_table_name, key_clauses, for_delete=True + ) # if no child tables, just delete data from top table for clause in key_table_clauses: sql.append(f"DELETE {clause};") else: - key_table_clauses = cls.gen_key_table_clauses(root_table_name, staging_root_table_name, key_clauses, for_delete=False) + key_table_clauses = cls.gen_key_table_clauses( + root_table_name, staging_root_table_name, key_clauses, for_delete=False + ) # use unique hint to create temp table with all identifiers to delete unique_columns = get_columns_names_with_prop(root_table, "unique") if not unique_columns: @@ -178,15 +249,21 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien sql_client.fully_qualified_dataset_name(), staging_root_table_name, [t["name"] for t in table_chain], - f"There is no unique column (ie _dlt_id) in top table {root_table['name']} so it is not possible to link child tables to it." 
+ f"There is no unique column (ie _dlt_id) in top table {root_table['name']} so" + " it is not possible to link child tables to it.", ) # get first unique column unique_column = sql_client.capabilities.escape_identifier(unique_columns[0]) # create temp table with unique identifier - create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql(unique_column, key_table_clauses) + create_delete_temp_table_sql, delete_temp_table_name = cls.gen_delete_temp_table_sql( + unique_column, key_table_clauses + ) sql.extend(create_delete_temp_table_sql) # delete top table - sql.append(f"DELETE FROM {root_table_name} WHERE {unique_column} IN (SELECT * FROM {delete_temp_table_name});") + sql.append( + f"DELETE FROM {root_table_name} WHERE {unique_column} IN (SELECT * FROM" + f" {delete_temp_table_name});" + ) # delete other tables for table in table_chain[1:]: table_name = sql_client.make_qualified_table_name(table["name"]) @@ -196,13 +273,22 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien sql_client.fully_qualified_dataset_name(), staging_root_table_name, [t["name"] for t in table_chain], - f"There is no root foreign key (ie _dlt_root_id) in child table {table['name']} so it is not possible to refer to top level table {root_table['name']} unique column {unique_column}" + "There is no root foreign key (ie _dlt_root_id) in child table" + f" {table['name']} so it is not possible to refer to top level table" + f" {root_table['name']} unique column {unique_column}", ) root_key_column = sql_client.capabilities.escape_identifier(root_key_columns[0]) - sql.append(f"DELETE FROM {table_name} WHERE {root_key_column} IN (SELECT * FROM {delete_temp_table_name});") + sql.append( + f"DELETE FROM {table_name} WHERE {root_key_column} IN (SELECT * FROM" + f" {delete_temp_table_name});" + ) # create temp table used to deduplicate, only when we have primary keys if primary_keys: - create_insert_temp_table_sql, insert_temp_table_name = cls.gen_insert_temp_table_sql(staging_root_table_name, primary_keys, unique_column) + create_insert_temp_table_sql, insert_temp_table_name = ( + cls.gen_insert_temp_table_sql( + staging_root_table_name, primary_keys, unique_column + ) + ) sql.extend(create_insert_temp_table_sql) # insert from staging to dataset, truncate staging table @@ -210,8 +296,15 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien table_name = sql_client.make_qualified_table_name(table["name"]) with sql_client.with_staging_dataset(staging=True): staging_table_name = sql_client.make_qualified_table_name(table["name"]) - columns = ", ".join(map(sql_client.capabilities.escape_identifier, get_columns_names_with_prop(table, "name"))) - insert_sql = f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name}" + columns = ", ".join( + map( + sql_client.capabilities.escape_identifier, + get_columns_names_with_prop(table, "name"), + ) + ) + insert_sql = ( + f"INSERT INTO {table_name}({columns}) SELECT {columns} FROM {staging_table_name}" + ) if len(primary_keys) > 0: if len(table_chain) == 1: insert_sql = f"""INSERT INTO {table_name}({columns}) @@ -222,11 +315,13 @@ def gen_merge_sql(cls, table_chain: Sequence[TTableSchema], sql_client: SqlClien """ else: uniq_column = unique_column if table.get("parent") is None else root_key_column - insert_sql += f" WHERE {uniq_column} IN (SELECT * FROM {insert_temp_table_name});" + insert_sql += ( + f" WHERE {uniq_column} IN (SELECT * FROM {insert_temp_table_name});" + ) if 
insert_sql.strip()[-1] != ";": insert_sql += ";" sql.append(insert_sql) # -- DELETE FROM {staging_table_name} WHERE 1=1; - return sql \ No newline at end of file + return sql diff --git a/dlt/destinations/type_mapping.py b/dlt/destinations/type_mapping.py index 3f09524bbf..765fd6dbed 100644 --- a/dlt/destinations/type_mapping.py +++ b/dlt/destinations/type_mapping.py @@ -20,11 +20,15 @@ class TypeMapper: def __init__(self, capabilities: DestinationCapabilitiesContext) -> None: self.capabilities = capabilities - def to_db_integer_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_integer_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: # Override in subclass if db supports other integer types (e.g. smallint, integer, tinyint, etc.) return self.sct_to_unbound_dbt["bigint"] - def to_db_datetime_type(self, precision: Optional[int], table_format: TTableFormat = None) -> str: + def to_db_datetime_type( + self, precision: Optional[int], table_format: TTableFormat = None + ) -> str: # Override in subclass if db supports other timestamp types (e.g. with different time resolutions) return None @@ -54,7 +58,9 @@ def to_db_type(self, column: TColumnSchema, table_format: TTableFormat = None) - return self.sct_to_unbound_dbt[sc_t] return self.sct_to_dbt[sc_t] % precision_tuple - def precision_tuple_or_default(self, data_type: TDataType, precision: Optional[int], scale: Optional[int]) -> Optional[Tuple[int, ...]]: + def precision_tuple_or_default( + self, data_type: TDataType, precision: Optional[int], scale: Optional[int] + ) -> Optional[Tuple[int, ...]]: if data_type in ("timestamp", "time"): if precision is None: return None # Use default which is usually the max @@ -66,30 +72,38 @@ def precision_tuple_or_default(self, data_type: TDataType, precision: Optional[i if precision is None: return None elif scale is None: - return (precision, ) + return (precision,) return (precision, scale) - def decimal_precision(self, precision: Optional[int] = None, scale: Optional[int] = None) -> Optional[Tuple[int, int]]: + def decimal_precision( + self, precision: Optional[int] = None, scale: Optional[int] = None + ) -> Optional[Tuple[int, int]]: defaults = self.capabilities.decimal_precision if not defaults: return None default_precision, default_scale = defaults return ( - precision if precision is not None else default_precision, scale if scale is not None else default_scale + precision if precision is not None else default_precision, + scale if scale is not None else default_scale, ) - def wei_precision(self, precision: Optional[int] = None, scale: Optional[int] = None) -> Optional[Tuple[int, int]]: + def wei_precision( + self, precision: Optional[int] = None, scale: Optional[int] = None + ) -> Optional[Tuple[int, int]]: defaults = self.capabilities.wei_precision if not defaults: return None default_precision, default_scale = defaults return ( - precision if precision is not None else default_precision, scale if scale is not None else default_scale + precision if precision is not None else default_precision, + scale if scale is not None else default_scale, ) - def from_db_type(self, db_type: str, precision: Optional[int], scale: Optional[int]) -> TColumnType: - return without_none(dict( # type: ignore[return-value] - data_type=self.dbt_to_sct.get(db_type, "text"), - precision=precision, - scale=scale - )) + def from_db_type( + self, db_type: str, precision: Optional[int], scale: Optional[int] + ) -> TColumnType: + return without_none( + 
dict( # type: ignore[return-value] + data_type=self.dbt_to_sct.get(db_type, "text"), precision=precision, scale=scale + ) + ) diff --git a/dlt/destinations/typing.py b/dlt/destinations/typing.py index 7edf69d2ea..99ffed01fd 100644 --- a/dlt/destinations/typing.py +++ b/dlt/destinations/typing.py @@ -1,4 +1,5 @@ from typing import Any, AnyStr, List, Type, Optional, Protocol, Tuple, TypeVar + try: from pandas import DataFrame except ImportError: @@ -7,12 +8,11 @@ # native connection TNativeConn = TypeVar("TNativeConn", bound=Any) + class DBTransaction(Protocol): - def commit_transaction(self) -> None: - ... + def commit_transaction(self) -> None: ... - def rollback_transaction(self) -> None: - ... + def rollback_transaction(self) -> None: ... class DBApi(Protocol): @@ -23,21 +23,17 @@ class DBApi(Protocol): class DBApiCursor(Protocol): """Protocol for DBAPI cursor""" + description: Tuple[Any, ...] native_cursor: "DBApiCursor" """Cursor implementation native to current destination""" - def execute(self, query: AnyStr, *args: Any, **kwargs: Any) -> None: - ... - def fetchall(self) -> List[Tuple[Any, ...]]: - ... - def fetchmany(self, size: int = ...) -> List[Tuple[Any, ...]]: - ... - def fetchone(self) -> Optional[Tuple[Any, ...]]: - ... - def close(self) -> None: - ... + def execute(self, query: AnyStr, *args: Any, **kwargs: Any) -> None: ... + def fetchall(self) -> List[Tuple[Any, ...]]: ... + def fetchmany(self, size: int = ...) -> List[Tuple[Any, ...]]: ... + def fetchone(self) -> Optional[Tuple[Any, ...]]: ... + def close(self) -> None: ... def df(self, chunk_size: int = None, **kwargs: None) -> Optional[DataFrame]: """Fetches the results as data frame. For large queries the results may be chunked @@ -54,4 +50,3 @@ def df(self, chunk_size: int = None, **kwargs: None) -> Optional[DataFrame]: Optional[DataFrame]: A data frame with query results. If chunk_size > 0, None will be returned if there is no more data in results """ ... 
- diff --git a/dlt/extract/__init__.py b/dlt/extract/__init__.py index cc6ff15759..9dcffdacb9 100644 --- a/dlt/extract/__init__.py +++ b/dlt/extract/__init__.py @@ -4,4 +4,14 @@ from dlt.extract.incremental import Incremental from dlt.extract.wrappers import wrap_additional_type -__all__ = ["DltResource", "DltSource", "with_table_name", "source", "resource", "transformer", "defer", "Incremental", "wrap_additional_type"] +__all__ = [ + "DltResource", + "DltSource", + "with_table_name", + "source", + "resource", + "transformer", + "defer", + "Incremental", + "wrap_additional_type", +] diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index 1dbfcb4350..cf7426e683 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -2,7 +2,21 @@ import inspect from types import ModuleType from functools import wraps -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Iterator, List, Literal, Optional, Tuple, Type, Union, cast, overload +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Iterator, + List, + Literal, + Optional, + Tuple, + Type, + Union, + cast, + overload, +) from typing_extensions import TypeVar from dlt.common.configuration import with_config, get_fun_spec, known_sections, configspec @@ -15,13 +29,35 @@ from dlt.common.pipeline import PipelineContext from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContract, TTableFormat -from dlt.extract.utils import ensure_table_schema_columns_hint, simulate_func_call, wrap_compat_transformer, wrap_resource_gen +from dlt.common.schema.typing import ( + TColumnNames, + TTableSchemaColumns, + TWriteDisposition, + TAnySchemaColumns, + TSchemaContract, + TTableFormat, +) +from dlt.extract.utils import ( + ensure_table_schema_columns_hint, + simulate_func_call, + wrap_compat_transformer, + wrap_resource_gen, +) from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage from dlt.common.typing import AnyFun, ParamSpec, Concatenate, TDataItem, TDataItems from dlt.common.utils import get_callable_name, get_module_name, is_inner_callable -from dlt.extract.exceptions import DynamicNameNotStandaloneResource, InvalidTransformerDataTypeGeneratorFunctionRequired, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, SourceDataIsNone, SourceIsAClassTypeError, ExplicitSourceNameInvalid, SourceNotAFunction, SourceSchemaNotAvailable +from dlt.extract.exceptions import ( + DynamicNameNotStandaloneResource, + InvalidTransformerDataTypeGeneratorFunctionRequired, + ResourceFunctionExpected, + ResourceInnerCallableConfigWrapDisallowed, + SourceDataIsNone, + SourceIsAClassTypeError, + ExplicitSourceNameInvalid, + SourceNotAFunction, + SourceSchemaNotAvailable, +) from dlt.extract.incremental import IncrementalResourceWrapper from dlt.extract.typing import TTableHintTemplate @@ -32,13 +68,15 @@ @configspec class SourceSchemaInjectableContext(ContainerInjectableContext): """A context containing the source schema, present when decorated function is executed""" + schema: Schema can_create_default: ClassVar[bool] = False if TYPE_CHECKING: - def __init__(self, schema: Schema = None) -> None: - ... + + def __init__(self, schema: Schema = None) -> None: ... 
+ TSourceFunParams = ParamSpec("TSourceFunParams") TResourceFunParams = ParamSpec("TResourceFunParams") @@ -56,9 +94,9 @@ def source( schema: Schema = None, schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, - _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] -) -> Callable[TSourceFunParams, DltSource]: - ... + _impl_cls: Type[TDltSourceImpl] = DltSource, # type: ignore[assignment] +) -> Callable[TSourceFunParams, DltSource]: ... + @overload def source( @@ -71,9 +109,9 @@ def source( schema: Schema = None, schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, - _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] -) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: - ... + _impl_cls: Type[TDltSourceImpl] = DltSource, # type: ignore[assignment] +) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: ... + def source( func: Optional[AnyFun] = None, @@ -85,7 +123,7 @@ def source( schema: Schema = None, schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, - _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] + _impl_cls: Type[TDltSourceImpl] = DltSource, # type: ignore[assignment] ) -> Any: """A decorator that transforms a function returning one or more `dlt resources` into a `dlt source` in order to load it with `dlt`. @@ -129,7 +167,9 @@ def source( `DltSource` instance """ if name and schema: - raise ArgumentsOverloadException("'name' has no effect when `schema` argument is present", source.__name__) + raise ArgumentsOverloadException( + "'name' has no effect when `schema` argument is present", source.__name__ + ) def decorator(f: Callable[TSourceFunParams, Any]) -> Callable[TSourceFunParams, TDltSourceImpl]: nonlocal schema, name @@ -164,7 +204,13 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: # configurations will be accessed in this section in the source proxy = Container()[PipelineContext] pipeline_name = None if not proxy.is_active() else proxy.pipeline().pipeline_name - with inject_section(ConfigSectionContext(pipeline_name=pipeline_name, sections=source_sections, source_state_key=schema.name)): + with inject_section( + ConfigSectionContext( + pipeline_name=pipeline_name, + sections=source_sections, + source_state_key=schema.name, + ) + ): rv = conf_f(*args, **kwargs) if rv is None: raise SourceDataIsNone(schema.name) @@ -182,7 +228,6 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: s.root_key = root_key return s - # get spec for wrapped function SPEC = get_fun_spec(conf_f) # store the source information @@ -212,9 +257,9 @@ def resource( schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, - spec: Type[BaseConfiguration] = None -) -> DltResource: - ... + spec: Type[BaseConfiguration] = None, +) -> DltResource: ... + @overload def resource( @@ -229,9 +274,9 @@ def resource( schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, - spec: Type[BaseConfiguration] = None -) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: - ... + spec: Type[BaseConfiguration] = None, +) -> Callable[[Callable[TResourceFunParams, Any]], DltResource]: ... 
+ @overload def resource( @@ -247,9 +292,8 @@ def resource( table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: Literal[True] = True -) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: - ... + standalone: Literal[True] = True, +) -> Callable[[Callable[TResourceFunParams, Any]], Callable[TResourceFunParams, DltResource]]: ... @overload @@ -265,9 +309,8 @@ def resource( schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, - spec: Type[BaseConfiguration] = None -) -> DltResource: - ... + spec: Type[BaseConfiguration] = None, +) -> DltResource: ... def resource( @@ -348,7 +391,10 @@ def resource( Returns: DltResource instance which may be loaded, iterated or combined with other resources into a pipeline. """ - def make_resource(_name: str, _section: str, _data: Any, incremental: IncrementalResourceWrapper = None) -> DltResource: + + def make_resource( + _name: str, _section: str, _data: Any, incremental: IncrementalResourceWrapper = None + ) -> DltResource: table_template = DltResource.new_table_template( table_name, write_disposition=write_disposition, @@ -356,16 +402,27 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa primary_key=primary_key, merge_key=merge_key, schema_contract=schema_contract, - table_format=table_format + table_format=table_format, + ) + return DltResource.from_data( + _data, + _name, + _section, + table_template, + selected, + cast(DltResource, data_from), + incremental=incremental, ) - return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, data_from), incremental=incremental) - - def decorator(f: Callable[TResourceFunParams, Any]) -> Callable[TResourceFunParams, DltResource]: + def decorator( + f: Callable[TResourceFunParams, Any] + ) -> Callable[TResourceFunParams, DltResource]: if not callable(f): if data_from: # raise more descriptive exception if we construct transformer - raise InvalidTransformerDataTypeGeneratorFunctionRequired(name or "", f, type(f)) + raise InvalidTransformerDataTypeGeneratorFunctionRequired( + name or "", f, type(f) + ) raise ResourceFunctionExpected(name or "", f, type(f)) if not standalone and callable(name): raise DynamicNameNotStandaloneResource(get_callable_name(f)) @@ -391,7 +448,10 @@ def decorator(f: Callable[TResourceFunParams, Any]) -> Callable[TResourceFunPara # for autogenerated spec do not include defaults conf_f = with_config( incr_f, - spec=spec, sections=resource_sections, sections_merge_style=ConfigSectionContext.resource_merge_style, include_defaults=spec is not None + spec=spec, + sections=resource_sections, + sections_merge_style=ConfigSectionContext.resource_merge_style, + include_defaults=spec is not None, ) is_inner_resource = is_inner_callable(f) if conf_f != incr_f and is_inner_resource and not standalone: @@ -412,13 +472,21 @@ def decorator(f: Callable[TResourceFunParams, Any]) -> Callable[TResourceFunPara @wraps(conf_f) def _wrap(*args: Any, **kwargs: Any) -> DltResource: _, mod_sig, bound_args = simulate_func_call(conf_f, skip_args, *args, **kwargs) - actual_resource_name = name(bound_args.arguments) if callable(name) else resource_name - r = make_resource(actual_resource_name, source_section, compat_wrapper(actual_resource_name, conf_f, sig, *args, **kwargs), incremental) + actual_resource_name = ( + name(bound_args.arguments) if 
callable(name) else resource_name + ) + r = make_resource( + actual_resource_name, + source_section, + compat_wrapper(actual_resource_name, conf_f, sig, *args, **kwargs), + incremental, + ) # consider transformer arguments bound r._args_bound = True # keep explicit args passed r._set_explicit_args(conf_f, mod_sig, *args, **kwargs) return r + return _wrap else: return make_resource(resource_name, source_section, conf_f, incremental) @@ -453,9 +521,9 @@ def transformer( primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, - spec: Type[BaseConfiguration] = None -) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], DltResource]: - ... + spec: Type[BaseConfiguration] = None, +) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], DltResource]: ... + @overload def transformer( @@ -470,9 +538,12 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: Literal[True] = True -) -> Callable[[Callable[Concatenate[TDataItem, TResourceFunParams], Any]], Callable[TResourceFunParams, DltResource]]: - ... + standalone: Literal[True] = True, +) -> Callable[ + [Callable[Concatenate[TDataItem, TResourceFunParams], Any]], + Callable[TResourceFunParams, DltResource], +]: ... + @overload def transformer( @@ -486,9 +557,9 @@ def transformer( primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, - spec: Type[BaseConfiguration] = None -) -> DltResource: - ... + spec: Type[BaseConfiguration] = None, +) -> DltResource: ... + @overload def transformer( @@ -503,9 +574,9 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: Literal[True] = True -) -> Callable[TResourceFunParams, DltResource]: - ... + standalone: Literal[True] = True, +) -> Callable[TResourceFunParams, DltResource]: ... + def transformer( f: Optional[Callable[Concatenate[TDataItem, TResourceFunParams], Any]] = None, @@ -519,7 +590,7 @@ def transformer( merge_key: TTableHintTemplate[TColumnNames] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, - standalone: bool = False + standalone: bool = False, ) -> Any: """A form of `dlt resource` that takes input from other resources via `data_from` argument in order to enrich or transform the data. @@ -576,7 +647,10 @@ def transformer( standalone (bool, optional): Returns a wrapped decorated function that creates DltResource instance. Must be called before use. Cannot be part of a source. """ if isinstance(f, DltResource): - raise ValueError("Please pass `data_from=` argument as keyword argument. The only positional argument to transformer is the decorated function") + raise ValueError( + "Please pass `data_from=` argument as keyword argument. 
The only positional argument to" + " transformer is the decorated function" + ) return resource( # type: ignore f, @@ -589,7 +663,7 @@ def transformer( selected=selected, spec=spec, standalone=standalone, - data_from=data_from + data_from=data_from, ) @@ -628,12 +702,14 @@ def get_source_schema() -> Schema: TDeferredFunParams = ParamSpec("TDeferredFunParams") -def defer(f: Callable[TDeferredFunParams, TBoundItems]) -> Callable[TDeferredFunParams, TDeferred[TBoundItems]]: - +def defer( + f: Callable[TDeferredFunParams, TBoundItems] +) -> Callable[TDeferredFunParams, TDeferred[TBoundItems]]: @wraps(f) def _wrap(*args: Any, **kwargs: Any) -> TDeferred[TBoundItems]: def _curry() -> TBoundItems: return f(*args, **kwargs) + return _curry return _wrap diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index 351b85a9d8..8e7d0dddf8 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -42,7 +42,11 @@ def __init__(self, pipe_name: str, has_parent: bool) -> None: self.pipe_name = pipe_name self.has_parent = has_parent if has_parent: - msg = f"A pipe created from transformer {pipe_name} is unbound or its parent is unbound or empty. Provide a resource in `data_from` argument or bind resources with | operator." + msg = ( + f"A pipe created from transformer {pipe_name} is unbound or its parent is unbound" + " or empty. Provide a resource in `data_from` argument or bind resources with |" + " operator." + ) else: msg = "Pipe is empty and does not have a resource at its head" super().__init__(pipe_name, msg) @@ -52,21 +56,41 @@ class InvalidStepFunctionArguments(PipeException): def __init__(self, pipe_name: str, func_name: str, sig: Signature, call_error: str) -> None: self.func_name = func_name self.sig = sig - super().__init__(pipe_name, f"Unable to call {func_name}: {call_error}. The mapping/filtering function {func_name} requires first argument to take data item and optional second argument named 'meta', but the signature is {sig}") + super().__init__( + pipe_name, + f"Unable to call {func_name}: {call_error}. The mapping/filtering function" + f" {func_name} requires first argument to take data item and optional second argument" + f" named 'meta', but the signature is {sig}", + ) class ResourceExtractionError(PipeException): def __init__(self, pipe_name: str, gen: Any, msg: str, kind: str) -> None: self.msg = msg self.kind = kind - self.func_name = gen.__name__ if isgenerator(gen) else get_callable_name(gen) if callable(gen) else str(gen) - super().__init__(pipe_name, f"extraction of resource {pipe_name} in {kind} {self.func_name} caused an exception: {msg}") + self.func_name = ( + gen.__name__ + if isgenerator(gen) + else get_callable_name(gen) if callable(gen) else str(gen) + ) + super().__init__( + pipe_name, + f"extraction of resource {pipe_name} in {kind} {self.func_name} caused an exception:" + f" {msg}", + ) class PipeGenInvalid(PipeException): def __init__(self, pipe_name: str, gen: Any) -> None: - msg = "A pipe generator element must be an Iterator (ie. list or generator function). Generator element is typically created from a `data` argument to pipeline.run or extract method." - msg += "dlt will evaluate functions that were passed as data argument. If you passed a function the returned data type is not iterable. " + msg = ( + "A pipe generator element must be an Iterator (ie. list or generator function)." + " Generator element is typically created from a `data` argument to pipeline.run or" + " extract method." 
+ ) + msg += ( + "dlt will evaluate functions that were passed as data argument. If you passed a" + " function the returned data type is not iterable. " + ) type_name = str(type(gen)) msg += f" Generator type is {type_name}." if "DltSource" in type_name: @@ -79,13 +103,21 @@ def __init__(self, pipe_name: str, gen: Any) -> None: class ResourceNameMissing(DltResourceException): def __init__(self) -> None: - super().__init__(None, """Resource name is missing. If you create a resource directly from data ie. from a list you must pass the name explicitly in `name` argument. - Please note that for resources created from functions or generators, the name is the function name by default.""") + super().__init__( + None, + """Resource name is missing. If you create a resource directly from data ie. from a list you must pass the name explicitly in `name` argument. + Please note that for resources created from functions or generators, the name is the function name by default.""", + ) class DynamicNameNotStandaloneResource(DltResourceException): def __init__(self, resource_name: str) -> None: - super().__init__(resource_name, "You must set the resource as standalone to be able to dynamically set its name based on call arguments") + super().__init__( + resource_name, + "You must set the resource as standalone to be able to dynamically set its name based" + " on call arguments", + ) + # class DependentResourceIsNotCallable(DltResourceException): # def __init__(self, resource_name: str) -> None: @@ -93,42 +125,81 @@ def __init__(self, resource_name: str) -> None: class ResourceNotFoundError(DltResourceException, KeyError): - def __init__(self, resource_name: str, context: str) -> None: - self.resource_name = resource_name - super().__init__(resource_name, f"Resource with a name {resource_name} could not be found. {context}") + def __init__(self, resource_name: str, context: str) -> None: + self.resource_name = resource_name + super().__init__( + resource_name, f"Resource with a name {resource_name} could not be found. {context}" + ) class InvalidResourceDataType(DltResourceException): def __init__(self, resource_name: str, item: Any, _typ: Type[Any], msg: str) -> None: self.item = item self._typ = _typ - super().__init__(resource_name, f"Cannot create resource {resource_name} from specified data. If you want to process just one data item, enclose it in a list. " + msg) + super().__init__( + resource_name, + f"Cannot create resource {resource_name} from specified data. If you want to process" + " just one data item, enclose it in a list. " + + msg, + ) class InvalidResourceDataTypeAsync(InvalidResourceDataType): - def __init__(self, resource_name: str, item: Any,_typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, "Async iterators and generators are not valid resources. Please use standard iterators and generators that yield Awaitables instead (for example by yielding from async function without await") + def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: + super().__init__( + resource_name, + item, + _typ, + "Async iterators and generators are not valid resources. 
Please use standard iterators" + " and generators that yield Awaitables instead (for example by yielding from async" + " function without await", + ) class InvalidResourceDataTypeBasic(InvalidResourceDataType): - def __init__(self, resource_name: str, item: Any,_typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, f"Resources cannot be strings or dictionaries but {_typ.__name__} was provided. Please pass your data in a list or as a function yielding items. If you want to process just one data item, enclose it in a list.") + def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: + super().__init__( + resource_name, + item, + _typ, + f"Resources cannot be strings or dictionaries but {_typ.__name__} was provided. Please" + " pass your data in a list or as a function yielding items. If you want to process" + " just one data item, enclose it in a list.", + ) class InvalidResourceDataTypeFunctionNotAGenerator(InvalidResourceDataType): - def __init__(self, resource_name: str, item: Any,_typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, "Please make sure that function decorated with @dlt.resource uses 'yield' to return the data.") + def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: + super().__init__( + resource_name, + item, + _typ, + "Please make sure that function decorated with @dlt.resource uses 'yield' to return the" + " data.", + ) class InvalidResourceDataTypeMultiplePipes(InvalidResourceDataType): - def __init__(self, resource_name: str, item: Any,_typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, "Resources with multiple parallel data pipes are not yet supported. This problem most often happens when you are creating a source with @dlt.source decorator that has several resources with the same name.") + def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: + super().__init__( + resource_name, + item, + _typ, + "Resources with multiple parallel data pipes are not yet supported. This problem most" + " often happens when you are creating a source with @dlt.source decorator that has" + " several resources with the same name.", + ) class InvalidTransformerDataTypeGeneratorFunctionRequired(InvalidResourceDataType): def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, - "Transformer must be a function decorated with @dlt.transformer that takes data item as its first argument. Only first argument may be 'positional only'.") + super().__init__( + resource_name, + item, + _typ, + "Transformer must be a function decorated with @dlt.transformer that takes data item as" + " its first argument. Only first argument may be 'positional only'.", + ) class InvalidTransformerGeneratorFunction(DltResourceException): @@ -150,29 +221,57 @@ def __init__(self, resource_name: str, func_name: str, sig: Signature, code: int class ResourceInnerCallableConfigWrapDisallowed(DltResourceException): def __init__(self, resource_name: str, section: str) -> None: self.section = section - msg = f"Resource {resource_name} in section {section} is defined over an inner function and requests config/secrets in its arguments. Requesting secret and config values via 'dlt.secrets.values' or 'dlt.config.value' is disallowed for resources that are inner functions. Use the dlt.source to get the required configuration and pass them explicitly to your source." 
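# Illustrative sketch, not taken from this patch: the pattern this exception points to,
# i.e. resolve config/secrets at the source and pass them to inner resources explicitly.
# Source, resource and argument names are hypothetical.
import dlt

@dlt.source
def my_source(api_key: str = dlt.secrets.value):
    # secrets are injected here, at the source level
    @dlt.resource
    def items():
        # the inner resource uses the captured `api_key` instead of requesting
        # `dlt.secrets.value` in its own signature
        yield {"id": 1, "authorized": bool(api_key)}

    return items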
+ msg = ( + f"Resource {resource_name} in section {section} is defined over an inner function and" + " requests config/secrets in its arguments. Requesting secret and config values via" + " 'dlt.secrets.values' or 'dlt.config.value' is disallowed for resources that are" + " inner functions. Use the dlt.source to get the required configuration and pass them" + " explicitly to your source." + ) super().__init__(resource_name, msg) class InvalidResourceDataTypeIsNone(InvalidResourceDataType): def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, "Resource data missing. Did you forget the return statement in @dlt.resource decorated function?") + super().__init__( + resource_name, + item, + _typ, + "Resource data missing. Did you forget the return statement in @dlt.resource decorated" + " function?", + ) class ResourceFunctionExpected(InvalidResourceDataType): def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, f"Expected function or callable as first parameter to resource {resource_name} but {_typ.__name__} found. Please decorate a function with @dlt.resource") + super().__init__( + resource_name, + item, + _typ, + f"Expected function or callable as first parameter to resource {resource_name} but" + f" {_typ.__name__} found. Please decorate a function with @dlt.resource", + ) class InvalidParentResourceDataType(InvalidResourceDataType): - def __init__(self, resource_name: str, item: Any,_typ: Type[Any]) -> None: - super().__init__(resource_name, item, _typ, f"A parent resource of {resource_name} is of type {_typ.__name__}. Did you forget to use '@dlt.resource` decorator or `resource` function?") + def __init__(self, resource_name: str, item: Any, _typ: Type[Any]) -> None: + super().__init__( + resource_name, + item, + _typ, + f"A parent resource of {resource_name} is of type {_typ.__name__}. Did you forget to" + " use '@dlt.resource` decorator or `resource` function?", + ) class InvalidParentResourceIsAFunction(DltResourceException): def __init__(self, resource_name: str, func_name: str) -> None: self.func_name = func_name - super().__init__(resource_name, f"A data source {func_name} of a transformer {resource_name} is an undecorated function. Please decorate it with '@dlt.resource' or pass to 'resource' function.") + super().__init__( + resource_name, + f"A data source {func_name} of a transformer {resource_name} is an undecorated" + " function. Please decorate it with '@dlt.resource' or pass to 'resource' function.", + ) class DeletingResourcesNotSupported(DltResourceException): @@ -181,10 +280,16 @@ def __init__(self, source_name: str, resource_name: str) -> None: class ParametrizedResourceUnbound(DltResourceException): - def __init__(self, resource_name: str, func_name: str, sig: Signature, kind: str, error: str) -> None: + def __init__( + self, resource_name: str, func_name: str, sig: Signature, kind: str, error: str + ) -> None: self.func_name = func_name self.sig = sig - msg = f"The {kind} {resource_name} is parametrized and expects following arguments: {sig}. Did you forget to bind the {func_name} function? For example from `source.{resource_name}.bind(...)" + msg = ( + f"The {kind} {resource_name} is parametrized and expects following arguments: {sig}." + f" Did you forget to bind the {func_name} function? 
For example from" + f" `source.{resource_name}.bind(...)" + ) if error: msg += f" .Details: {error}" super().__init__(resource_name, msg) @@ -197,7 +302,9 @@ def __init__(self, resource_name: str, msg: str) -> None: class TableNameMissing(DltSourceException): def __init__(self) -> None: - super().__init__("""Table name is missing in table template. Please provide a string or a function that takes a data item as an argument""") + super().__init__( + """Table name is missing in table template. Please provide a string or a function that takes a data item as an argument""" + ) class InconsistentTableTemplate(DltSourceException): @@ -208,29 +315,43 @@ def __init__(self, reason: str) -> None: class DataItemRequiredForDynamicTableHints(DltResourceException): def __init__(self, resource_name: str) -> None: - super().__init__(resource_name, f"""An instance of resource's data required to generate table schema in resource {resource_name}. - One of table hints for that resource (typically table name) is a function and hint is computed separately for each instance of data extracted from that resource.""") + super().__init__( + resource_name, + f"""An instance of resource's data required to generate table schema in resource {resource_name}. + One of table hints for that resource (typically table name) is a function and hint is computed separately for each instance of data extracted from that resource.""", + ) class SourceDataIsNone(DltSourceException): def __init__(self, source_name: str) -> None: self.source_name = source_name - super().__init__(f"No data returned or yielded from source function {source_name}. Did you forget the return statement?") + super().__init__( + f"No data returned or yielded from source function {source_name}. Did you forget the" + " return statement?" + ) class SourceExhausted(DltSourceException): def __init__(self, source_name: str) -> None: self.source_name = source_name - super().__init__(f"Source {source_name} is exhausted or has active iterator. You can iterate or pass the source to dlt pipeline only once.") + super().__init__( + f"Source {source_name} is exhausted or has active iterator. You can iterate or pass the" + " source to dlt pipeline only once." + ) class ResourcesNotFoundError(DltSourceException): - def __init__(self, source_name: str, available_resources: Set[str], requested_resources: Set[str]) -> None: + def __init__( + self, source_name: str, available_resources: Set[str], requested_resources: Set[str] + ) -> None: self.source_name = source_name self.available_resources = available_resources self.requested_resources = requested_resources self.not_found_resources = requested_resources.difference(available_resources) - msg = f"The following resources could not be found in source {source_name}: {self.not_found_resources}. Available resources are: {available_resources}" + msg = ( + f"The following resources could not be found in source {source_name}:" + f" {self.not_found_resources}. Available resources are: {available_resources}" + ) super().__init__(msg) @@ -239,28 +360,48 @@ def __init__(self, source_name: str, item: Any, _typ: Type[Any]) -> None: self.source_name = source_name self.item = item self.typ = _typ - super().__init__(f"First parameter to the source {source_name} must be a function or callable but is {_typ.__name__}. Please decorate a function with @dlt.source") + super().__init__( + f"First parameter to the source {source_name} must be a function or callable but is" + f" {_typ.__name__}. 
Please decorate a function with @dlt.source" + ) class SourceIsAClassTypeError(DltSourceException): - def __init__(self, source_name: str, _typ: Type[Any]) -> None: + def __init__(self, source_name: str, _typ: Type[Any]) -> None: self.source_name = source_name self.typ = _typ - super().__init__(f"First parameter to the source {source_name} is a class {_typ.__name__}. Do not decorate classes with @dlt.source. Instead implement __call__ in your class and pass instance of such class to dlt.source() directly") + super().__init__( + f"First parameter to the source {source_name} is a class {_typ.__name__}. Do not" + " decorate classes with @dlt.source. Instead implement __call__ in your class and pass" + " instance of such class to dlt.source() directly" + ) class SourceSchemaNotAvailable(DltSourceException): def __init__(self) -> None: - super().__init__("Current source schema is available only when called from a function decorated with dlt.source or dlt.resource") + super().__init__( + "Current source schema is available only when called from a function decorated with" + " dlt.source or dlt.resource" + ) class ExplicitSourceNameInvalid(DltSourceException): def __init__(self, source_name: str, schema_name: str) -> None: self.source_name = source_name self.schema_name = schema_name - super().__init__(f"Your explicit source name {source_name} is not a valid schema name. Please use a valid schema name ie. '{schema_name}'.") + super().__init__( + f"Your explicit source name {source_name} is not a valid schema name. Please use a" + f" valid schema name ie. '{schema_name}'." + ) class IncrementalUnboundError(DltResourceException): def __init__(self, cursor_path: str) -> None: - super().__init__("", f"The incremental definition with cursor path {cursor_path} is used without being bound to the resource. This most often happens when you create dynamic resource from a generator function that uses incremental. See https://dlthub.com/docs/general-usage/incremental-loading#incremental-loading-with-last-value for an example.") + super().__init__( + "", + f"The incremental definition with cursor path {cursor_path} is used without being bound" + " to the resource. This most often happens when you create dynamic resource from a" + " generator function that uses incremental. 
See" + " https://dlthub.com/docs/general-usage/incremental-loading#incremental-loading-with-last-value" + " for an example.", + ) diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 1276f1b1f5..6205c482f6 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -27,7 +27,7 @@ def extract( *, max_parallel_items: int = None, workers: int = None, - futures_poll_interval: float = None + futures_poll_interval: float = None, ) -> None: schema = source.schema resources_with_items: Set[str] = set() @@ -37,17 +37,21 @@ def extract( ), "arrow": ArrowExtractor( extract_id, storage, schema, resources_with_items, collector=collector - ) + ), } last_item_format: Optional[TLoaderFileFormat] = None with collector(f"Extract {source.name}"): # yield from all selected pipes - with PipeIterator.from_pipes(source.resources.selected_pipes, max_parallel_items=max_parallel_items, workers=workers, futures_poll_interval=futures_poll_interval) as pipes: + with PipeIterator.from_pipes( + source.resources.selected_pipes, + max_parallel_items=max_parallel_items, + workers=workers, + futures_poll_interval=futures_poll_interval, + ) as pipes: left_gens = total_gens = len(pipes._sources) collector.update("Resources", 0, total_gens) for pipe_item in pipes: - curr_gens = len(pipes._sources) if left_gens > curr_gens: delta = left_gens - curr_gens @@ -58,7 +62,9 @@ def extract( resource = source.resources[pipe_item.pipe.name] # Fallback to last item's format or default (puae-jsonl) if the current item is an empty list - item_format = Extractor.item_format(pipe_item.item) or last_item_format or "puae-jsonl" + item_format = ( + Extractor.item_format(pipe_item.item) or last_item_format or "puae-jsonl" + ) extractors[item_format].write_items(resource, pipe_item.item, pipe_item.meta) last_item_format = item_format @@ -94,12 +100,24 @@ def extract_with_schema( extract_id = storage.create_extract_id() with Container().injectable_context(SourceSchemaInjectableContext(source.schema)): # inject the config section with the current source name - with inject_section(ConfigSectionContext(sections=(known_sections.SOURCES, source.section, source.name), source_state_key=source.name)): + with inject_section( + ConfigSectionContext( + sections=(known_sections.SOURCES, source.section, source.name), + source_state_key=source.name, + ) + ): # reset resource states, the `extracted` list contains all the explicit resources and all their parents for resource in source.resources.extracted.values(): with contextlib.suppress(DataItemRequiredForDynamicTableHints): if resource.write_disposition == "replace": reset_resource_state(resource.name) - extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) + extract( + extract_id, + source, + storage, + collector, + max_parallel_items=max_parallel_items, + workers=workers, + ) return extract_id diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py index 0ec8aed968..f7159d478c 100644 --- a/dlt/extract/extractors.py +++ b/dlt/extract/extractors.py @@ -11,11 +11,18 @@ from dlt.common.utils import update_dict_nested from dlt.common.typing import TDataItems, TDataItem from dlt.common.schema import Schema, utils -from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns, TPartialTableSchema +from dlt.common.schema.typing import ( + TSchemaContractDict, + TSchemaEvolutionMode, + TTableSchema, + TTableSchemaColumns, + TPartialTableSchema, +) from dlt.extract.resource import 
DltResource from dlt.extract.typing import TableNameMeta from dlt.extract.storage import ExtractorStorage, ExtractorItemStorage + try: from dlt.common.libs import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem @@ -37,14 +44,14 @@ class ExtractorConfiguration(BaseConfiguration): @with_config(spec=ExtractorConfiguration) def __init__( - self, - extract_id: str, - storage: ExtractorStorage, - schema: Schema, - resources_with_items: Set[str], - collector: Collector = NULL_COLLECTOR, - *, - _caps: DestinationCapabilitiesContext = None + self, + extract_id: str, + storage: ExtractorStorage, + schema: Schema, + resources_with_items: Set[str], + collector: Collector = NULL_COLLECTOR, + *, + _caps: DestinationCapabilitiesContext = None, ) -> None: self.schema = schema self.naming = schema.naming @@ -74,7 +81,7 @@ def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: if (pyarrow and pyarrow.is_arrow_item(item)) or (pd and isinstance(item, pd.DataFrame)): return "arrow" return "puae-jsonl" - return None # Empty list is unknown format + return None # Empty list is unknown format def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: """Write `items` to `resource` optionally computing table schemas and revalidating/filtering data""" @@ -101,8 +108,16 @@ def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[s def _get_dynamic_table_name(self, resource: DltResource, item: TDataItem) -> str: return self.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - new_rows_count = self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) + def _write_item( + self, + table_name: str, + resource_name: str, + items: TDataItems, + columns: TTableSchemaColumns = None, + ) -> None: + new_rows_count = self.storage.write_data_item( + self.extract_id, self.schema.name, table_name, items, columns + ) self.collector.update(table_name, inc=new_rows_count) self.resources_with_items.add(resource_name) @@ -120,7 +135,9 @@ def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> N if table_name not in self._filtered_tables: self._write_item(table_name, resource.name, item) - def _write_to_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: + def _write_to_static_table( + self, resource: DltResource, table_name: str, items: TDataItems + ) -> None: if table_name not in self._table_contracts: items = self._compute_and_update_table(resource, table_name, items) if table_name not in self._filtered_tables: @@ -128,11 +145,11 @@ def _write_to_static_table(self, resource: DltResource, table_name: str, items: def _compute_table(self, resource: DltResource, items: TDataItems) -> TTableSchema: """Computes a schema for a new or dynamic table and normalizes identifiers""" - return self.schema.normalize_table_identifiers( - resource.compute_table_schema(items) - ) + return self.schema.normalize_table_identifiers(resource.compute_table_schema(items)) - def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems: + def _compute_and_update_table( + self, resource: DltResource, table_name: str, items: TDataItems + ) -> TDataItems: """ Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written """ @@ -141,8 
+158,7 @@ def _compute_and_update_table(self, resource: DltResource, table_name: str, item computed_table["name"] = table_name # get or compute contract schema_contract = self._table_contracts.setdefault( - table_name, - self.schema.resolve_contract_settings_for_table(table_name, computed_table) + table_name, self.schema.resolve_contract_settings_for_table(table_name, computed_table) ) # this is a new table so allow evolve once @@ -155,7 +171,9 @@ def _compute_and_update_table(self, resource: DltResource, table_name: str, item diff_table = computed_table # apply contracts - diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table, data_item=items) + diff_table, filters = self.schema.apply_schema_contract( + schema_contract, diff_table, data_item=items + ) # merge with schema table if diff_table: @@ -184,15 +202,24 @@ def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> No items = [ # 3. remove columns and rows in data contract filters # 2. Remove null-type columns from the table(s) as they can't be loaded - self._apply_contract_filters(pyarrow.remove_null_columns(tbl), resource, static_table_name) for tbl in ( - # 1. Convert pandas frame(s) to arrow Table - pa.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item + self._apply_contract_filters( + pyarrow.remove_null_columns(tbl), resource, static_table_name + ) + for tbl in ( + ( + # 1. Convert pandas frame(s) to arrow Table + pa.Table.from_pandas(item) + if (pd and isinstance(item, pd.DataFrame)) + else item + ) for item in (items if isinstance(items, list) else [items]) ) ] super().write_items(resource, items, meta) - def _apply_contract_filters(self, item: "TAnyArrowItem", resource: DltResource, static_table_name: Optional[str]) -> "TAnyArrowItem": + def _apply_contract_filters( + self, item: "TAnyArrowItem", resource: DltResource, static_table_name: Optional[str] + ) -> "TAnyArrowItem": """Removes the columns (discard value) or rows (discard rows) as indicated by contract filters.""" # convert arrow schema names into normalized names rename_mapping = pyarrow.get_normalized_arrow_fields_mapping(item, self.naming) @@ -204,7 +231,9 @@ def _apply_contract_filters(self, item: "TAnyArrowItem", resource: DltResource, # create a mask where rows will be False if any of the specified columns are non-null mask = None rev_mapping = {v: k for k, v in rename_mapping.items()} - for column in [name for name, mode in filtered_columns.items() if mode == "discard_row"]: + for column in [ + name for name, mode in filtered_columns.items() if mode == "discard_row" + ]: is_null = pyarrow.pyarrow.compute.is_null(item[rev_mapping[column]]) mask = is_null if mask is None else pyarrow.pyarrow.compute.and_(mask, is_null) # filter the table using the mask @@ -213,16 +242,29 @@ def _apply_contract_filters(self, item: "TAnyArrowItem", resource: DltResource, # remove value actually removes the whole columns from the table # NOTE: filtered columns has normalized column names so we need to go through mapping - removed_columns = [name for name in rename_mapping if filtered_columns.get(rename_mapping[name]) is not None] + removed_columns = [ + name + for name in rename_mapping + if filtered_columns.get(rename_mapping[name]) is not None + ] if removed_columns: item = pyarrow.remove_columns(item, removed_columns) return item - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: + def _write_item( + self, + table_name: str, + 
resource_name: str, + items: TDataItems, + columns: TTableSchemaColumns = None, + ) -> None: columns = columns or self.schema.tables[table_name]["columns"] # Note: `items` is always a list here due to the conversion in `write_table` - items = [pyarrow.normalize_py_arrow_schema(item, columns, self.naming, self._caps) for item in items] + items = [ + pyarrow.normalize_py_arrow_schema(item, columns, self.naming, self._caps) + for item in items + ] super()._write_item(table_name, resource_name, items, columns) def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTableSchema: @@ -235,11 +277,15 @@ def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTa # normalize arrow table before merging arrow_table = self.schema.normalize_table_identifiers(arrow_table) # we must override the columns to preserve the order in arrow table - arrow_table["columns"] = update_dict_nested(arrow_table["columns"], computed_table["columns"]) + arrow_table["columns"] = update_dict_nested( + arrow_table["columns"], computed_table["columns"] + ) return arrow_table - def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems: + def _compute_and_update_table( + self, resource: DltResource, table_name: str, items: TDataItems + ) -> TDataItems: items = super()._compute_and_update_table(resource, table_name, items) # filter data item as filters could be updated in compute table items = [self._apply_contract_filters(item, resource, table_name) for item in items] diff --git a/dlt/extract/hints.py b/dlt/extract/hints.py index 19d503f970..140af65ac6 100644 --- a/dlt/extract/hints.py +++ b/dlt/extract/hints.py @@ -2,14 +2,29 @@ from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat, TSchemaContract +from dlt.common.schema.typing import ( + TColumnNames, + TColumnProp, + TColumnSchema, + TPartialTableSchema, + TTableSchema, + TTableSchemaColumns, + TWriteDisposition, + TAnySchemaColumns, + TTableFormat, + TSchemaContract, +) from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys from dlt.extract.incremental import Incremental from dlt.extract.typing import TFunHintTemplate, TTableHintTemplate, ValidateItem -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, TableNameMissing +from dlt.extract.exceptions import ( + DataItemRequiredForDynamicTableHints, + InconsistentTableTemplate, + TableNameMissing, +) from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint from dlt.extract.validation import create_item_validator @@ -48,7 +63,11 @@ def table_name(self) -> TTableHintTemplate[str]: if self._table_name_hint_fun: return self._table_name_hint_fun # get table name or default name - return self._table_schema_template.get("name") or self.name if self._table_schema_template else self.name + return ( + self._table_schema_template.get("name") or self.name + if self._table_schema_template + else self.name + ) @table_name.setter def table_name(self, value: TTableHintTemplate[str]) -> None: @@ -56,7 +75,10 @@ def table_name(self, value: TTableHintTemplate[str]) -> None: @property def 
write_disposition(self) -> TTableHintTemplate[TWriteDisposition]: - if self._table_schema_template is None or self._table_schema_template.get("write_disposition") is None: + if ( + self._table_schema_template is None + or self._table_schema_template.get("write_disposition") is None + ): return DEFAULT_WRITE_DISPOSITION return self._table_schema_template.get("write_disposition") @@ -75,7 +97,7 @@ def columns(self) -> TTableHintTemplate[TTableSchemaColumns]: def schema_contract(self) -> TTableHintTemplate[TSchemaContract]: return self._table_schema_template.get("schema_contract") - def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: + def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: """Computes the table schema based on hints and column definitions passed during resource creation. `item` parameter is used to resolve table hints based on data""" if not self._table_schema_template: return new_table(self.name, resource=self.name) @@ -113,20 +135,28 @@ def apply_hints( ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. - This method accepts the same table hints arguments as `dlt.resource` decorator with the following additions. - Skip the argument or pass None to leave the existing hint. - Pass empty value (for particular type ie "" for a string) to remove hint + This method accepts the same table hints arguments as `dlt.resource` decorator with the following additions. + Skip the argument or pass None to leave the existing hint. + Pass empty value (for particular type ie "" for a string) to remove hint - parent_table_name (str, optional): A name of parent table if foreign relation is defined. Please note that if you use merge you must define `root_key` columns explicitly - incremental (Incremental, optional): Enables the incremental loading for a resource. + parent_table_name (str, optional): A name of parent table if foreign relation is defined. Please note that if you use merge you must define `root_key` columns explicitly + incremental (Incremental, optional): Enables the incremental loading for a resource. - Please note that for efficient incremental loading, the resource must be aware of the Incremental by accepting it as one if its arguments and then using is to skip already loaded data. - In non-aware resources, `dlt` will filter out the loaded values, however the resource will yield all the values again. + Please note that for efficient incremental loading, the resource must be aware of the Incremental by accepting it as one if its arguments and then using is to skip already loaded data. + In non-aware resources, `dlt` will filter out the loaded values, however the resource will yield all the values again. 
""" t = None if not self._table_schema_template: # if there's no template yet, create and set new one - t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract) + t = self.new_table_template( + table_name, + parent_table_name, + write_disposition, + columns, + primary_key, + merge_key, + schema_contract, + ) else: # set single hints t = self._clone_table_template(self._table_schema_template) @@ -173,7 +203,9 @@ def apply_hints( t.pop("schema_contract", None) # recreate validator if columns definition or contract changed if schema_contract is not None or columns is not None: - t["validator"], schema_contract = create_item_validator(t.get("original_columns"), t.get("schema_contract")) + t["validator"], schema_contract = create_item_validator( + t.get("original_columns"), t.get("schema_contract") + ) if schema_contract is not None: t["schema_contract"] = schema_contract @@ -194,7 +226,9 @@ def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: else: self._table_name_hint_fun = None # check if any other hints in the table template should be inferred from data - self._table_has_other_dynamic_hints = any(callable(v) for k, v in table_schema_template.items() if k != "name") + self._table_has_other_dynamic_hints = any( + callable(v) for k, v in table_schema_template.items() if k != "name" + ) self._table_schema_template = table_schema_template @staticmethod @@ -246,7 +280,7 @@ def new_table_template( primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, schema_contract: TTableHintTemplate[TSchemaContract] = None, - table_format: TTableHintTemplate[TTableFormat] = None + table_format: TTableHintTemplate[TTableFormat] = None, ) -> TTableSchemaTemplate: validator, schema_contract = create_item_validator(columns, schema_contract) clean_columns = columns @@ -261,7 +295,7 @@ def new_table_template( write_disposition=write_disposition, # type: ignore columns=clean_columns, # type: ignore schema_contract=schema_contract, # type: ignore - table_format=table_format # type: ignore + table_format=table_format, # type: ignore ) if not table_name: new_template.pop("name") @@ -283,5 +317,11 @@ def new_table_template( def validate_dynamic_hints(template: TTableSchemaTemplate) -> None: table_name = template.get("name") # if any of the hints is a function then name must be as well - if any(callable(v) for k, v in template.items() if k not in ["name", "incremental", "validator", "original_columns"]) and not callable(table_name): - raise InconsistentTableTemplate(f"Table name {table_name} must be a function if any other table hint is a function") + if any( + callable(v) + for k, v in template.items() + if k not in ["name", "incremental", "validator", "original_columns"] + ) and not callable(table_name): + raise InconsistentTableTemplate( + f"Table name {table_name} must be a function if any other table hint is a function" + ) diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 1c5fa7ab38..947d51ce53 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -12,19 +12,38 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common import pendulum, logger from dlt.common.jsonpath import compile_path -from dlt.common.typing import TDataItem, TDataItems, TFun, extract_inner_type, get_generic_type_argument_from_instance, is_optional_type +from dlt.common.typing import ( + TDataItem, + 
TDataItems, + TFun, + extract_inner_type, + get_generic_type_argument_from_instance, + is_optional_type, +) from dlt.common.schema.typing import TColumnNames from dlt.common.configuration import configspec, ConfigurationValueError from dlt.common.configuration.specs import BaseConfiguration from dlt.common.pipeline import resource_state -from dlt.common.data_types.type_helpers import coerce_from_date_types, coerce_value, py_type_to_sc_type +from dlt.common.data_types.type_helpers import ( + coerce_from_date_types, + coerce_value, + py_type_to_sc_type, +) from dlt.extract.exceptions import IncrementalUnboundError -from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing +from dlt.extract.incremental.exceptions import ( + IncrementalCursorPathMissing, + IncrementalPrimaryKeyMissing, +) from dlt.extract.incremental.typing import IncrementalColumnState, TCursorValue, LastValueFunc from dlt.extract.pipe import Pipe from dlt.extract.typing import SupportsPipe, TTableHintTemplate, ItemTransform -from dlt.extract.incremental.transform import JsonIncremental, ArrowIncremental, IncrementalTransform +from dlt.extract.incremental.transform import ( + JsonIncremental, + ArrowIncremental, + IncrementalTransform, +) + try: from dlt.common.libs.pyarrow import is_arrow_item except MissingDependencyException: @@ -69,6 +88,7 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa The values passed explicitly to Incremental will be ignored. Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded """ + # this is config/dataclass so declare members cursor_path: str = None # TODO: Support typevar here @@ -79,13 +99,13 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa EMPTY: ClassVar["Incremental[Any]"] = None def __init__( - self, - cursor_path: str = dlt.config.value, - initial_value: Optional[TCursorValue]=None, - last_value_func: Optional[LastValueFunc[TCursorValue]]=max, - primary_key: Optional[TTableHintTemplate[TColumnNames]] = None, - end_value: Optional[TCursorValue] = None, - allow_external_schedulers: bool = False + self, + cursor_path: str = dlt.config.value, + initial_value: Optional[TCursorValue] = None, + last_value_func: Optional[LastValueFunc[TCursorValue]] = max, + primary_key: Optional[TTableHintTemplate[TColumnNames]] = None, + end_value: Optional[TCursorValue] = None, + allow_external_schedulers: bool = False, ) -> None: # make sure that path is valid if cursor_path: @@ -122,12 +142,13 @@ def _make_transforms(self) -> None: self.end_value, self._cached_state, self.last_value_func, - self.primary_key + self.primary_key, ) - @classmethod - def from_existing_state(cls, resource_name: str, cursor_path: str) -> "Incremental[TCursorValue]": + def from_existing_state( + cls, resource_name: str, cursor_path: str + ) -> "Incremental[TCursorValue]": """Create Incremental instance from existing state.""" state = Incremental._get_state(resource_name, cursor_path) i = cls(cursor_path, state["initial_value"]) @@ -143,7 +164,7 @@ def copy(self) -> "Incremental[TCursorValue]": last_value_func=self.last_value_func, primary_key=self.primary_key, end_value=self.end_value, - allow_external_schedulers=self.allow_external_schedulers + allow_external_schedulers=self.allow_external_schedulers, ) def merge(self, other: "Incremental[TCursorValue]") -> "Incremental[TCursorValue]": @@ -158,15 +179,17 
@@ def merge(self, other: "Incremental[TCursorValue]") -> "Incremental[TCursorValue """ kwargs = dict(self, last_value_func=self.last_value_func, primary_key=self.primary_key) for key, value in dict( - other, - last_value_func=other.last_value_func, primary_key=other.primary_key).items(): + other, last_value_func=other.last_value_func, primary_key=other.primary_key + ).items(): if value is not None: kwargs[key] = value # preserve Generic param information if hasattr(self, "__orig_class__"): constructor = self.__orig_class__ else: - constructor = other.__orig_class__ if hasattr(other, "__orig_class__") else other.__class__ + constructor = ( + other.__orig_class__ if hasattr(other, "__orig_class__") else other.__class__ + ) constructor = extract_inner_type(constructor) return constructor(**kwargs) # type: ignore @@ -174,17 +197,28 @@ def on_resolved(self) -> None: compile_path(self.cursor_path) if self.end_value is not None and self.initial_value is None: raise ConfigurationValueError( - "Incremental 'end_value' was specified without 'initial_value'. 'initial_value' is required when using 'end_value'." + "Incremental 'end_value' was specified without 'initial_value'. 'initial_value' is" + " required when using 'end_value'." ) # Ensure end value is "higher" than initial value - if self.end_value is not None and self.last_value_func([self.end_value, self.initial_value]) != self.end_value: + if ( + self.end_value is not None + and self.last_value_func([self.end_value, self.initial_value]) != self.end_value + ): if self.last_value_func in (min, max): - adject = 'higher' if self.last_value_func is max else 'lower' - msg = f"Incremental 'initial_value' ({self.initial_value}) is {adject} than 'end_value` ({self.end_value}). 'end_value' must be {adject} than 'initial_value'" + adject = "higher" if self.last_value_func is max else "lower" + msg = ( + f"Incremental 'initial_value' ({self.initial_value}) is {adject} than" + f" 'end_value` ({self.end_value}). 'end_value' must be {adject} than" + " 'initial_value'" + ) else: msg = ( - f"Incremental 'initial_value' ({self.initial_value}) is greater than 'end_value' ({self.end_value}) as determined by the custom 'last_value_func'. " - f"The result of '{self.last_value_func.__name__}([end_value, initial_value])' must equal 'end_value'" + f"Incremental 'initial_value' ({self.initial_value}) is greater than" + f" 'end_value' ({self.end_value}) as determined by the custom" + " 'last_value_func'. The result of" + f" '{self.last_value_func.__name__}([end_value, initial_value])' must equal" + " 'end_value'" ) raise ConfigurationValueError(msg) @@ -206,9 +240,9 @@ def get_state(self) -> IncrementalColumnState: if self.end_value is not None: # End value uses mock state. We don't want to write it. 
return { - 'initial_value': self.initial_value, - 'last_value': self.initial_value, - 'unique_hashes': [] + "initial_value": self.initial_value, + "last_value": self.initial_value, + "unique_hashes": [], } if not self.resource_name: @@ -221,23 +255,27 @@ def get_state(self) -> IncrementalColumnState: { "initial_value": self.initial_value, "last_value": self.initial_value, - 'unique_hashes': [] + "unique_hashes": [], } ) return self._cached_state @staticmethod def _get_state(resource_name: str, cursor_path: str) -> IncrementalColumnState: - state: IncrementalColumnState = resource_state(resource_name).setdefault('incremental', {}).setdefault(cursor_path, {}) + state: IncrementalColumnState = ( + resource_state(resource_name).setdefault("incremental", {}).setdefault(cursor_path, {}) + ) # if state params is empty return state @property def last_value(self) -> Optional[TCursorValue]: s = self.get_state() - return s['last_value'] # type: ignore + return s["last_value"] # type: ignore - def _transform_item(self, transformer: IncrementalTransform, row: TDataItem) -> Optional[TDataItem]: + def _transform_item( + self, transformer: IncrementalTransform, row: TDataItem + ) -> Optional[TDataItem]: row, start_out_of_range, end_out_of_range = transformer(row) self.start_out_of_range = start_out_of_range self.end_out_of_range = end_out_of_range @@ -249,8 +287,8 @@ def get_incremental_value_type(self) -> Type[Any]: def _join_external_scheduler(self) -> None: """Detects existence of external scheduler from which `start_value` and `end_value` are taken. Detects Airflow and environment variables. - The logical "start date" coming from external scheduler will set the `initial_value` in incremental. if additionally logical "end date" is - present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded + The logical "start date" coming from external scheduler will set the `initial_value` in incremental. if additionally logical "end date" is + present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded """ # fit the pendulum into incremental type param_type = self.get_incremental_value_type() @@ -259,14 +297,22 @@ def _join_external_scheduler(self) -> None: if param_type is not Any: data_type = py_type_to_sc_type(param_type) except Exception as ex: - logger.warning(f"Specified Incremental last value type {param_type} is not supported. Please use DateTime, Date, float, int or str to join external schedulers.({ex})") + logger.warning( + f"Specified Incremental last value type {param_type} is not supported. Please use" + f" DateTime, Date, float, int or str to join external schedulers.({ex})" + ) if param_type is Any: - logger.warning("Could not find the last value type of Incremental class participating in external schedule. " - "Please add typing when declaring incremental argument in your resource or pass initial_value from which the type can be inferred.") + logger.warning( + "Could not find the last value type of Incremental class participating in external" + " schedule. Please add typing when declaring incremental argument in your resource" + " or pass initial_value from which the type can be inferred." 
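# Illustrative sketch, not taken from this patch: the advice in the warning above, i.e.
# declare the incremental argument with a concrete type (or provide an initial_value) so
# the cursor value type can be inferred and coerced for external schedulers. Resource and
# field names are hypothetical; dlt.sources.incremental and dlt.common.pendulum are assumed.
import dlt
from dlt.common import pendulum

@dlt.resource
def events(
    updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sources.incremental(
        "updated_at", initial_value=pendulum.datetime(2023, 1, 1)
    )
):
    yield {"id": 1, "updated_at": pendulum.datetime(2023, 6, 1)}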
+ ) return - def _ensure_airflow_end_date(start_date: pendulum.DateTime, end_date: pendulum.DateTime) -> Optional[pendulum.DateTime]: + def _ensure_airflow_end_date( + start_date: pendulum.DateTime, end_date: pendulum.DateTime + ) -> Optional[pendulum.DateTime]: """if end_date is in the future or same as start date (manual run), set it to None so dlt state is used for incremental loading""" now = pendulum.now() if end_date is None or end_date > now or start_date == end_date: @@ -276,6 +322,7 @@ def _ensure_airflow_end_date(start_date: pendulum.DateTime, end_date: pendulum.D try: # we can move it to separate module when we have more of those from airflow.operators.python import get_current_context # noqa + context = get_current_context() start_date = context["data_interval_start"] end_date = _ensure_airflow_end_date(start_date, context["data_interval_end"]) @@ -284,10 +331,17 @@ def _ensure_airflow_end_date(start_date: pendulum.DateTime, end_date: pendulum.D self.end_value = coerce_from_date_types(data_type, end_date) else: self.end_value = None - logger.info(f"Found Airflow scheduler: initial value: {self.initial_value} from data_interval_start {context['data_interval_start']}, end value: {self.end_value} from data_interval_end {context['data_interval_end']}") + logger.info( + f"Found Airflow scheduler: initial value: {self.initial_value} from" + f" data_interval_start {context['data_interval_start']}, end value:" + f" {self.end_value} from data_interval_end {context['data_interval_end']}" + ) return except TypeError as te: - logger.warning(f"Could not coerce Airflow execution dates into the last value type {param_type}. ({te})") + logger.warning( + f"Could not coerce Airflow execution dates into the last value type {param_type}." + f" ({te})" + ) except Exception: pass @@ -310,24 +364,30 @@ def bind(self, pipe: SupportsPipe) -> "Incremental[TCursorValue]": self._join_external_scheduler() # set initial value from last value, in case of a new state those are equal self.start_value = self.last_value - logger.info(f"Bind incremental on {self.resource_name} with initial_value: {self.initial_value}, start_value: {self.start_value}, end_value: {self.end_value}") + logger.info( + f"Bind incremental on {self.resource_name} with initial_value: {self.initial_value}," + f" start_value: {self.start_value}, end_value: {self.end_value}" + ) # cache state self._cached_state = self.get_state() self._make_transforms() return self def __str__(self) -> str: - return f"Incremental at {id(self)} for resource {self.resource_name} with cursor path: {self.cursor_path} initial {self.initial_value} lv_func {self.last_value_func}" + return ( + f"Incremental at {id(self)} for resource {self.resource_name} with cursor path:" + f" {self.cursor_path} initial {self.initial_value} lv_func {self.last_value_func}" + ) def _get_transformer(self, items: TDataItems) -> IncrementalTransform: # Assume list is all of the same type for item in items if isinstance(items, list) else [items]: if is_arrow_item(item): - return self._transformers['arrow'] + return self._transformers["arrow"] elif pd is not None and isinstance(item, pd.DataFrame): - return self._transformers['arrow'] - return self._transformers['json'] - return self._transformers['json'] + return self._transformers["arrow"] + return self._transformers["json"] + return self._transformers["json"] def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: if rows is None: @@ -337,9 +397,14 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> 
Optional[TDataItems]: transformer.primary_key = self.primary_key if isinstance(rows, list): - return [item for item in (self._transform_item(transformer, row) for row in rows) if item is not None] + return [ + item + for item in (self._transform_item(transformer, row) for row in rows) + if item is not None + ] return self._transform_item(transformer, rows) + Incremental.EMPTY = Incremental[Any]("") @@ -372,14 +437,15 @@ def get_incremental_arg(sig: inspect.Signature) -> Optional[inspect.Parameter]: for p in sig.parameters.values(): annotation = extract_inner_type(p.annotation) annotation = get_origin(annotation) or annotation - if (inspect.isclass(annotation) and issubclass(annotation, Incremental)) or isinstance(p.default, Incremental): + if (inspect.isclass(annotation) and issubclass(annotation, Incremental)) or isinstance( + p.default, Incremental + ): incremental_param = p break return incremental_param def wrap(self, sig: inspect.Signature, func: TFun) -> TFun: - """Wrap the callable to inject an `Incremental` object configured for the resource. - """ + """Wrap the callable to inject an `Incremental` object configured for the resource.""" incremental_param = self.get_incremental_arg(sig) assert incremental_param, "Please use `should_wrap` to decide if to call this function" @@ -413,9 +479,16 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: if is_optional_type(p.annotation): bound_args.arguments[p.name] = None # Remove partial spec return func(*bound_args.args, **bound_args.kwargs) - raise ValueError(f"{p.name} Incremental argument has no default. Please wrap its typing in Optional[] to allow no incremental") + raise ValueError( + f"{p.name} Incremental argument has no default. Please wrap its typing in" + " Optional[] to allow no incremental" + ) # pass Generic information from annotation to new_incremental - if not hasattr(new_incremental, "__orig_class__") and p.annotation and get_args(p.annotation): + if ( + not hasattr(new_incremental, "__orig_class__") + and p.annotation + and get_args(p.annotation) + ): new_incremental.__orig_class__ = p.annotation # type: ignore # set the incremental only if not yet set or if it was passed explicitly diff --git a/dlt/extract/incremental/exceptions.py b/dlt/extract/incremental/exceptions.py index 8de5623c78..e318a028dc 100644 --- a/dlt/extract/incremental/exceptions.py +++ b/dlt/extract/incremental/exceptions.py @@ -3,10 +3,13 @@ class IncrementalCursorPathMissing(PipeException): - def __init__(self, pipe_name: str, json_path: str, item: TDataItem, msg: str=None) -> None: + def __init__(self, pipe_name: str, json_path: str, item: TDataItem, msg: str = None) -> None: self.json_path = json_path self.item = item - msg = msg or f"Cursor element with JSON path {json_path} was not found in extracted data item. All data items must contain this path. Use the same names of fields as in your JSON document - if those are different from the names you see in database." + msg = ( + msg + or f"Cursor element with JSON path {json_path} was not found in extracted data item. All data items must contain this path. Use the same names of fields as in your JSON document - if those are different from the names you see in database." + ) super().__init__(pipe_name, msg) @@ -14,5 +17,9 @@ class IncrementalPrimaryKeyMissing(PipeException): def __init__(self, pipe_name: str, primary_key_column: str, item: TDataItem) -> None: self.primary_key_column = primary_key_column self.item = item - msg = f"Primary key column {primary_key_column} was not found in extracted data item. 
All data items must contain this column. Use the same names of fields as in your JSON document." + msg = ( + f"Primary key column {primary_key_column} was not found in extracted data item. All" + " data items must contain this column. Use the same names of fields as in your JSON" + " document." + ) super().__init__(pipe_name, msg) diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index 44538aa3f5..adf0c33ad3 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -17,11 +17,15 @@ from dlt.common import pendulum from dlt.common.typing import TDataItem, TDataItems from dlt.common.jsonpath import TJsonPath, find_values, JSONPathFields, compile_path -from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing +from dlt.extract.incremental.exceptions import ( + IncrementalCursorPathMissing, + IncrementalPrimaryKeyMissing, +) from dlt.extract.incremental.typing import IncrementalColumnState, TCursorValue, LastValueFunc from dlt.extract.utils import resolve_column_value from dlt.extract.typing import TTableHintTemplate from dlt.common.schema.typing import TColumnNames + try: from dlt.common.libs import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem @@ -52,24 +56,26 @@ def __init__( # compile jsonpath self._compiled_cursor_path = compile_path(cursor_path) # for simple column name we'll fallback to search in dict - if isinstance(self._compiled_cursor_path, JSONPathFields) and len(self._compiled_cursor_path.fields) == 1 and self._compiled_cursor_path.fields[0] != "*": + if ( + isinstance(self._compiled_cursor_path, JSONPathFields) + and len(self._compiled_cursor_path.fields) == 1 + and self._compiled_cursor_path.fields[0] != "*" + ): self.cursor_path = self._compiled_cursor_path.fields[0] self._compiled_cursor_path = None def __call__( self, row: TDataItem, - ) -> Tuple[bool, bool, bool]: - ... + ) -> Tuple[bool, bool, bool]: ... class JsonIncremental(IncrementalTransform): - def unique_value( self, row: TDataItem, primary_key: Optional[TTableHintTemplate[TColumnNames]], - resource_name: str + resource_name: str, ) -> str: try: if primary_key: @@ -84,7 +90,7 @@ def unique_value( def find_cursor_value(self, row: TDataItem) -> Any: """Finds value in row at cursor defined by self.cursor_path. 
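# Illustrative sketch, not taken from this patch: the two cursor_path forms handled here.
# A plain column name falls back to a simple dict lookup, while a nested path is resolved
# with the compiled JSONPath. Resource and field names are hypothetical.
import dlt

@dlt.resource
def flat(cursor=dlt.sources.incremental("updated_at")):  # simple column name
    yield {"updated_at": "2023-01-01"}

@dlt.resource
def nested(cursor=dlt.sources.incremental("metadata.modified_at")):  # nested JSONPath
    yield {"metadata": {"modified_at": "2023-01-01"}}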
- Will use compiled JSONPath if present, otherwise it reverts to column search if row is dict + Will use compiled JSONPath if present, otherwise it reverts to column search if row is dict """ row_value: Any = None if self._compiled_cursor_path: @@ -119,33 +125,36 @@ def __call__( if isinstance(row_value, datetime): row_value = pendulum.instance(row_value) - last_value = self.incremental_state['last_value'] + last_value = self.incremental_state["last_value"] # Check whether end_value has been reached # Filter end value ranges exclusively, so in case of "max" function we remove values >= end_value if self.end_value is not None and ( - self.last_value_func((row_value, self.end_value)) != self.end_value or self.last_value_func((row_value, )) == self.end_value + self.last_value_func((row_value, self.end_value)) != self.end_value + or self.last_value_func((row_value,)) == self.end_value ): end_out_of_range = True return None, start_out_of_range, end_out_of_range - check_values = (row_value,) + ((last_value, ) if last_value is not None else ()) + check_values = (row_value,) + ((last_value,) if last_value is not None else ()) new_value = self.last_value_func(check_values) if last_value == new_value: - processed_row_value = self.last_value_func((row_value, )) + processed_row_value = self.last_value_func((row_value,)) # we store row id for all records with the current "last_value" in state and use it to deduplicate if processed_row_value == last_value: unique_value = self.unique_value(row, self.primary_key, self.resource_name) # if unique value exists then use it to deduplicate if unique_value: - if unique_value in self.incremental_state['unique_hashes']: + if unique_value in self.incremental_state["unique_hashes"]: return None, start_out_of_range, end_out_of_range # add new hash only if the record row id is same as current last value - self.incremental_state['unique_hashes'].append(unique_value) + self.incremental_state["unique_hashes"].append(unique_value) return row, start_out_of_range, end_out_of_range # skip the record that is not a last_value or new_value: that record was already processed - check_values = (row_value,) + ((self.start_value,) if self.start_value is not None else ()) + check_values = (row_value,) + ( + (self.start_value,) if self.start_value is not None else () + ) new_value = self.last_value_func(check_values) # Include rows == start_value but exclude "lower" if new_value == self.start_value and processed_row_value != self.start_value: @@ -166,10 +175,7 @@ class ArrowIncremental(IncrementalTransform): _dlt_index = "_dlt_index" def unique_values( - self, - item: "TAnyArrowItem", - unique_columns: List[str], - resource_name: str + self, item: "TAnyArrowItem", unique_columns: List[str], resource_name: str ) -> List[Tuple[int, str]]: if not unique_columns: return [] @@ -180,7 +186,9 @@ def unique_values( (index, digest128(json.dumps(row, sort_keys=True))) for index, row in zip(indices, rows) ] - def _deduplicate(self, tbl: "pa.Table", unique_columns: Optional[List[str]], aggregate: str, cursor_path: str) -> "pa.Table": + def _deduplicate( + self, tbl: "pa.Table", unique_columns: Optional[List[str]], aggregate: str, cursor_path: str + ) -> "pa.Table": """Creates unique index if necessary.""" # create unique index if necessary if self._dlt_index not in tbl.schema.names: @@ -218,7 +226,7 @@ def __call__( if not tbl: # row is None or empty arrow table return tbl, start_out_of_range, end_out_of_range - last_value = self.incremental_state['last_value'] + last_value = 
self.incremental_state["last_value"] if self.last_value_func is max: compute = pa.compute.max @@ -233,8 +241,9 @@ def __call__( last_value_compare = pa.compute.less_equal new_value_compare = pa.compute.less else: - raise NotImplementedError("Only min or max last_value_func is supported for arrow tables") - + raise NotImplementedError( + "Only min or max last_value_func is supported for arrow tables" + ) # TODO: Json path support. For now assume the cursor_path is a column name cursor_path = self.cursor_path @@ -249,8 +258,12 @@ def __call__( row_value = pendulum.from_timestamp(orig_row_value.cast(pa.int64()).as_py() / 1000) except KeyError as e: raise IncrementalCursorPathMissing( - self.resource_name, cursor_path, tbl, - f"Column name {cursor_path} was not found in the arrow table. Not nested JSON paths are not supported for arrow tables and dataframes, the incremental cursor_path must be a column name." + self.resource_name, + cursor_path, + tbl, + f"Column name {cursor_path} was not found in the arrow table. Not nested JSON paths" + " are not supported for arrow tables and dataframes, the incremental cursor_path" + " must be a column name.", ) from e # If end_value is provided, filter to include table rows that are "less" than end_value @@ -273,26 +286,47 @@ def __call__( eq_rows = tbl.filter(pa.compute.equal(tbl[cursor_path], last_value)) # compute index, unique hash mapping unique_values = self.unique_values(eq_rows, unique_columns, self.resource_name) - unique_values = [(i, uq_val) for i, uq_val in unique_values if uq_val in self.incremental_state['unique_hashes']] + unique_values = [ + (i, uq_val) + for i, uq_val in unique_values + if uq_val in self.incremental_state["unique_hashes"] + ] remove_idx = pa.array(i for i, _ in unique_values) # Filter the table tbl = tbl.filter(pa.compute.invert(pa.compute.is_in(tbl[self._dlt_index], remove_idx))) - if new_value_compare(row_value, last_value).as_py() and row_value != last_value: # Last value has changed - self.incremental_state['last_value'] = row_value + if ( + new_value_compare(row_value, last_value).as_py() and row_value != last_value + ): # Last value has changed + self.incremental_state["last_value"] = row_value # Compute unique hashes for all rows equal to row value - self.incremental_state['unique_hashes'] = [uq_val for _, uq_val in self.unique_values( - tbl.filter(pa.compute.equal(tbl[cursor_path], row_value)), unique_columns, self.resource_name - )] + self.incremental_state["unique_hashes"] = [ + uq_val + for _, uq_val in self.unique_values( + tbl.filter(pa.compute.equal(tbl[cursor_path], row_value)), + unique_columns, + self.resource_name, + ) + ] else: # last value is unchanged, add the hashes - self.incremental_state['unique_hashes'] = list(set(self.incremental_state['unique_hashes'] + [uq_val for _, uq_val in unique_values])) + self.incremental_state["unique_hashes"] = list( + set( + self.incremental_state["unique_hashes"] + + [uq_val for _, uq_val in unique_values] + ) + ) else: tbl = self._deduplicate(tbl, unique_columns, aggregate, cursor_path) - self.incremental_state['last_value'] = row_value - self.incremental_state['unique_hashes'] = [uq_val for _, uq_val in self.unique_values( - tbl.filter(pa.compute.equal(tbl[cursor_path], row_value)), unique_columns, self.resource_name - )] + self.incremental_state["last_value"] = row_value + self.incremental_state["unique_hashes"] = [ + uq_val + for _, uq_val in self.unique_values( + tbl.filter(pa.compute.equal(tbl[cursor_path], row_value)), + unique_columns, + self.resource_name, 
+ ) + ] if len(tbl) == 0: return None, start_out_of_range, end_out_of_range diff --git a/dlt/extract/incremental/typing.py b/dlt/extract/incremental/typing.py index 03f36121be..9cec97d34d 100644 --- a/dlt/extract/incremental/typing.py +++ b/dlt/extract/incremental/typing.py @@ -4,6 +4,7 @@ TCursorValue = TypeVar("TCursorValue", bound=Any) LastValueFunc = Callable[[Sequence[TCursorValue]], Any] + class IncrementalColumnState(TypedDict): initial_value: Optional[Any] last_value: Optional[Any] diff --git a/dlt/extract/pipe.py b/dlt/extract/pipe.py index 24fe3203aa..85c654b46c 100644 --- a/dlt/extract/pipe.py +++ b/dlt/extract/pipe.py @@ -6,7 +6,23 @@ from concurrent.futures import ThreadPoolExecutor from copy import copy from threading import Thread -from typing import Any, Dict, Optional, Sequence, Union, Callable, Iterable, Iterator, List, NamedTuple, Awaitable, Tuple, Type, TYPE_CHECKING, Literal +from typing import ( + Any, + Dict, + Optional, + Sequence, + Union, + Callable, + Iterable, + Iterator, + List, + NamedTuple, + Awaitable, + Tuple, + Type, + TYPE_CHECKING, + Literal, +) from dlt.common import sleep from dlt.common.configuration import configspec @@ -18,11 +34,27 @@ from dlt.common.typing import AnyFun, AnyType, TDataItems from dlt.common.utils import get_callable_name -from dlt.extract.exceptions import (CreatePipeException, DltSourceException, ExtractorException, InvalidStepFunctionArguments, - InvalidResourceDataTypeFunctionNotAGenerator, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, - PipeException, PipeGenInvalid, PipeItemProcessingError, PipeNotBoundToData, ResourceExtractionError) +from dlt.extract.exceptions import ( + CreatePipeException, + DltSourceException, + ExtractorException, + InvalidStepFunctionArguments, + InvalidResourceDataTypeFunctionNotAGenerator, + InvalidTransformerGeneratorFunction, + ParametrizedResourceUnbound, + PipeException, + PipeGenInvalid, + PipeItemProcessingError, + PipeNotBoundToData, + ResourceExtractionError, +) from dlt.extract.typing import DataItemWithMeta, ItemTransform, SupportsPipe, TPipedDataItems -from dlt.extract.utils import check_compat_transformer, simulate_func_call, wrap_compat_transformer, wrap_resource_gen +from dlt.extract.utils import ( + check_compat_transformer, + simulate_func_call, + wrap_compat_transformer, + wrap_resource_gen, +) if TYPE_CHECKING: TItemFuture = Future[Union[TDataItems, DataItemWithMeta]] @@ -61,6 +93,7 @@ class SourcePipeItem(NamedTuple): # pipeline step may be iterator of data items or mapping function that returns data item or another iterator from dlt.common.typing import TDataItem + TPipeStep = Union[ Iterable[TPipedDataItems], Iterator[TPipedDataItems], @@ -115,7 +148,12 @@ def __init__(self, name: str, steps: List[TPipeStep] = None, parent: "Pipe" = No self.append_step(step) @classmethod - def from_data(cls, name: str, gen: Union[Iterable[TPipedDataItems], Iterator[TPipedDataItems], AnyFun], parent: "Pipe" = None) -> "Pipe": + def from_data( + cls, + name: str, + gen: Union[Iterable[TPipedDataItems], Iterator[TPipedDataItems], AnyFun], + parent: "Pipe" = None, + ) -> "Pipe": return cls(name, [gen], parent=parent) @property @@ -150,7 +188,7 @@ def steps(self) -> List[TPipeStep]: def find(self, *step_type: AnyType) -> int: """Finds a step with object of type `step_type`""" - return next((i for i,v in enumerate(self._steps) if isinstance(v, step_type)), -1) + return next((i for i, v in enumerate(self._steps) if isinstance(v, step_type)), -1) def __getitem__(self, i: int) -> 
TPipeStep: return self._steps[i] @@ -188,7 +226,11 @@ def insert_step(self, step: TPipeStep, index: int) -> "Pipe": return self.append_step(step) if index == 0: if not self.has_parent: - raise CreatePipeException(self.name, "You cannot insert a step before head of the resource that is not a transformer") + raise CreatePipeException( + self.name, + "You cannot insert a step before head of the resource that is not a" + " transformer", + ) step = self._wrap_transform_step_meta(index, step) # actually insert in the list self._steps.insert(index, step) @@ -200,7 +242,10 @@ def insert_step(self, step: TPipeStep, index: int) -> "Pipe": def remove_step(self, index: int) -> None: """Removes steps at a given index. Gen step cannot be removed""" if index == self._gen_idx: - raise CreatePipeException(self.name, f"Step at index {index} holds a data generator for this pipe and cannot be removed") + raise CreatePipeException( + self.name, + f"Step at index {index} holds a data generator for this pipe and cannot be removed", + ) self._steps.pop(index) if index < self._gen_idx: self._gen_idx -= 1 @@ -241,7 +286,13 @@ def ensure_gen_bound(self) -> None: sig.bind() except TypeError as ex: callable_name = get_callable_name(head) - raise ParametrizedResourceUnbound(self.name, callable_name, sig.replace(parameters=list(sig.parameters.values())[1:]), "resource", str(ex)) + raise ParametrizedResourceUnbound( + self.name, + callable_name, + sig.replace(parameters=list(sig.parameters.values())[1:]), + "resource", + str(ex), + ) def evaluate_gen(self) -> None: """Lazily evaluate gen of the pipe when creating PipeIterator. Allows creating multiple use pipes from generator functions and lists""" @@ -255,7 +306,13 @@ def evaluate_gen(self) -> None: # must be parameter-less callable or parameters must have defaults self.replace_gen(gen()) # type: ignore except TypeError as ex: - raise ParametrizedResourceUnbound(self.name, get_callable_name(gen), inspect.signature(gen), "resource", str(ex)) + raise ParametrizedResourceUnbound( + self.name, + get_callable_name(gen), + inspect.signature(gen), + "resource", + str(ex), + ) # otherwise it must be an iterator if isinstance(gen, Iterable): self.replace_gen(iter(gen)) @@ -309,18 +366,28 @@ def _wrap_gen(self, *args: Any, **kwargs: Any) -> Any: def _verify_head_step(self, step: TPipeStep) -> None: # first element must be Iterable, Iterator or Callable in resource pipe if not isinstance(step, (Iterable, Iterator)) and not callable(step): - raise CreatePipeException(self.name, "A head of a resource pipe must be Iterable, Iterator or a Callable") + raise CreatePipeException( + self.name, "A head of a resource pipe must be Iterable, Iterator or a Callable" + ) def _wrap_transform_step_meta(self, step_no: int, step: TPipeStep) -> TPipeStep: # step must be a callable: a transformer or a transformation if isinstance(step, (Iterable, Iterator)) and not callable(step): if self.has_parent: - raise CreatePipeException(self.name, "Iterable or Iterator cannot be a step in transformer pipe") + raise CreatePipeException( + self.name, "Iterable or Iterator cannot be a step in transformer pipe" + ) else: - raise CreatePipeException(self.name, "Iterable or Iterator can only be a first step in resource pipe") + raise CreatePipeException( + self.name, "Iterable or Iterator can only be a first step in resource pipe" + ) if not callable(step): - raise CreatePipeException(self.name, "Pipe step must be a callable taking one data item as argument and optional second meta argument") + raise 
CreatePipeException( + self.name, + "Pipe step must be a callable taking one data item as argument and optional second" + " meta argument", + ) else: # check the signature sig = inspect.signature(step) @@ -344,8 +411,13 @@ def _partial(*args: Any, **kwargs: Any) -> Any: # del kwargs["meta"] return orig_step(*args, **kwargs) - meta_arg = inspect.Parameter("meta", inspect._ParameterKind.KEYWORD_ONLY, default=None) - kwargs_arg = next((p for p in sig.parameters.values() if p.kind == inspect.Parameter.VAR_KEYWORD), None) + meta_arg = inspect.Parameter( + "meta", inspect._ParameterKind.KEYWORD_ONLY, default=None + ) + kwargs_arg = next( + (p for p in sig.parameters.values() if p.kind == inspect.Parameter.VAR_KEYWORD), + None, + ) if kwargs_arg: # pass meta in variadic new_sig = sig @@ -358,7 +430,6 @@ def _partial(*args: Any, **kwargs: Any) -> Any: self._ensure_transform_step(step_no, step) return step - def _ensure_transform_step(self, step_no: int, step: TPipeStep) -> None: """Verifies that `step` is a valid callable to be a transform step of the pipeline""" assert callable(step), f"{step} must be callable" @@ -375,7 +446,13 @@ def _ensure_transform_step(self, step_no: int, step: TPipeStep) -> None: raise InvalidTransformerGeneratorFunction(self.name, callable_name, sig, code=1) else: # show the sig without first argument - raise ParametrizedResourceUnbound(self.name, callable_name, sig.replace(parameters=list(sig.parameters.values())[1:]), "transformer", str(ty_ex)) + raise ParametrizedResourceUnbound( + self.name, + callable_name, + sig.replace(parameters=list(sig.parameters.values())[1:]), + "transformer", + str(ty_ex), + ) else: raise InvalidStepFunctionArguments(self.name, callable_name, sig, str(ty_ex)) @@ -405,7 +482,6 @@ def __repr__(self) -> str: class PipeIterator(Iterator[PipeItem]): - @configspec class PipeIteratorConfiguration(BaseConfiguration): max_parallel_items: int = 20 @@ -416,7 +492,13 @@ class PipeIteratorConfiguration(BaseConfiguration): __section__ = "extract" - def __init__(self, max_parallel_items: int, workers: int, futures_poll_interval: float, next_item_mode: TPipeNextItemMode) -> None: + def __init__( + self, + max_parallel_items: int, + workers: int, + futures_poll_interval: float, + next_item_mode: TPipeNextItemMode, + ) -> None: self.max_parallel_items = max_parallel_items self.workers = workers self.futures_poll_interval = futures_poll_interval @@ -432,7 +514,15 @@ def __init__(self, max_parallel_items: int, workers: int, futures_poll_interval: @classmethod @with_config(spec=PipeIteratorConfiguration) - def from_pipe(cls, pipe: Pipe, *, max_parallel_items: int = 20, workers: int = 5, futures_poll_interval: float = 0.01, next_item_mode: TPipeNextItemMode = "fifo") -> "PipeIterator": + def from_pipe( + cls, + pipe: Pipe, + *, + max_parallel_items: int = 20, + workers: int = 5, + futures_poll_interval: float = 0.01, + next_item_mode: TPipeNextItemMode = "fifo", + ) -> "PipeIterator": # join all dependent pipes if pipe.parent: pipe = pipe.full_pipe() @@ -460,15 +550,13 @@ def from_pipes( workers: int = 5, futures_poll_interval: float = 0.01, copy_on_fork: bool = False, - next_item_mode: TPipeNextItemMode = "fifo" + next_item_mode: TPipeNextItemMode = "fifo", ) -> "PipeIterator": - # print(f"max_parallel_items: {max_parallel_items} workers: {workers}") extract = cls(max_parallel_items, workers, futures_poll_interval, next_item_mode) # clone all pipes before iterating (recursively) as we will fork them (this add steps) and evaluate gens pipes, _ = 
PipeIterator.clone_pipes(pipes) - def _fork_pipeline(pipe: Pipe) -> None: if pipe.parent: # fork the parent pipe @@ -524,7 +612,9 @@ def __next__(self) -> PipeItem: # if item is iterator, then add it as a new source if isinstance(item, Iterator): # print(f"adding iterable {item}") - self._sources.append(SourcePipeItem(item, pipe_item.step, pipe_item.pipe, pipe_item.meta)) + self._sources.append( + SourcePipeItem(item, pipe_item.step, pipe_item.pipe, pipe_item.meta) + ) pipe_item = None continue @@ -552,7 +642,11 @@ def __next__(self) -> PipeItem: # must be resolved if isinstance(item, (Iterator, Awaitable)) or callable(item): raise PipeItemProcessingError( - pipe_item.pipe.name, f"Pipe item at step {pipe_item.step} was not fully evaluated and is of type {type(pipe_item.item).__name__}. This is internal error or you are yielding something weird from resources ie. functions or awaitables.") + pipe_item.pipe.name, + f"Pipe item at step {pipe_item.step} was not fully evaluated and is of type" + f" {type(pipe_item.item).__name__}. This is internal error or you are" + " yielding something weird from resources ie. functions or awaitables.", + ) # mypy not able to figure out that item was resolved return pipe_item # type: ignore @@ -567,14 +661,23 @@ def __next__(self) -> PipeItem: next_item = next_item.data except TypeError as ty_ex: assert callable(step) - raise InvalidStepFunctionArguments(pipe_item.pipe.name, get_callable_name(step), inspect.signature(step), str(ty_ex)) + raise InvalidStepFunctionArguments( + pipe_item.pipe.name, + get_callable_name(step), + inspect.signature(step), + str(ty_ex), + ) except (PipelineException, ExtractorException, DltSourceException, PipeException): raise except Exception as ex: - raise ResourceExtractionError(pipe_item.pipe.name, step, str(ex), "transform") from ex + raise ResourceExtractionError( + pipe_item.pipe.name, step, str(ex), "transform" + ) from ex # create next pipe item if a value was returned. 
A None means that item was consumed/filtered out and should not be further processed if next_item is not None: - pipe_item = ResolvablePipeItem(next_item, pipe_item.step + 1, pipe_item.pipe, next_meta) + pipe_item = ResolvablePipeItem( + next_item, pipe_item.step + 1, pipe_item.pipe, next_meta + ) else: pipe_item = None @@ -622,7 +725,7 @@ def start_background_loop(loop: asyncio.AbstractEventLoop) -> None: target=start_background_loop, args=(self._async_pool,), daemon=True, - name="DltFuturesThread" + name="DltFuturesThread", ) self._async_pool_thread.start() @@ -640,7 +743,9 @@ def _ensure_thread_pool(self) -> ThreadPoolExecutor: def __enter__(self) -> "PipeIterator": return self - def __exit__(self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType) -> None: + def __exit__( + self, exc_type: Type[BaseException], exc_val: BaseException, exc_tb: types.TracebackType + ) -> None: self.close() def _next_future(self) -> int: @@ -665,7 +770,9 @@ def _resolve_futures(self) -> ResolvablePipeItem: if future.exception(): ex = future.exception() - if isinstance(ex, (PipelineException, ExtractorException, DltSourceException, PipeException)): + if isinstance( + ex, (PipelineException, ExtractorException, DltSourceException, PipeException) + ): raise ex raise ResourceExtractionError(pipe.name, future, str(ex), "future") from ex @@ -754,7 +861,9 @@ def _get_source_item_round_robin(self) -> ResolvablePipeItem: raise ResourceExtractionError(pipe.name, gen, str(ex), "generator") from ex @staticmethod - def clone_pipes(pipes: Sequence[Pipe], existing_cloned_pairs: Dict[int, Pipe] = None) -> Tuple[List[Pipe], Dict[int, Pipe]]: + def clone_pipes( + pipes: Sequence[Pipe], existing_cloned_pairs: Dict[int, Pipe] = None + ) -> Tuple[List[Pipe], Dict[int, Pipe]]: """This will clone pipes and fix the parent/dependent references""" cloned_pipes = [p._clone() for p in pipes if id(p) not in (existing_cloned_pairs or {})] cloned_pairs = {id(p): c for p, c in zip(pipes, cloned_pipes)} @@ -784,6 +893,7 @@ def clone_pipes(pipes: Sequence[Pipe], existing_cloned_pairs: Dict[int, Pipe] = class ManagedPipeIterator(PipeIterator): """A version of the pipe iterator that gets closed automatically on an exception in _next_""" + _ctx: List[ContainerInjectableContext] = None _container: Container = None diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py index 2c3018e77d..a5d913c092 100644 --- a/dlt/extract/resource.py +++ b/dlt/extract/resource.py @@ -1,25 +1,56 @@ from copy import deepcopy import inspect -from typing import AsyncIterable, AsyncIterator, ClassVar, Callable, Iterable, Iterator, Union, Any, Optional +from typing import ( + AsyncIterable, + AsyncIterator, + ClassVar, + Callable, + Iterable, + Iterator, + Union, + Any, + Optional, +) from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs import known_sections from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.typing import AnyFun, DictStrAny, StrAny, TDataItem, TDataItems, NoneType from dlt.common.configuration.container import Container -from dlt.common.pipeline import PipelineContext, StateInjectableContext, resource_state, pipeline_state +from dlt.common.pipeline import ( + PipelineContext, + StateInjectableContext, + resource_state, + pipeline_state, +) from dlt.common.utils import flatten_list_or_items, get_callable_name, uniq_id -from dlt.extract.typing import (DataItemWithMeta, ItemTransformFunc, ItemTransformFunctionWithMeta, 
TableNameMeta, - FilterItem, MapItem, YieldMapItem, ValidateItem) +from dlt.extract.typing import ( + DataItemWithMeta, + ItemTransformFunc, + ItemTransformFunctionWithMeta, + TableNameMeta, + FilterItem, + MapItem, + YieldMapItem, + ValidateItem, +) from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep from dlt.extract.hints import DltResourceHints, TTableSchemaTemplate from dlt.extract.incremental import Incremental, IncrementalResourceWrapper from dlt.extract.exceptions import ( - InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, - InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, - InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, - InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer + InvalidTransformerDataTypeGeneratorFunctionRequired, + InvalidParentResourceDataType, + InvalidParentResourceIsAFunction, + InvalidResourceDataType, + InvalidResourceDataTypeIsNone, + InvalidTransformerGeneratorFunction, + InvalidResourceDataTypeAsync, + InvalidResourceDataTypeBasic, + InvalidResourceDataTypeMultiplePipes, + ParametrizedResourceUnbound, + ResourceNameMissing, + ResourceNotATransformer, ) from dlt.extract.wrappers import wrap_additional_type @@ -31,6 +62,7 @@ def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: class DltResource(Iterable[TDataItem], DltResourceHints): """Implements dlt resource. Contains a data pipe that wraps a generating item and table schema that can be adjusted""" + Empty: ClassVar["DltResource"] = None source_name: str """Name of the source that contains this instance of the source, set when added to DltResourcesDict""" @@ -44,7 +76,7 @@ def __init__( selected: bool, incremental: IncrementalResourceWrapper = None, section: str = None, - args_bound: bool = False + args_bound: bool = False, ) -> None: self.section = section self.selected = selected @@ -65,7 +97,7 @@ def from_data( table_schema_template: TTableSchemaTemplate = None, selected: bool = True, data_from: Union["DltResource", Pipe] = None, - incremental: IncrementalResourceWrapper = None + incremental: IncrementalResourceWrapper = None, ) -> "DltResource": if data is None: raise InvalidResourceDataTypeIsNone(name, data, NoneType) # type: ignore @@ -74,7 +106,9 @@ def from_data( return data if isinstance(data, Pipe): - return cls(data, table_schema_template, selected, incremental=incremental, section=section) + return cls( + data, table_schema_template, selected, incremental=incremental, section=section + ) if callable(data): name = name or get_callable_name(data) @@ -105,10 +139,19 @@ def from_data( # create resource from iterator, iterable or generator function if isinstance(data, (Iterable, Iterator)) or callable(data): pipe = Pipe.from_data(name, data, parent=parent_pipe) - return cls(pipe, table_schema_template, selected, incremental=incremental, section=section, args_bound=not callable(data)) + return cls( + pipe, + table_schema_template, + selected, + incremental=incremental, + section=section, + args_bound=not callable(data), + ) else: # some other data type that is not supported - raise InvalidResourceDataType(name, data, type(data), f"The data type of supplied type is {type(data).__name__}") + raise InvalidResourceDataType( + name, data, type(data), f"The data type of supplied type is {type(data).__name__}" + ) @property def name(self) -> str: @@ -165,7 +208,9 @@ def pipe_data_from(self, 
data_from: Union["DltResource", Pipe]) -> None: if self.is_transformer: DltResource._ensure_valid_transformer_resource(self.name, self._pipe.gen) else: - raise ResourceNotATransformer(self.name, "Cannot pipe data into resource that is not a transformer.") + raise ResourceNotATransformer( + self.name, "Cannot pipe data into resource that is not a transformer." + ) parent_pipe = self._get_parent_pipe(self.name, data_from) self._pipe.parent = parent_pipe @@ -177,8 +222,9 @@ def add_pipe(self, data: Any) -> None: def select_tables(self, *table_names: Iterable[str]) -> "DltResource": """For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. - Both `with_table_name` marker and data-based (function) table name hints are supported. + Both `with_table_name` marker and data-based (function) table name hints are supported. """ + def _filter(item: TDataItem, meta: Any = None) -> bool: is_in_meta = isinstance(meta, TableNameMeta) and meta.table_name in table_names is_in_dyn = self._table_name_hint_fun and self._table_name_hint_fun(item) in table_names @@ -188,7 +234,9 @@ def _filter(item: TDataItem, meta: Any = None) -> bool: self.add_filter(_filter) return self - def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) -> "DltResource": # noqa: A003 + def add_map( + self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None + ) -> "DltResource": # noqa: A003 """Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically @@ -206,7 +254,9 @@ def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) self._pipe.insert_step(MapItem(item_map), insert_at) return self - def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None) -> "DltResource": # noqa: A003 + def add_yield_map( + self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None + ) -> "DltResource": # noqa: A003 """Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. 
It may yield 0 or more data items and be used to @@ -225,7 +275,9 @@ def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert self._pipe.insert_step(YieldMapItem(item_map), insert_at) return self - def add_filter(self, item_filter: ItemTransformFunc[bool], insert_at: int = None) -> "DltResource": # noqa: A003 + def add_filter( + self, item_filter: ItemTransformFunc[bool], insert_at: int = None + ) -> "DltResource": # noqa: A003 """Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` `item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically @@ -253,6 +305,7 @@ def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 Returns: "DltResource": returns self """ + def _gen_wrap(gen: TPipeStep) -> TPipeStep: """Wrap a generator to take the first `max_items` records""" nonlocal max_items @@ -269,12 +322,15 @@ def _gen_wrap(gen: TPipeStep) -> TPipeStep: if inspect.isgenerator(gen): gen.close() return + # transformers should be limited by their input, so we only limit non-transformers if not self.is_transformer: self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) return self - def add_step(self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None) -> "DltResource": # noqa: A003 + def add_step( + self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None + ) -> "DltResource": # noqa: A003 if insert_at is None: self._pipe.append_step(item_transform) else: @@ -298,8 +354,8 @@ def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: if primary_key is not None: incremental.primary_key = primary_key - if table_schema_template.get('validator') is not None: - self.validator = table_schema_template['validator'] + if table_schema_template.get("validator") is not None: + self.validator = table_schema_template["validator"] def bind(self, *args: Any, **kwargs: Any) -> "DltResource": """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" @@ -364,7 +420,7 @@ def __or__(self, transform: Union["DltResource", AnyFun]) -> "DltResource": def __iter__(self) -> Iterator[TDataItem]: """Opens iterator that yields the data items from the resources in the same order as in Pipeline class. - A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. + A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. """ # use the same state dict when opening iterator and when iterator is iterated container = Container() @@ -380,7 +436,9 @@ def __iter__(self) -> Iterator[TDataItem]: _iter = map(lambda item: item.item, pipe_iterator) return flatten_list_or_items(_iter) - def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any) -> None: + def _set_explicit_args( + self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any + ) -> None: try: sig = sig or inspect.signature(f) self._explicit_args = sig.bind_partial(*args, **kwargs).arguments @@ -388,8 +446,7 @@ def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: An pass def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResource": - """Creates a deep copy of a current resource, optionally renaming the resource. 
The clone will not be part of the source - """ + """Creates a deep copy of a current resource, optionally renaming the resource. The clone will not be part of the source""" pipe = self._pipe if self._pipe and not self._pipe.is_empty: pipe = pipe._clone(new_name=new_name, with_parent=with_parent) @@ -398,7 +455,7 @@ def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResourc pipe, deepcopy(self._table_schema_template), selected=self.selected, - section=self.section + section=self.section, ) def _get_config_section_context(self) -> ConfigSectionContext: @@ -419,8 +476,12 @@ def _get_config_section_context(self) -> ConfigSectionContext: pipeline_name=pipeline_name, # do not emit middle config section to not overwrite the resource section # only sources emit middle config section - sections=(known_sections.SOURCES, "", self.source_name or default_schema_name or self.name), - source_state_key=self.source_name or default_schema_name or self.section or uniq_id() + sections=( + known_sections.SOURCES, + "", + self.source_name or default_schema_name or self.name, + ), + source_state_key=self.source_name or default_schema_name or self.section or uniq_id(), ) def __str__(self) -> str: @@ -433,14 +494,24 @@ def __str__(self) -> str: info += ":" if self.is_transformer: - info += f"\nThis resource is a transformer and takes data items from {self._pipe.parent.name}" + info += ( + "\nThis resource is a transformer and takes data items from" + f" {self._pipe.parent.name}" + ) else: if self._pipe.is_data_bound: if self.requires_args: head_sig = inspect.signature(self._pipe.gen) # type: ignore - info += f"\nThis resource is parametrized and takes the following arguments {head_sig}. You must call this resource before loading." + info += ( + "\nThis resource is parametrized and takes the following arguments" + f" {head_sig}. You must call this resource before loading." + ) else: - info += "\nIf you want to see the data items in the resource you must iterate it or convert to list ie. list(resource). Note that, like any iterator, you can iterate the resource only once." + info += ( + "\nIf you want to see the data items in the resource you must iterate it or" + " convert to list ie. list(resource). Note that, like any iterator, you can" + " iterate the resource only once." 
+ ) else: info += "\nThis resource is not bound to the data" info += f"\nInstance: info: (data pipe id:{id(self._pipe)}) at {id(self)}" @@ -452,7 +523,9 @@ def _ensure_valid_transformer_resource(name: str, data: Any) -> None: if callable(data): valid_code = DltResource.validate_transformer_generator_function(data) if valid_code != 0: - raise InvalidTransformerGeneratorFunction(name, get_callable_name(data), inspect.signature(data), valid_code) + raise InvalidTransformerGeneratorFunction( + name, get_callable_name(data), inspect.signature(data), valid_code + ) else: raise InvalidTransformerDataTypeGeneratorFunctionRequired(name, data, type(data)) diff --git a/dlt/extract/source.py b/dlt/extract/source.py index 0ff24d1f86..17fc7248a2 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -14,14 +14,24 @@ from dlt.common.schema.typing import TColumnName, TSchemaContract from dlt.common.typing import StrAny, TDataItem from dlt.common.configuration.container import Container -from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, source_state, pipeline_state +from dlt.common.pipeline import ( + PipelineContext, + StateInjectableContext, + SupportsPipelineRun, + source_state, + pipeline_state, +) from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, graph_edges_to_nodes from dlt.extract.typing import TDecompositionStrategy from dlt.extract.pipe import Pipe, ManagedPipeIterator from dlt.extract.hints import DltResourceHints from dlt.extract.resource import DltResource -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, ResourcesNotFoundError, DeletingResourcesNotSupported +from dlt.extract.exceptions import ( + DataItemRequiredForDynamicTableHints, + ResourcesNotFoundError, + DeletingResourcesNotSupported, +) class DltResourceDict(Dict[str, DltResource]): @@ -38,7 +48,7 @@ def __init__(self, source_name: str, source_section: str) -> None: @property def selected(self) -> Dict[str, DltResource]: """Returns a subset of all resources that will be extracted and loaded to the destination.""" - return {k:v for k,v in self.items() if v.selected} + return {k: v for k, v in self.items() if v.selected} @property def extracted(self) -> Dict[str, DltResource]: @@ -55,8 +65,7 @@ def extracted(self) -> Dict[str, DltResource]: except KeyError: # resource for pipe not found: return mock resource mock_template = DltResourceHints.new_table_template( - pipe.name, - write_disposition=resource.write_disposition + pipe.name, write_disposition=resource.write_disposition ) resource = DltResource(pipe, mock_template, False, section=resource.section) resource.source_name = resource.source_name @@ -97,7 +106,9 @@ def select(self, *resource_names: str) -> Dict[str, DltResource]: for name in resource_names: if name not in self: # if any key is missing, display the full info - raise ResourcesNotFoundError(self.source_name, set(self.keys()), set(resource_names)) + raise ResourcesNotFoundError( + self.source_name, set(self.keys()), set(resource_names) + ) # set the selected flags for resource in self.values(): self[resource.name].selected = resource.name in resource_names @@ -131,7 +142,10 @@ def _clone_new_pipes(self, resource_names: Sequence[str]) -> None: def __setitem__(self, resource_name: str, resource: DltResource) -> None: if resource_name != resource.name: - raise ValueError(f"The index name {resource_name} does not correspond to resource name {resource.name}") + raise ValueError( + f"The index name {resource_name} does not 
correspond to resource name" + f" {resource.name}" + ) pipe_id = id(resource._pipe) # make shallow copy of the resource resource = copy(resource) @@ -167,7 +181,10 @@ class DltSource(Iterable[TDataItem]): * You can use a `run` method to load the data with a default instance of dlt pipeline. * You can get source read only state for the currently active Pipeline instance """ - def __init__(self, schema: Schema, section: str, resources: Sequence[DltResource] = None) -> None: + + def __init__( + self, schema: Schema, section: str, resources: Sequence[DltResource] = None + ) -> None: self.section = section """Tells if iterator associated with a source is exhausted""" self._schema = schema @@ -195,7 +212,6 @@ def from_data(cls, schema: Schema, section: str, data: Any) -> Self: def name(self) -> str: return self._schema.name - # TODO: 4 properties below must go somewhere else ie. into RelationalSchema which is Schema + Relational normalizer. @property def max_table_nesting(self) -> int: @@ -228,19 +244,24 @@ def exhausted(self) -> bool: def root_key(self) -> bool: """Enables merging on all resources by propagating root foreign key to child tables. This option is most useful if you plan to change write disposition of a resource to disable/enable merge""" config = RelationalNormalizer.get_normalizer_config(self._schema).get("propagation") - return config is not None and "root" in config and "_dlt_id" in config["root"] and config["root"]["_dlt_id"] == "_dlt_root_id" + return ( + config is not None + and "root" in config + and "_dlt_id" in config["root"] + and config["root"]["_dlt_id"] == "_dlt_root_id" + ) @root_key.setter def root_key(self, value: bool) -> None: if value is True: - RelationalNormalizer.update_normalizer_config(self._schema, - {"propagation": { - "root": { - "_dlt_id": TColumnName("_dlt_root_id") - }}}) + RelationalNormalizer.update_normalizer_config( + self._schema, {"propagation": {"root": {"_dlt_id": TColumnName("_dlt_root_id")}}} + ) else: if self.root_key: - propagation_config = RelationalNormalizer.get_normalizer_config(self._schema)["propagation"] + propagation_config = RelationalNormalizer.get_normalizer_config(self._schema)[ + "propagation" + ] propagation_config["root"].pop("_dlt_id") # type: ignore @property @@ -283,8 +304,8 @@ def with_resources(self, *resource_names: str) -> "DltSource": def decompose(self, strategy: TDecompositionStrategy) -> List["DltSource"]: """Decomposes source into a list of sources with a given strategy. - "none" will return source as is - "scc" will decompose the dag of selected pipes and their parent into strongly connected components + "none" will return source as is + "scc" will decompose the dag of selected pipes and their parent into strongly connected components """ if strategy == "none": return [self] @@ -315,7 +336,9 @@ def add_limit(self, max_items: int) -> "DltSource": # noqa: A003 @property def run(self) -> SupportsPipelineRun: """A convenience method that will call `run` run on the currently active `dlt` pipeline. 
If pipeline instance is not found, one with default settings will be created.""" - self_run: SupportsPipelineRun = makefun.partial(Container()[PipelineContext].pipeline().run, *(), data=self) + self_run: SupportsPipelineRun = makefun.partial( + Container()[PipelineContext].pipeline().run, *(), data=self + ) return self_run @property @@ -332,9 +355,9 @@ def clone(self) -> "DltSource": def __iter__(self) -> Iterator[TDataItem]: """Opens iterator that yields the data items from all the resources within the source in the same order as in Pipeline class. - A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. + A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. - A source config section is injected to allow secrets/config injection as during regular extraction. + A source config section is injected to allow secrets/config injection as during regular extraction. """ # use the same state dict when opening iterator and when iterator is iterated mock_state, _ = pipeline_state(Container(), {}) @@ -354,14 +377,16 @@ def _get_config_section_context(self) -> ConfigSectionContext: return ConfigSectionContext( pipeline_name=pipeline_name, sections=(known_sections.SOURCES, self.section, self.name), - source_state_key=self.name + source_state_key=self.name, ) def __getattr__(self, resource_name: str) -> DltResource: try: return self._resources[resource_name] except KeyError: - raise AttributeError(f"Resource with name {resource_name} not found in source {self.name}") + raise AttributeError( + f"Resource with name {resource_name} not found in source {self.name}" + ) def __setattr__(self, name: str, value: Any) -> None: if isinstance(value, DltResource): @@ -370,17 +395,29 @@ def __setattr__(self, name: str, value: Any) -> None: super().__setattr__(name, value) def __str__(self) -> str: - info = f"DltSource {self.name} section {self.section} contains {len(self.resources)} resource(s) of which {len(self.selected_resources)} are selected" + info = ( + f"DltSource {self.name} section {self.section} contains" + f" {len(self.resources)} resource(s) of which {len(self.selected_resources)} are" + " selected" + ) for r in self.resources.values(): selected_info = "selected" if r.selected else "not selected" if r.is_transformer: - info += f"\ntransformer {r.name} is {selected_info} and takes data from {r._pipe.parent.name}" + info += ( + f"\ntransformer {r.name} is {selected_info} and takes data from" + f" {r._pipe.parent.name}" + ) else: info += f"\nresource {r.name} is {selected_info}" if self.exhausted: - info += "\nSource is already iterated and cannot be used again ie. to display or load data." + info += ( + "\nSource is already iterated and cannot be used again ie. to display or load data." + ) else: - info += "\nIf you want to see the data items in this source you must iterate it or convert to list ie. list(source)." + info += ( + "\nIf you want to see the data items in this source you must iterate it or convert" + " to list ie. list(source)." + ) info += " Note that, like any iterator, you can iterate the source only once." 
info += f"\ninstance id: {id(self)}" return info diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py index ddda064aa4..f31bb0f702 100644 --- a/dlt/extract/storage.py +++ b/dlt/extract/storage.py @@ -6,19 +6,23 @@ from dlt.common.utils import uniq_id from dlt.common.typing import TDataItems from dlt.common.schema.typing import TTableSchemaColumns -from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage +from dlt.common.storages import ( + NormalizeStorageConfiguration, + NormalizeStorage, + DataItemStorage, + FileStorage, +) class ExtractorItemStorage(DataItemStorage): load_file_type: TLoaderFileFormat - def __init__(self, storage: FileStorage, extract_folder: str="extract") -> None: + def __init__(self, storage: FileStorage, extract_folder: str = "extract") -> None: # data item storage with jsonl with pua encoding super().__init__(self.load_file_type) self.extract_folder = extract_folder self.storage = storage - def _get_data_item_path_template(self, load_id: str, schema_name: str, table_name: str) -> str: template = NormalizeStorage.build_extracted_file_stem(schema_name, table_name, "%s") return self.storage.make_full_path(os.path.join(self._get_extract_path(load_id), template)) @@ -39,11 +43,12 @@ class ExtractorStorage(NormalizeStorage): EXTRACT_FOLDER: ClassVar[str] = "extract" """Wrapper around multiple extractor storages with different file formats""" + def __init__(self, C: NormalizeStorageConfiguration) -> None: super().__init__(True, C) self._item_storages: Dict[TLoaderFileFormat, ExtractorItemStorage] = { "puae-jsonl": JsonLExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), - "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER) + "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), } def _get_extract_path(self, extract_id: str) -> str: @@ -74,5 +79,15 @@ def commit_extract_files(self, extract_id: str, with_delete: bool = True) -> Non if with_delete: self.storage.delete_folder(extract_path, recursively=True) - def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: - self.get_storage(file_format).write_data_item(load_id, schema_name, table_name, item, columns) + def write_data_item( + self, + file_format: TLoaderFileFormat, + load_id: str, + schema_name: str, + table_name: str, + item: TDataItems, + columns: TTableSchemaColumns, + ) -> None: + self.get_storage(file_format).write_data_item( + load_id, schema_name, table_name, item, columns + ) diff --git a/dlt/extract/typing.py b/dlt/extract/typing.py index 646267c539..e0096a255f 100644 --- a/dlt/extract/typing.py +++ b/dlt/extract/typing.py @@ -1,6 +1,17 @@ import inspect from abc import ABC, abstractmethod -from typing import Any, Callable, Generic, Iterator, Literal, Optional, Protocol, TypeVar, Union, Awaitable +from typing import ( + Any, + Callable, + Generic, + Iterator, + Literal, + Optional, + Protocol, + TypeVar, + Union, + Awaitable, +) from dlt.common.typing import TAny, TDataItem, TDataItems @@ -37,10 +48,12 @@ def __init__(self, table_name: str) -> None: class SupportsPipe(Protocol): """A protocol with the core Pipe properties and operations""" + name: str """Pipe name which is inherited by a resource""" parent: "SupportsPipe" """A parent of the current pipe""" + @property def has_parent(self) -> bool: """Checks if pipe is connected to parent pipe from which it takes data 
items. Connected pipes are created from transformer resources""" @@ -51,6 +64,7 @@ def has_parent(self) -> bool: ItemTransformFunctionNoMeta = Callable[[TDataItem], TAny] ItemTransformFunc = Union[ItemTransformFunctionWithMeta[TAny], ItemTransformFunctionNoMeta[TAny]] + class ItemTransform(ABC, Generic[TAny]): _f_meta: ItemTransformFunctionWithMeta[TAny] = None _f: ItemTransformFunctionNoMeta[TAny] = None @@ -114,7 +128,7 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: class YieldMapItem(ItemTransform[Iterator[TDataItem]]): - # mypy needs those to type correctly + # mypy needs those to type correctly _f_meta: ItemTransformFunctionWithMeta[TDataItem] _f: ItemTransformFunctionNoMeta[TDataItem] @@ -138,6 +152,7 @@ class ValidateItem(ItemTransform[TDataItem]): Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`. See `PydanticValidator` for possible implementation. """ + table_name: str def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]: diff --git a/dlt/extract/utils.py b/dlt/extract/utils.py index 3bd9f56a74..119b28b835 100644 --- a/dlt/extract/utils.py +++ b/dlt/extract/utils.py @@ -8,7 +8,10 @@ from dlt.common.schema.typing import TColumnNames, TAnySchemaColumns, TTableSchemaColumns from dlt.common.typing import AnyFun, DictStrAny, TDataItem, TDataItems from dlt.common.utils import get_callable_name -from dlt.extract.exceptions import InvalidResourceDataTypeFunctionNotAGenerator, InvalidStepFunctionArguments +from dlt.extract.exceptions import ( + InvalidResourceDataTypeFunctionNotAGenerator, + InvalidStepFunctionArguments, +) from dlt.extract.typing import TTableHintTemplate, TDataItem, TFunHintTemplate, SupportsPipe @@ -18,7 +21,9 @@ pydantic = None -def resolve_column_value(column_hint: TTableHintTemplate[TColumnNames], item: TDataItem) -> Union[Any, List[Any]]: +def resolve_column_value( + column_hint: TTableHintTemplate[TColumnNames], item: TDataItem +) -> Union[Any, List[Any]]: """Extract values from the data item given a column hint. Returns either a single value or list of values when hint is a composite. """ @@ -42,7 +47,7 @@ def ensure_table_schema_columns(columns: TAnySchemaColumns) -> TTableSchemaColum return columns elif isinstance(columns, Sequence): # Assume list of columns - return {col['name']: col for col in columns} + return {col["name"]: col for col in columns} elif pydantic is not None and ( isinstance(columns, pydantic.BaseModel) or issubclass(columns, pydantic.BaseModel) ): @@ -51,13 +56,19 @@ def ensure_table_schema_columns(columns: TAnySchemaColumns) -> TTableSchemaColum raise ValueError(f"Unsupported columns type: {type(columns)}") -def ensure_table_schema_columns_hint(columns: TTableHintTemplate[TAnySchemaColumns]) -> TTableHintTemplate[TTableSchemaColumns]: +def ensure_table_schema_columns_hint( + columns: TTableHintTemplate[TAnySchemaColumns], +) -> TTableHintTemplate[TTableSchemaColumns]: """Convert column schema hint to a hint returning `TTableSchemaColumns`. A callable hint is wrapped in another function which converts the original result. 
""" if callable(columns) and not isinstance(columns, type): + def wrapper(item: TDataItem) -> TTableSchemaColumns: - return ensure_table_schema_columns(cast(TFunHintTemplate[TAnySchemaColumns], columns)(item)) + return ensure_table_schema_columns( + cast(TFunHintTemplate[TAnySchemaColumns], columns)(item) + ) + return wrapper return ensure_table_schema_columns(columns) @@ -70,10 +81,12 @@ def reset_pipe_state(pipe: SupportsPipe, source_state_: Optional[DictStrAny] = N reset_resource_state(pipe.name, source_state_) -def simulate_func_call(f: Union[Any, AnyFun], args_to_skip: int, *args: Any, **kwargs: Any) -> Tuple[inspect.Signature, inspect.Signature, inspect.BoundArguments]: +def simulate_func_call( + f: Union[Any, AnyFun], args_to_skip: int, *args: Any, **kwargs: Any +) -> Tuple[inspect.Signature, inspect.Signature, inspect.BoundArguments]: """Simulates a call to a resource or transformer function before it will be wrapped for later execution in the pipe - Returns a tuple with a `f` signature, modified signature in case of transformers and bound arguments + Returns a tuple with a `f` signature, modified signature in case of transformers and bound arguments """ if not callable(f): # just provoke a call to raise default exception @@ -100,11 +113,15 @@ def check_compat_transformer(name: str, f: AnyFun, sig: inspect.Signature) -> in meta_arg = next((p for p in sig.parameters.values() if p.name == "meta"), None) if meta_arg is not None: if meta_arg.kind not in (meta_arg.KEYWORD_ONLY, meta_arg.POSITIONAL_OR_KEYWORD): - raise InvalidStepFunctionArguments(name, callable_name, sig, "'meta' cannot be pos only argument '") + raise InvalidStepFunctionArguments( + name, callable_name, sig, "'meta' cannot be pos only argument '" + ) return meta_arg -def wrap_compat_transformer(name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any) -> AnyFun: +def wrap_compat_transformer( + name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any +) -> AnyFun: """Creates a compatible wrapper over transformer function. A pure transformer function expects data item in first argument and one keyword argument called `meta`""" check_compat_transformer(name, f, sig) if len(sig.parameters) == 2 and "meta" in sig.parameters: @@ -121,7 +138,9 @@ def _tx_partial(item: TDataItems, meta: Any = None) -> Any: return makefun.wraps(f, new_sig=inspect.signature(_tx_partial))(_tx_partial) # type: ignore -def wrap_resource_gen(name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any) -> AnyFun: +def wrap_resource_gen( + name: str, f: AnyFun, sig: inspect.Signature, *args: Any, **kwargs: Any +) -> AnyFun: """Wraps a generator or generator function so it is evaluated on extraction""" if inspect.isgeneratorfunction(inspect.unwrap(f)) or inspect.isgenerator(f): # always wrap generators and generator functions. evaluate only at runtime! 
diff --git a/dlt/extract/validation.py b/dlt/extract/validation.py index 8bd6c7afb9..72b70c5661 100644 --- a/dlt/extract/validation.py +++ b/dlt/extract/validation.py @@ -17,7 +17,12 @@ class PydanticValidator(ValidateItem, Generic[_TPydanticModel]): model: Type[_TPydanticModel] - def __init__(self, model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMode, data_mode: TSchemaEvolutionMode) -> None: + def __init__( + self, + model: Type[_TPydanticModel], + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode, + ) -> None: from dlt.common.libs.pydantic import apply_schema_contract_to_model, create_list_model self.column_mode: TSchemaEvolutionMode = column_mode @@ -25,7 +30,9 @@ def __init__(self, model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMo self.model = apply_schema_contract_to_model(model, column_mode, data_mode) self.list_model = create_list_model(self.model, data_mode) - def __call__(self, item: TDataItems, meta: Any = None) -> Union[_TPydanticModel, List[_TPydanticModel]]: + def __call__( + self, item: TDataItems, meta: Any = None + ) -> Union[_TPydanticModel, List[_TPydanticModel]]: """Validate a data item against the pydantic model""" if item is None: return None @@ -33,7 +40,9 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Union[_TPydanticModel, from dlt.common.libs.pydantic import validate_item, validate_items if isinstance(item, list): - return validate_items(self.table_name, self.list_model, item, self.column_mode, self.data_mode) + return validate_items( + self.table_name, self.list_model, item, self.column_mode, self.data_mode + ) return validate_item(self.table_name, self.model, item, self.column_mode, self.data_mode) def __str__(self, *args: Any, **kwargs: Any) -> str: @@ -42,28 +51,38 @@ def __str__(self, *args: Any, **kwargs: Any) -> str: def create_item_validator( columns: TTableHintTemplate[TAnySchemaColumns], - schema_contract: TTableHintTemplate[TSchemaContract] = None + schema_contract: TTableHintTemplate[TSchemaContract] = None, ) -> Tuple[Optional[ValidateItem], TTableHintTemplate[TSchemaContract]]: """Creates item validator for a `columns` definition and a `schema_contract` - Returns a tuple (validator, schema contract). If validator could not be created, returns None at first position. - If schema_contract was not specified a default schema contract for given validator will be returned + Returns a tuple (validator, schema contract). If validator could not be created, returns None at first position. 
+ If schema_contract was not specified a default schema contract for given validator will be returned """ - if PydanticBaseModel is not None and isinstance(columns, type) and issubclass(columns, PydanticBaseModel): - assert not callable(schema_contract), "schema_contract cannot be dynamic for Pydantic item validator" + if ( + PydanticBaseModel is not None + and isinstance(columns, type) + and issubclass(columns, PydanticBaseModel) + ): + assert not callable( + schema_contract + ), "schema_contract cannot be dynamic for Pydantic item validator" from dlt.common.libs.pydantic import extra_to_column_mode, get_extra_from_model + # freeze the columns if we have a fully defined table and no other explicit contract expanded_schema_contract = Schema.expand_schema_contract_settings( schema_contract, # corresponds to default Pydantic behavior - default={"tables": "evolve", "columns": extra_to_column_mode(get_extra_from_model(columns)), "data_type": "freeze"} + default={ + "tables": "evolve", + "columns": extra_to_column_mode(get_extra_from_model(columns)), + "data_type": "freeze", + }, ) - return (PydanticValidator( - columns, - expanded_schema_contract["columns"], - expanded_schema_contract["data_type"] + return ( + PydanticValidator( + columns, expanded_schema_contract["columns"], expanded_schema_contract["data_type"] ), - schema_contract or expanded_schema_contract + schema_contract or expanded_schema_contract, ) return None, schema_contract diff --git a/dlt/extract/wrappers.py b/dlt/extract/wrappers.py index e8e295f245..7ffb6b4fc6 100644 --- a/dlt/extract/wrappers.py +++ b/dlt/extract/wrappers.py @@ -22,4 +22,4 @@ def wrap_additional_type(data: Any) -> Any: if isinstance(data, (PandaFrame, ArrowTable, ArrowRecords)): return [data] - return data \ No newline at end of file + return data diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index e0329d583c..c72118cfc9 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -1,7 +1,13 @@ import os from tempfile import gettempdir from typing import Any, Callable, List, Literal, Optional, Sequence, Tuple -from tenacity import retry_if_exception, wait_exponential, stop_after_attempt, Retrying, RetryCallState +from tenacity import ( + retry_if_exception, + wait_exponential, + stop_after_attempt, + Retrying, + RetryCallState, +) from dlt.common import pendulum from dlt.common.exceptions import MissingDependencyException @@ -33,7 +39,9 @@ DEFAULT_RETRY_NO_RETRY = Retrying(stop=stop_after_attempt(1), reraise=True) -DEFAULT_RETRY_BACKOFF = Retrying(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1.5, min=4, max=10), reraise=True) +DEFAULT_RETRY_BACKOFF = Retrying( + stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1.5, min=4, max=10), reraise=True +) class PipelineTasksGroup(TaskGroup): @@ -50,13 +58,13 @@ def __init__( log_progress_period: float = 30.0, buffer_max_items: int = 1000, retry_policy: Retrying = DEFAULT_RETRY_NO_RETRY, - retry_pipeline_steps: Sequence[TPipelineStep] = ("load", ), + retry_pipeline_steps: Sequence[TPipelineStep] = ("load",), fail_task_if_any_job_failed: bool = True, abort_task_if_any_job_failed: bool = False, wipe_local_data: bool = True, save_load_info: bool = False, save_trace_info: bool = False, - **kwargs: Any + **kwargs: Any, ) -> None: """Creates a task group to which you can add pipeline runs @@ -103,7 +111,7 @@ def __init__( self.save_trace_info = save_trace_info # reload providers so config.toml in dags folder is included - dags_folder = 
conf.get('core', 'dags_folder') + dags_folder = conf.get("core", "dags_folder") # set the dlt project folder to dags os.environ["DLT_PROJECT_DIR"] = dags_folder @@ -129,7 +137,7 @@ def add_run( decompose: Literal["none", "serialize"] = "none", table_name: str = None, write_disposition: TWriteDisposition = None, - **kwargs: Any + **kwargs: Any, ) -> List[PythonOperator]: """Creates a task or a group of tasks to run `data` with `pipeline` @@ -153,7 +161,10 @@ def add_run( # make sure that pipeline was created after dag was initialized if not pipeline.pipelines_dir.startswith(os.environ["DLT_DATA_DIR"]): - raise ValueError("Please create your Pipeline instance after AirflowTasks are created. The dlt pipelines directory is not set correctly") + raise ValueError( + "Please create your Pipeline instance after AirflowTasks are created. The dlt" + " pipelines directory is not set correctly" + ) def task_name(pipeline: Pipeline, data: Any) -> str: task_name = pipeline.pipeline_name @@ -165,11 +176,9 @@ def task_name(pipeline: Pipeline, data: Any) -> str: return task_name with self: - # use factory function to make test, in order to parametrize it. passing arguments to task function (_run) is serializing them and # running template engine on them def make_task(pipeline: Pipeline, data: Any) -> PythonOperator: - def _run() -> None: # activate pipeline pipeline.activate() @@ -182,7 +191,10 @@ def _run() -> None: logger.LOGGER = ti.log # set global number of buffered items - if dlt.config.get("data_writer.buffer_max_items") is None and self.buffer_max_items > 0: + if ( + dlt.config.get("data_writer.buffer_max_items") is None + and self.buffer_max_items > 0 + ): dlt.config["data_writer.buffer_max_items"] = self.buffer_max_items logger.info(f"Set data_writer.buffer_max_items to {self.buffer_max_items}") @@ -192,24 +204,36 @@ def _run() -> None: logger.info("Set load.abort_task_if_any_job_failed to True") if self.log_progress_period > 0 and task_pipeline.collector == NULL_COLLECTOR: - task_pipeline.collector = log(log_period=self.log_progress_period, logger=logger.LOGGER) + task_pipeline.collector = log( + log_period=self.log_progress_period, logger=logger.LOGGER + ) logger.info(f"Enabled log progress with period {self.log_progress_period}") logger.info(f"Pipeline data in {task_pipeline.working_dir}") def log_after_attempt(retry_state: RetryCallState) -> None: if not retry_state.retry_object.stop(retry_state): - logger.error("Retrying pipeline run due to exception: %s", retry_state.outcome.exception()) + logger.error( + "Retrying pipeline run due to exception: %s", + retry_state.outcome.exception(), + ) try: # retry with given policy on selected pipeline steps for attempt in self.retry_policy.copy( - retry=retry_if_exception(retry_load(retry_on_pipeline_steps=self.retry_pipeline_steps)), - after=log_after_attempt + retry=retry_if_exception( + retry_load(retry_on_pipeline_steps=self.retry_pipeline_steps) + ), + after=log_after_attempt, ): with attempt: - logger.info("Running the pipeline, attempt=%s" % attempt.retry_state.attempt_number) - load_info = task_pipeline.run(data, table_name=table_name, write_disposition=write_disposition) + logger.info( + "Running the pipeline, attempt=%s" + % attempt.retry_state.attempt_number + ) + load_info = task_pipeline.run( + data, table_name=table_name, write_disposition=write_disposition + ) logger.info(str(load_info)) # save load and trace if self.save_load_info: @@ -217,7 +241,9 @@ def log_after_attempt(retry_state: RetryCallState) -> None: 
task_pipeline.run([load_info], table_name="_load_info") if self.save_trace_info: logger.info("Saving the trace in the destination") - task_pipeline.run([task_pipeline.last_trace], table_name="_trace") + task_pipeline.run( + [task_pipeline.last_trace], table_name="_trace" + ) # raise on failed jobs if requested if self.fail_task_if_any_job_failed: load_info.raise_on_failed_jobs() @@ -228,9 +254,7 @@ def log_after_attempt(retry_state: RetryCallState) -> None: task_pipeline._wipe_working_folder() return PythonOperator( - task_id=task_name(pipeline, data), - python_callable=_run, - **kwargs + task_id=task_name(pipeline, data), python_callable=_run, **kwargs ) if decompose == "none": @@ -263,6 +287,7 @@ def airflow_get_execution_dates() -> Tuple[pendulum.DateTime, Optional[pendulum. # prefer logging to task logger try: from airflow.operators.python import get_current_context # noqa + context = get_current_context() return context["data_interval_start"], context["data_interval_end"] except Exception: diff --git a/dlt/helpers/dbt/__init__.py b/dlt/helpers/dbt/__init__.py index c6107b2873..992c66dcf2 100644 --- a/dlt/helpers/dbt/__init__.py +++ b/dlt/helpers/dbt/__init__.py @@ -27,14 +27,16 @@ def _default_profile_name(credentials: DestinationClientDwhConfiguration) -> str if isinstance(credentials.credentials, CredentialsWithDefault): if credentials.credentials.has_default_credentials(): profile_name += "_default" - elif profile_name == 'snowflake': - if getattr(credentials.credentials, 'private_key', None): + elif profile_name == "snowflake": + if getattr(credentials.credentials, "private_key", None): # snowflake with private key is a separate profile - profile_name += '_pkey' + profile_name += "_pkey" return profile_name -def _create_dbt_deps(destination_names: List[str], dbt_version: str = DEFAULT_DBT_VERSION) -> List[str]: +def _create_dbt_deps( + destination_names: List[str], dbt_version: str = DEFAULT_DBT_VERSION +) -> List[str]: if dbt_version: # if parses as version use "==" operator with contextlib.suppress(ValueError): @@ -56,19 +58,24 @@ def _create_dbt_deps(destination_names: List[str], dbt_version: str = DEFAULT_DB additional_deps: List[str] = [] if "duckdb" in destination_names or "motherduck" in destination_names: from importlib.metadata import version as pkg_version + # force locally installed duckdb additional_deps = ["duckdb" + "==" + pkg_version("duckdb")] return all_packages + additional_deps + [dlt_requirement] -def restore_venv(venv_dir: str, destination_names: List[str], dbt_version: str = DEFAULT_DBT_VERSION) -> Venv: +def restore_venv( + venv_dir: str, destination_names: List[str], dbt_version: str = DEFAULT_DBT_VERSION +) -> Venv: venv = Venv.restore(venv_dir) venv.add_dependencies(_create_dbt_deps(destination_names, dbt_version)) return venv -def create_venv(venv_dir: str, destination_names: List[str], dbt_version: str = DEFAULT_DBT_VERSION) -> Venv: +def create_venv( + venv_dir: str, destination_names: List[str], dbt_version: str = DEFAULT_DBT_VERSION +) -> Venv: return Venv.create(venv_dir, _create_dbt_deps(destination_names, dbt_version)) @@ -79,7 +86,7 @@ def package_runner( package_location: str, package_repository_branch: str = None, package_repository_ssh_key: TSecretValue = TSecretValue(""), # noqa - auto_full_refresh_when_out_of_sync: bool = None + auto_full_refresh_when_out_of_sync: bool = None, ) -> DBTPackageRunner: default_profile_name = _default_profile_name(destination_configuration) return create_runner( @@ -90,5 +97,5 @@ def package_runner( 
package_repository_branch=package_repository_branch, package_repository_ssh_key=package_repository_ssh_key, package_profile_name=default_profile_name, - auto_full_refresh_when_out_of_sync=auto_full_refresh_when_out_of_sync + auto_full_refresh_when_out_of_sync=auto_full_refresh_when_out_of_sync, ) diff --git a/dlt/helpers/dbt/configuration.py b/dlt/helpers/dbt/configuration.py index d21266196e..4cd3f3a0f4 100644 --- a/dlt/helpers/dbt/configuration.py +++ b/dlt/helpers/dbt/configuration.py @@ -10,7 +10,9 @@ class DBTRunnerConfiguration(BaseConfiguration): package_location: str = None package_repository_branch: Optional[str] = None - package_repository_ssh_key: TSecretValue = TSecretValue("") # the default is empty value which will disable custom SSH KEY + package_repository_ssh_key: TSecretValue = TSecretValue( + "" + ) # the default is empty value which will disable custom SSH KEY package_profiles_dir: Optional[str] = None package_profile_name: Optional[str] = None auto_full_refresh_when_out_of_sync: bool = True diff --git a/dlt/helpers/dbt/dbt_utils.py b/dlt/helpers/dbt/dbt_utils.py index 06154eb58c..b4097e4434 100644 --- a/dlt/helpers/dbt/dbt_utils.py +++ b/dlt/helpers/dbt/dbt_utils.py @@ -7,12 +7,17 @@ from dlt.common.exceptions import MissingDependencyException from dlt.common.typing import StrAny -from dlt.helpers.dbt.exceptions import DBTProcessingError, DBTNodeResult, IncrementalSchemaOutOfSyncError +from dlt.helpers.dbt.exceptions import ( + DBTProcessingError, + DBTNodeResult, + IncrementalSchemaOutOfSyncError, +) try: # block disabling root logger import logbook.compat - logbook.compat.redirect_logging = lambda : None + + logbook.compat.redirect_logging = lambda: None # can only import DBT after redirect is disabled # https://stackoverflow.com/questions/48619517/call-a-click-command-from-code @@ -78,9 +83,12 @@ def set_path_wrapper(self: dbt.logger.LogManager, path: str) -> None: def is_incremental_schema_out_of_sync_error(error: Any) -> bool: - def _check_single_item(error_: dbt_results.RunResult) -> bool: - return error_.status == dbt_results.RunStatus.Error and "The source and target schemas on this incremental model are out of sync" in error_.message + return ( + error_.status == dbt_results.RunStatus.Error + and "The source and target schemas on this incremental model are out of sync" + in error_.message + ) if isinstance(error, dbt_results.RunResult): return _check_single_item(error) @@ -102,18 +110,20 @@ def parse_dbt_execution_results(results: Any) -> Sequence[DBTNodeResult]: return None return [ - DBTNodeResult(res.node.name, res.message, res.execution_time, str(res.status)) for res in results if isinstance(res, dbt_results.NodeResult) - ] + DBTNodeResult(res.node.name, res.message, res.execution_time, str(res.status)) + for res in results + if isinstance(res, dbt_results.NodeResult) + ] def run_dbt_command( - package_path: str, - command: str, - profiles_dir: str, - profile_name: Optional[str] = None, - global_args: Sequence[str] = None, - command_args: Sequence[str] = None, - package_vars: StrAny = None + package_path: str, + command: str, + profiles_dir: str, + profile_name: Optional[str] = None, + global_args: Sequence[str] = None, + command_args: Sequence[str] = None, + package_vars: StrAny = None, ) -> Union[Sequence[DBTNodeResult], dbt_results.ExecutionResult]: args = ["--profiles-dir", profiles_dir] # add profile name if provided @@ -133,7 +143,7 @@ def run_dbt_command( success: bool = None # dbt uses logbook which does not run on python 10. 
below is a hack that allows that warnings.filterwarnings("ignore", category=DeprecationWarning, module="logbook") - runner_args = (global_args or []) + [command] + args # type: ignore + runner_args = (global_args or []) + [command] + args # type: ignore with dbt.logger.log_manager.applicationbound(): try: @@ -177,8 +187,16 @@ def init_logging_and_run_dbt_command( profiles_dir: str, profile_name: Optional[str] = None, command_args: Sequence[str] = None, - package_vars: StrAny = None + package_vars: StrAny = None, ) -> Union[Sequence[DBTNodeResult], dbt_results.ExecutionResult]: # initialize dbt logging, returns global parameters to dbt command dbt_global_args = initialize_dbt_logging(log_level, is_json_logging) - return run_dbt_command(package_path, command, profiles_dir, profile_name, dbt_global_args, command_args, package_vars) + return run_dbt_command( + package_path, + command, + profiles_dir, + profile_name, + dbt_global_args, + command_args, + package_vars, + ) diff --git a/dlt/helpers/dbt/exceptions.py b/dlt/helpers/dbt/exceptions.py index 3a9d6f9c80..545b01868a 100644 --- a/dlt/helpers/dbt/exceptions.py +++ b/dlt/helpers/dbt/exceptions.py @@ -23,7 +23,9 @@ class DBTNodeResult(NamedTuple): class DBTProcessingError(DBTRunnerException): - def __init__(self, command: str, run_results: Sequence[DBTNodeResult], dbt_results: Any) -> None: + def __init__( + self, command: str, run_results: Sequence[DBTNodeResult], dbt_results: Any + ) -> None: self.command = command self.run_results = run_results # the results from DBT may be anything diff --git a/dlt/helpers/dbt/runner.py b/dlt/helpers/dbt/runner.py index 381260536c..388b81b2ee 100644 --- a/dlt/helpers/dbt/runner.py +++ b/dlt/helpers/dbt/runner.py @@ -17,7 +17,12 @@ from dlt.common.utils import with_custom_environ from dlt.helpers.dbt.configuration import DBTRunnerConfiguration -from dlt.helpers.dbt.exceptions import IncrementalSchemaOutOfSyncError, PrerequisitesException, DBTNodeResult, DBTProcessingError +from dlt.helpers.dbt.exceptions import ( + IncrementalSchemaOutOfSyncError, + PrerequisitesException, + DBTNodeResult, + DBTProcessingError, +) from dlt.common.runtime.telemetry import with_telemetry @@ -31,12 +36,13 @@ class DBTPackageRunner: passed via DBTRunnerConfiguration instance """ - def __init__(self, + def __init__( + self, venv: Venv, credentials: DestinationClientDwhConfiguration, working_dir: str, source_dataset_name: str, - config: DBTRunnerConfiguration + config: DBTRunnerConfiguration, ) -> None: self.venv = venv self.credentials = credentials @@ -62,7 +68,9 @@ def _setup_location(self) -> None: self.cloned_package_name = url.name self.package_path = os.path.join(self.working_dir, self.cloned_package_name) - def _get_package_vars(self, additional_vars: StrAny = None, destination_dataset_name: str = None) -> StrAny: + def _get_package_vars( + self, additional_vars: StrAny = None, destination_dataset_name: str = None + ) -> StrAny: if self.config.package_additional_vars: package_vars = dict(self.config.package_additional_vars) else: @@ -82,7 +90,9 @@ def _log_dbt_run_results(self, results: Sequence[DBTNodeResult]) -> None: if res.status == "error": logger.error(f"Model {res.model_name} error! 
Error: {res.message}") else: - logger.info(f"Model {res.model_name} {res.status} in {res.time} seconds with {res.message}") + logger.info( + f"Model {res.model_name} {res.status} in {res.time} seconds with {res.message}" + ) def ensure_newest_package(self) -> None: """Clones or brings the dbt package at `package_location` up to date.""" @@ -90,19 +100,37 @@ def ensure_newest_package(self) -> None: with git_custom_key_command(self.config.package_repository_ssh_key) as ssh_command: try: - ensure_remote_head(self.package_path, branch=self.config.package_repository_branch, with_git_command=ssh_command) + ensure_remote_head( + self.package_path, + branch=self.config.package_repository_branch, + with_git_command=ssh_command, + ) except GitError as err: # cleanup package folder logger.info(f"Package will be cloned due to {type(err).__name__}:{str(err)}") - logger.info(f"Will clone {self.config.package_location} head {self.config.package_repository_branch} into {self.package_path}") - force_clone_repo(self.config.package_location, self.repo_storage, self.cloned_package_name, self.config.package_repository_branch, with_git_command=ssh_command) + logger.info( + f"Will clone {self.config.package_location} head" + f" {self.config.package_repository_branch} into {self.package_path}" + ) + force_clone_repo( + self.config.package_location, + self.repo_storage, + self.cloned_package_name, + self.config.package_repository_branch, + with_git_command=ssh_command, + ) @with_custom_environ - def _run_dbt_command(self, command: str, command_args: Sequence[str] = None, package_vars: StrAny = None) -> Sequence[DBTNodeResult]: - logger.info(f"Exec dbt command: {command} {command_args} {package_vars} on profile {self.config.package_profile_name}") + def _run_dbt_command( + self, command: str, command_args: Sequence[str] = None, package_vars: StrAny = None + ) -> Sequence[DBTNodeResult]: + logger.info( + f"Exec dbt command: {command} {command_args} {package_vars} on profile" + f" {self.config.package_profile_name}" + ) # write credentials to environ to pass them to dbt, add DLT__ prefix if self.credentials: - add_config_to_env(self.credentials, ("dlt", )) + add_config_to_env(self.credentials, ("dlt",)) args = [ self.config.runtime.log_level, is_json_logging(self.config.runtime.log_format), @@ -111,7 +139,7 @@ def _run_dbt_command(self, command: str, command_args: Sequence[str] = None, pac self.config.package_profiles_dir, self.config.package_profile_name, command_args, - package_vars + package_vars, ] script = f""" from functools import partial @@ -134,7 +162,12 @@ def _run_dbt_command(self, command: str, command_args: Sequence[str] = None, pac print(cpe.stderr) raise - def run(self, cmd_params: Sequence[str] = ("--fail-fast", ), additional_vars: StrAny = None, destination_dataset_name: str = None) -> Sequence[DBTNodeResult]: + def run( + self, + cmd_params: Sequence[str] = ("--fail-fast",), + additional_vars: StrAny = None, + destination_dataset_name: str = None, + ) -> Sequence[DBTNodeResult]: """Runs `dbt` package Executes `dbt run` on previously cloned package. @@ -151,12 +184,15 @@ def run(self, cmd_params: Sequence[str] = ("--fail-fast", ), additional_vars: St DBTProcessingError: `run` command failed. 
Contains a list of models with their execution statuses and error messages """ return self._run_dbt_command( - "run", - cmd_params, - self._get_package_vars(additional_vars, destination_dataset_name) + "run", cmd_params, self._get_package_vars(additional_vars, destination_dataset_name) ) - def test(self, cmd_params: Sequence[str] = None, additional_vars: StrAny = None, destination_dataset_name: str = None) -> Sequence[DBTNodeResult]: + def test( + self, + cmd_params: Sequence[str] = None, + additional_vars: StrAny = None, + destination_dataset_name: str = None, + ) -> Sequence[DBTNodeResult]: """Tests `dbt` package Executes `dbt test` on previously cloned package. @@ -173,12 +209,12 @@ def test(self, cmd_params: Sequence[str] = None, additional_vars: StrAny = None, DBTProcessingError: `test` command failed. Contains a list of models with their execution statuses and error messages """ return self._run_dbt_command( - "test", - cmd_params, - self._get_package_vars(additional_vars, destination_dataset_name) + "test", cmd_params, self._get_package_vars(additional_vars, destination_dataset_name) ) - def _run_db_steps(self, run_params: Sequence[str], package_vars: StrAny, source_tests_selector: str) -> Sequence[DBTNodeResult]: + def _run_db_steps( + self, run_params: Sequence[str], package_vars: StrAny, source_tests_selector: str + ) -> Sequence[DBTNodeResult]: if self.repo_storage: # make sure we use package from the remote head self.ensure_newest_package() @@ -209,8 +245,9 @@ def _run_db_steps(self, run_params: Sequence[str], package_vars: StrAny, source_ else: raise - def run_all(self, - run_params: Sequence[str] = ("--fail-fast", ), + def run_all( + self, + run_params: Sequence[str] = ("--fail-fast",), additional_vars: StrAny = None, source_tests_selector: str = None, destination_dataset_name: str = None, @@ -244,7 +281,7 @@ def run_all(self, results = self._run_db_steps( run_params, self._get_package_vars(additional_vars, destination_dataset_name), - source_tests_selector + source_tests_selector, ) self._log_dbt_run_results(results) return results @@ -270,7 +307,7 @@ def create_runner( package_profiles_dir: str = None, package_profile_name: str = None, auto_full_refresh_when_out_of_sync: bool = None, - config: DBTRunnerConfiguration = None + config: DBTRunnerConfiguration = None, ) -> DBTPackageRunner: """Creates a Python wrapper over `dbt` package present at specified location, that allows to control it (ie. run and test) from Python code. 
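The dbt helper signatures reflowed above (`create_venv`, `package_runner`, `DBTPackageRunner.run_all`) keep their call shape unchanged. As a rough sketch of how these pieces fit together, note that the destination configuration, working directory and package URL below are placeholders rather than values taken from this patch, a caller might wire them up like this:

    from dlt.helpers.dbt import create_venv, package_runner

    # Placeholder: a DestinationClientDwhConfiguration for the target warehouse,
    # normally obtained from an existing dlt destination/pipeline setup.
    destination_config = ...

    # create a virtualenv with the dbt dependencies for the chosen destination(s)
    venv = create_venv("_dbt_venv", ["duckdb"])

    runner = package_runner(
        venv,
        destination_config,
        working_dir="_dbt_run",  # where the dbt package gets cloned
        package_location="https://github.com/org/example_dbt_package.git",  # hypothetical repo
    )

    # runs the package with the default ("--fail-fast",) params and logs per-model results
    results = runner.run_all(destination_dataset_name="analytics_dataset")
    for res in results:
        print(res.model_name, res.status, res.time, res.message)

The runner executes dbt inside the virtual environment through a generated script (see `_run_dbt_command` above), so dbt's dependencies stay isolated from the pipeline's own environment.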
diff --git a/dlt/helpers/dbt_cloud/client.py b/dlt/helpers/dbt_cloud/client.py index 8851aaa168..67d315f0d1 100644 --- a/dlt/helpers/dbt_cloud/client.py +++ b/dlt/helpers/dbt_cloud/client.py @@ -41,9 +41,7 @@ def __init__( self.accounts_url = f"accounts/{self.account_id}" def get_endpoint(self, endpoint: str) -> Any: - response = requests.get( - f"{self.base_api_url}/{endpoint}", headers=self._headers - ) + response = requests.get(f"{self.base_api_url}/{endpoint}", headers=self._headers) results = response.json() return results @@ -103,16 +101,15 @@ def trigger_job_run( """ if not (self.account_id and job_id): raise InvalidCredentialsException( - f"account_id and job_id are required, got account_id: {self.account_id} and job_id: {job_id}" + f"account_id and job_id are required, got account_id: {self.account_id} and job_id:" + f" {job_id}" ) json_body = {} if data: json_body.update(data) - response = self.post_endpoint( - f"{self.accounts_url}/jobs/{job_id}/run", json_body=json_body - ) + response = self.post_endpoint(f"{self.accounts_url}/jobs/{job_id}/run", json_body=json_body) return int(response["data"]["id"]) def get_run_status(self, run_id: Union[int, str]) -> Dict[Any, Any]: @@ -136,7 +133,8 @@ def get_run_status(self, run_id: Union[int, str]) -> Dict[Any, Any]: """ if not (self.account_id and run_id): raise InvalidCredentialsException( - f"account_id and run_id are required, got account_id: {self.account_id} and run_id: {run_id}." + f"account_id and run_id are required, got account_id: {self.account_id} and run_id:" + f" {run_id}." ) response = self.get_endpoint(f"{self.accounts_url}/runs/{run_id}") diff --git a/dlt/helpers/streamlit_helper.py b/dlt/helpers/streamlit_helper.py index e43e794bf6..7d44dac898 100644 --- a/dlt/helpers/streamlit_helper.py +++ b/dlt/helpers/streamlit_helper.py @@ -16,9 +16,14 @@ try: import streamlit as st + # from streamlit import SECRETS_FILE_LOC, secrets except ModuleNotFoundError: - raise MissingDependencyException("DLT Streamlit Helpers", ["streamlit"], "DLT Helpers for Streamlit should be run within a streamlit app.") + raise MissingDependencyException( + "DLT Streamlit Helpers", + ["streamlit"], + "DLT Helpers for Streamlit should be run within a streamlit app.", + ) # use right caching function to disable deprecation message @@ -129,11 +134,17 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: st.header("Last load info") col1, col2, col3 = st.columns(3) loads_df = _query_data_live( - f"SELECT load_id, inserted_at FROM {pipeline.default_schema.loads_table_name} WHERE status = 0 ORDER BY inserted_at DESC LIMIT 101 " + f"SELECT load_id, inserted_at FROM {pipeline.default_schema.loads_table_name} WHERE" + " status = 0 ORDER BY inserted_at DESC LIMIT 101 " ) loads_no = loads_df.shape[0] if loads_df.shape[0] > 0: - rel_time = humanize.naturaldelta(pendulum.now() - pendulum.from_timestamp(loads_df.iloc[0, 1].timestamp())) + " ago" + rel_time = ( + humanize.naturaldelta( + pendulum.now() - pendulum.from_timestamp(loads_df.iloc[0, 1].timestamp()) + ) + + " ago" + ) last_load_id = loads_df.iloc[0, 0] if loads_no > 100: loads_no = "> " + str(loads_no) @@ -154,7 +165,10 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: if "parent" in table: continue table_name = table["name"] - query_parts.append(f"SELECT '{table_name}' as table_name, COUNT(1) As rows_count FROM {table_name} WHERE _dlt_load_id = '{selected_load_id}'") + query_parts.append( + f"SELECT '{table_name}' as table_name, COUNT(1) As rows_count 
FROM" + f" {table_name} WHERE _dlt_load_id = '{selected_load_id}'" + ) query_parts.append("UNION ALL") query_parts.pop() rows_counts_df = _query_data("\n".join(query_parts)) @@ -167,8 +181,9 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: st.header("Schema updates") schemas_df = _query_data_live( - f"SELECT schema_name, inserted_at, version, version_hash FROM {pipeline.default_schema.version_table_name} ORDER BY inserted_at DESC LIMIT 101 " - ) + "SELECT schema_name, inserted_at, version, version_hash FROM" + f" {pipeline.default_schema.version_table_name} ORDER BY inserted_at DESC LIMIT 101 " + ) st.markdown("**100 recent schema updates**") st.dataframe(schemas_df) @@ -188,14 +203,19 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: col2.metric("Remote state version", remote_state_version) if remote_state_version != local_state["_state_version"]: - st.warning("Looks like that local state is not yet synchronized or synchronization is disabled") + st.warning( + "Looks like that local state is not yet synchronized or synchronization is disabled" + ) except CannotRestorePipelineException as restore_ex: st.error("Seems like the pipeline does not exist. Did you run it at least once?") st.exception(restore_ex) except ConfigFieldMissingException as cf_ex: - st.error("Pipeline credentials/configuration is missing. This most often happen when you run the streamlit app from different folder than the `.dlt` with `toml` files resides.") + st.error( + "Pipeline credentials/configuration is missing. This most often happen when you run the" + " streamlit app from different folder than the `.dlt` with `toml` files resides." + ) st.text(str(cf_ex)) except Exception as ex: @@ -203,8 +223,13 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: st.exception(ex) - -def write_data_explorer_page(pipeline: Pipeline, schema_name: str = None, show_dlt_tables: bool = False, example_query: str = "", show_charts: bool = True) -> None: +def write_data_explorer_page( + pipeline: Pipeline, + schema_name: str = None, + show_dlt_tables: bool = False, + example_query: str = "", + show_charts: bool = True, +) -> None: """Writes Streamlit app page with a schema and live data preview. 
#### Args: @@ -253,21 +278,29 @@ def _query_data(query: str, chunk_size: int = None) -> pd.DataFrame: if "write_disposition" in table: table_hints.append("write disposition: **%s**" % table["write_disposition"]) columns = table["columns"] - primary_keys: Iterator[str] = flatten_list_or_items([ - col_name for col_name in columns.keys() + primary_keys: Iterator[str] = flatten_list_or_items( + [ + col_name + for col_name in columns.keys() if not col_name.startswith("_") and not columns[col_name].get("primary_key") is None - ]) + ] + ) table_hints.append("primary key(s): **%s**" % ", ".join(primary_keys)) - merge_keys = flatten_list_or_items([ - col_name for col_name in columns.keys() + merge_keys = flatten_list_or_items( + [ + col_name + for col_name in columns.keys() if not col_name.startswith("_") and not columns[col_name].get("merge_key") is None - ]) + ] + ) table_hints.append("merge key(s): **%s**" % ", ".join(merge_keys)) st.markdown(" | ".join(table_hints)) # table schema contains various hints (like clustering or partition options) that we do not want to show in basic view - essentials_f = lambda c: {k:v for k, v in c.items() if k in ["name", "data_type", "nullable"]} + essentials_f = lambda c: { + k: v for k, v in c.items() if k in ["name", "data_type", "nullable"] + } st.table(map(essentials_f, table["columns"].values())) # add a button that when pressed will show the full content of a table @@ -302,7 +335,6 @@ def _query_data(query: str, chunk_size: int = None) -> pd.DataFrame: # try barchart st.bar_chart(df) if df.dtypes.shape[0] == 2 and show_charts: - # try to import altair charts try: import altair as alt @@ -310,13 +342,17 @@ def _query_data(query: str, chunk_size: int = None) -> pd.DataFrame: raise MissingDependencyException( "DLT Streamlit Helpers", ["altair"], - "DLT Helpers for Streamlit should be run within a streamlit app." + "DLT Helpers for Streamlit should be run within a streamlit" + " app.", ) # try altair - bar_chart = alt.Chart(df).mark_bar().encode( - x=f'{df.columns[1]}:Q', - y=alt.Y(f'{df.columns[0]}:N', sort='-x') + bar_chart = ( + alt.Chart(df) + .mark_bar() + .encode( + x=f"{df.columns[1]}:Q", y=alt.Y(f"{df.columns[0]}:N", sort="-x") + ) ) st.altair_chart(bar_chart, use_container_width=True) except Exception as ex: diff --git a/dlt/load/configuration.py b/dlt/load/configuration.py index f62c3a4dda..0a84e3c331 100644 --- a/dlt/load/configuration.py +++ b/dlt/load/configuration.py @@ -20,11 +20,11 @@ def on_resolved(self) -> None: self.pool_type = "none" if self.workers == 1 else "thread" if TYPE_CHECKING: + def __init__( self, pool_type: TPoolType = "thread", workers: int = None, raise_on_failed_jobs: bool = False, - _load_storage_config: LoadStorageConfiguration = None - ) -> None: - ... + _load_storage_config: LoadStorageConfiguration = None, + ) -> None: ... diff --git a/dlt/load/exceptions.py b/dlt/load/exceptions.py index 93d4ef76e1..8a704660ce 100644 --- a/dlt/load/exceptions.py +++ b/dlt/load/exceptions.py @@ -12,7 +12,10 @@ def __init__(self, load_id: str, job_id: str, failed_message: str) -> None: self.load_id = load_id self.job_id = job_id self.failed_message = failed_message - super().__init__(f"Job for {job_id} failed terminally in load {load_id} with message {failed_message}. The package is aborted and cannot be retried.") + super().__init__( + f"Job for {job_id} failed terminally in load {load_id} with message {failed_message}." + " The package is aborted and cannot be retried." 
+ ) class LoadClientJobRetry(DestinationTransientException): @@ -21,15 +24,23 @@ def __init__(self, load_id: str, job_id: str, retry_count: int, max_retry_count: self.job_id = job_id self.retry_count = retry_count self.max_retry_count = max_retry_count - super().__init__(f"Job for {job_id} had {retry_count} retries which a multiple of {max_retry_count}. Exiting retry loop. You can still rerun the load package to retry this job.") + super().__init__( + f"Job for {job_id} had {retry_count} retries which a multiple of {max_retry_count}." + " Exiting retry loop. You can still rerun the load package to retry this job." + ) class LoadClientUnsupportedFileFormats(DestinationTerminalException): - def __init__(self, file_format: str, supported_file_format: Sequence[str], file_path: str) -> None: + def __init__( + self, file_format: str, supported_file_format: Sequence[str], file_path: str + ) -> None: self.file_format = file_format self.supported_types = supported_file_format self.file_path = file_path - super().__init__(f"Loader does not support writer {file_format} in file {file_path}. Supported writers: {supported_file_format}") + super().__init__( + f"Loader does not support writer {file_format} in file {file_path}. Supported writers:" + f" {supported_file_format}" + ) class LoadClientUnsupportedWriteDisposition(DestinationTerminalException): @@ -37,4 +48,7 @@ def __init__(self, table_name: str, write_disposition: str, file_name: str) -> N self.table_name = table_name self.write_disposition = write_disposition self.file_name = file_name - super().__init__(f"Loader does not support {write_disposition} in table {table_name} when loading file {file_name}") + super().__init__( + f"Loader does not support {write_disposition} in table {table_name} when loading file" + f" {file_name}" + ) diff --git a/dlt/load/load.py b/dlt/load/load.py index 725f8589f5..485d5269c9 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -16,16 +16,37 @@ from dlt.common.runners import TRunMetrics, Runnable, workermethod, NullExecutor from dlt.common.runtime.collector import Collector, NULL_COLLECTOR from dlt.common.runtime.logger import pretty_format_exception -from dlt.common.exceptions import TerminalValueError, DestinationTerminalException, DestinationTransientException +from dlt.common.exceptions import ( + TerminalValueError, + DestinationTerminalException, + DestinationTransientException, +) from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TTableSchema, TWriteDisposition from dlt.common.storages import LoadStorage -from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, Destination, LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination, TDestination +from dlt.common.destination.reference import ( + DestinationClientDwhConfiguration, + FollowupJob, + JobClientBase, + WithStagingDataset, + Destination, + LoadJob, + NewLoadJob, + TLoadJobState, + DestinationClientConfiguration, + SupportsStagingDestination, + TDestination, +) from dlt.destinations.job_impl import EmptyLoadJob from dlt.load.configuration import LoaderConfiguration -from dlt.load.exceptions import LoadClientJobFailed, LoadClientJobRetry, LoadClientUnsupportedWriteDisposition, LoadClientUnsupportedFileFormats +from dlt.load.exceptions import ( + LoadClientJobFailed, + LoadClientJobRetry, + LoadClientUnsupportedWriteDisposition, + LoadClientUnsupportedFileFormats, +) class Load(Runnable[Executor]): @@ 
-40,7 +61,7 @@ def __init__( is_storage_owner: bool = False, config: LoaderConfiguration = config.value, initial_client_config: DestinationClientConfiguration = config.value, - initial_staging_client_config: DestinationClientConfiguration = config.value + initial_staging_client_config: DestinationClientConfiguration = config.value, ) -> None: self.config = config self.collector = collector @@ -57,14 +78,17 @@ def __init__( def create_storage(self, is_storage_owner: bool) -> LoadStorage: supported_file_formats = self.capabilities.supported_loader_file_formats if self.staging_destination: - supported_file_formats = self.staging_destination.capabilities().supported_loader_file_formats + ["reference"] + supported_file_formats = ( + self.staging_destination.capabilities().supported_loader_file_formats + + ["reference"] + ) if isinstance(self.get_destination_client(Schema("test")), WithStagingDataset): supported_file_formats += ["sql"] load_storage = LoadStorage( is_storage_owner, self.capabilities.preferred_loader_file_format, supported_file_formats, - config=self.config._load_storage_config + config=self.config._load_storage_config, ) return load_storage @@ -75,10 +99,16 @@ def get_staging_destination_client(self, schema: Schema) -> JobClientBase: return self.staging_destination.client(schema, self.initial_staging_client_config) def is_staging_destination_job(self, file_path: str) -> bool: - return self.staging_destination is not None and os.path.splitext(file_path)[1][1:] in self.staging_destination.capabilities().supported_loader_file_formats + return ( + self.staging_destination is not None + and os.path.splitext(file_path)[1][1:] + in self.staging_destination.capabilities().supported_loader_file_formats + ) @contextlib.contextmanager - def maybe_with_staging_dataset(self, job_client: JobClientBase, use_staging: bool) -> Iterator[None]: + def maybe_with_staging_dataset( + self, job_client: JobClientBase, use_staging: bool + ) -> Iterator[None]: """Executes job client methods in context of staging dataset if `table` has `write_disposition` that requires it""" if isinstance(job_client, WithStagingDataset) and use_staging: with job_client.with_staging_dataset(): @@ -88,29 +118,49 @@ def maybe_with_staging_dataset(self, job_client: JobClientBase, use_staging: boo @staticmethod @workermethod - def w_spool_job(self: "Load", file_path: str, load_id: str, schema: Schema) -> Optional[LoadJob]: + def w_spool_job( + self: "Load", file_path: str, load_id: str, schema: Schema + ) -> Optional[LoadJob]: job: LoadJob = None try: is_staging_destination_job = self.is_staging_destination_job(file_path) job_client = self.get_destination_client(schema) # if we have a staging destination and the file is not a reference, send to staging - with (self.get_staging_destination_client(schema) if is_staging_destination_job else job_client) as client: + with ( + self.get_staging_destination_client(schema) + if is_staging_destination_job + else job_client + ) as client: job_info = self.load_storage.parse_job_file_name(file_path) if job_info.file_format not in self.load_storage.supported_file_formats: - raise LoadClientUnsupportedFileFormats(job_info.file_format, self.capabilities.supported_loader_file_formats, file_path) + raise LoadClientUnsupportedFileFormats( + job_info.file_format, + self.capabilities.supported_loader_file_formats, + file_path, + ) logger.info(f"Will load file {file_path} with table name {job_info.table_name}") table = client.get_load_table(job_info.table_name) if table["write_disposition"] not in 
["append", "replace", "merge"]: - raise LoadClientUnsupportedWriteDisposition(job_info.table_name, table["write_disposition"], file_path) + raise LoadClientUnsupportedWriteDisposition( + job_info.table_name, table["write_disposition"], file_path + ) if is_staging_destination_job: - use_staging_dataset = isinstance(job_client, SupportsStagingDestination) and job_client.should_load_data_to_staging_dataset_on_staging_destination(table) + use_staging_dataset = isinstance( + job_client, SupportsStagingDestination + ) and job_client.should_load_data_to_staging_dataset_on_staging_destination( + table + ) else: - use_staging_dataset = isinstance(job_client, WithStagingDataset) and job_client.should_load_data_to_staging_dataset(table) + use_staging_dataset = isinstance( + job_client, WithStagingDataset + ) and job_client.should_load_data_to_staging_dataset(table) with self.maybe_with_staging_dataset(client, use_staging_dataset): - job = client.start_file_load(table, self.load_storage.storage.make_full_path(file_path), load_id) + job = client.start_file_load( + table, self.load_storage.storage.make_full_path(file_path), load_id + ) except (DestinationTerminalException, TerminalValueError): # if job irreversibly cannot be started, mark it as failed logger.exception(f"Terminal problem when adding job {file_path}") @@ -128,7 +178,7 @@ def spool_new_jobs(self, load_id: str, schema: Schema) -> Tuple[int, List[LoadJo # use thread based pool as jobs processing is mostly I/O and we do not want to pickle jobs # TODO: combine files by providing a list of files pertaining to same table into job, so job must be # extended to accept a list - load_files = self.load_storage.list_new_jobs(load_id)[:self.config.workers] + load_files = self.load_storage.list_new_jobs(load_id)[: self.config.workers] file_count = len(load_files) if file_count == 0: logger.info(f"No new jobs found in {load_id}") @@ -141,7 +191,9 @@ def spool_new_jobs(self, load_id: str, schema: Schema) -> Tuple[int, List[LoadJo # remove None jobs and check the rest return file_count, [job for job in jobs if job is not None] - def retrieve_jobs(self, client: JobClientBase, load_id: str, staging_client: JobClientBase = None) -> Tuple[int, List[LoadJob]]: + def retrieve_jobs( + self, client: JobClientBase, load_id: str, staging_client: JobClientBase = None + ) -> Tuple[int, List[LoadJob]]: jobs: List[LoadJob] = [] # list all files that were started but not yet completed @@ -168,12 +220,21 @@ def retrieve_jobs(self, client: JobClientBase, load_id: str, staging_client: Job return len(jobs), jobs def get_new_jobs_info(self, load_id: str) -> List[ParsedLoadJobFileName]: - return [LoadStorage.parse_job_file_name(job_file) for job_file in self.load_storage.list_new_jobs(load_id)] + return [ + LoadStorage.parse_job_file_name(job_file) + for job_file in self.load_storage.list_new_jobs(load_id) + ] - def get_completed_table_chain(self, load_id: str, schema: Schema, top_merged_table: TTableSchema, being_completed_job_id: str = None) -> List[TTableSchema]: + def get_completed_table_chain( + self, + load_id: str, + schema: Schema, + top_merged_table: TTableSchema, + being_completed_job_id: str = None, + ) -> List[TTableSchema]: """Gets a table chain starting from the `top_merged_table` containing only tables with completed/failed jobs. 
None is returned if there's any job that is not completed - Optionally `being_completed_job_id` can be passed that is considered to be completed before job itself moves in storage + Optionally `being_completed_job_id` can be passed that is considered to be completed before job itself moves in storage """ # returns ordered list of tables from parent to child leaf tables table_chain: List[TTableSchema] = [] @@ -181,17 +242,23 @@ def get_completed_table_chain(self, load_id: str, schema: Schema, top_merged_tab for table in get_child_tables(schema.tables, top_merged_table["name"]): table_jobs = self.load_storage.list_jobs_for_table(load_id, table["name"]) # all jobs must be completed in order for merge to be created - if any(job.state not in ("failed_jobs", "completed_jobs") and job.job_file_info.job_id() != being_completed_job_id for job in table_jobs): + if any( + job.state not in ("failed_jobs", "completed_jobs") + and job.job_file_info.job_id() != being_completed_job_id + for job in table_jobs + ): return None # if there are no jobs for the table, skip it, unless the write disposition is replace, as we need to create and clear the child tables if not table_jobs and top_merged_table["write_disposition"] != "replace": - continue + continue table_chain.append(table) # there must be at least table assert len(table_chain) > 0 return table_chain - def create_followup_jobs(self, load_id: str, state: TLoadJobState, starting_job: LoadJob, schema: Schema) -> List[NewLoadJob]: + def create_followup_jobs( + self, load_id: str, state: TLoadJobState, starting_job: LoadJob, schema: Schema + ) -> List[NewLoadJob]: jobs: List[NewLoadJob] = [] if isinstance(starting_job, FollowupJob): # check for merge jobs only for jobs executing on the destination, the staging destination jobs must be excluded @@ -199,10 +266,16 @@ def create_followup_jobs(self, load_id: str, state: TLoadJobState, starting_job: starting_job_file_name = starting_job.file_name() if state == "completed" and not self.is_staging_destination_job(starting_job_file_name): client = self.destination.client(schema, self.initial_client_config) - top_job_table = get_top_level_table(schema.tables, starting_job.job_file_info().table_name) + top_job_table = get_top_level_table( + schema.tables, starting_job.job_file_info().table_name + ) # if all tables of chain completed, create follow up jobs - if table_chain := self.get_completed_table_chain(load_id, schema, top_job_table, starting_job.job_file_info().job_id()): - if follow_up_jobs := client.create_table_chain_completed_followup_jobs(table_chain): + if table_chain := self.get_completed_table_chain( + load_id, schema, top_job_table, starting_job.job_file_info().job_id() + ): + if follow_up_jobs := client.create_table_chain_completed_followup_jobs( + table_chain + ): jobs = jobs + follow_up_jobs jobs = jobs + starting_job.create_followup_jobs(state) return jobs @@ -222,22 +295,34 @@ def complete_jobs(self, load_id: str, jobs: List[LoadJob], schema: Schema) -> Li # try to get exception message from job failed_message = job.exception() self.load_storage.fail_job(load_id, job.file_name(), failed_message) - logger.error(f"Job for {job.job_id()} failed terminally in load {load_id} with message {failed_message}") + logger.error( + f"Job for {job.job_id()} failed terminally in load {load_id} with message" + f" {failed_message}" + ) elif state == "retry": # try to get exception message from job retry_message = job.exception() # move back to new folder to try again self.load_storage.retry_job(load_id, 
job.file_name()) - logger.warning(f"Job for {job.job_id()} retried in load {load_id} with message {retry_message}") + logger.warning( + f"Job for {job.job_id()} retried in load {load_id} with message {retry_message}" + ) elif state == "completed": # create followup jobs followup_jobs = self.create_followup_jobs(load_id, state, job, schema) for followup_job in followup_jobs: # running should be moved into "new jobs", other statuses into started - folder: TJobState = "new_jobs" if followup_job.state() == "running" else "started_jobs" + folder: TJobState = ( + "new_jobs" if followup_job.state() == "running" else "started_jobs" + ) # save all created jobs - self.load_storage.add_new_job(load_id, followup_job.new_file_path(), job_state=folder) - logger.info(f"Job {job.job_id()} CREATED a new FOLLOWUP JOB {followup_job.new_file_path()} placed in {folder}") + self.load_storage.add_new_job( + load_id, followup_job.new_file_path(), job_state=folder + ) + logger.info( + f"Job {job.job_id()} CREATED a new FOLLOWUP JOB" + f" {followup_job.new_file_path()} placed in {folder}" + ) # if followup job is not "running" place it in current queue to be finalized if not followup_job.state() == "running": remaining_jobs.append(followup_job) @@ -249,7 +334,9 @@ def complete_jobs(self, load_id: str, jobs: List[LoadJob], schema: Schema) -> Li if state in ["failed", "completed"]: self.collector.update("Jobs") if state == "failed": - self.collector.update("Jobs", 1, message="WARNING: Some of the jobs failed!", label="Failed") + self.collector.update( + "Jobs", 1, message="WARNING: Some of the jobs failed!", label="Failed" + ) return remaining_jobs @@ -261,14 +348,20 @@ def complete_package(self, load_id: str, schema: Schema, aborted: bool = False) # TODO: Load must provide a clear interface to get last loads and metrics # TODO: get more info ie. was package aborted, schema name etc. 
if isinstance(job_client.config, DestinationClientDwhConfiguration): - self._processed_load_ids[load_id] = job_client.config.normalize_dataset_name(schema) + self._processed_load_ids[load_id] = job_client.config.normalize_dataset_name( + schema + ) else: self._processed_load_ids[load_id] = None self.load_storage.complete_load_package(load_id, aborted) - logger.info(f"All jobs completed, archiving package {load_id} with aborted set to {aborted}") + logger.info( + f"All jobs completed, archiving package {load_id} with aborted set to {aborted}" + ) @staticmethod - def _get_table_chain_tables_with_filter(schema: Schema, f: Callable[[TTableSchema], bool], tables_with_jobs: Iterable[str]) -> Set[str]: + def _get_table_chain_tables_with_filter( + schema: Schema, f: Callable[[TTableSchema], bool], tables_with_jobs: Iterable[str] + ) -> Set[str]: """Get all jobs for tables with given write disposition and resolve the table chain""" result: Set[str] = set() for table_name in tables_with_jobs: @@ -279,54 +372,106 @@ def _get_table_chain_tables_with_filter(schema: Schema, f: Callable[[TTableSchem # only add tables for tables that have jobs unless the disposition is replace # TODO: this is a (formerly used) hack to make test_merge_on_keys_in_schema, # we should change that test - if not table["name"] in tables_with_jobs and top_job_table["write_disposition"] != "replace": + if ( + not table["name"] in tables_with_jobs + and top_job_table["write_disposition"] != "replace" + ): continue result.add(table["name"]) return result @staticmethod - def _init_dataset_and_update_schema(job_client: JobClientBase, expected_update: TSchemaTables, update_tables: Iterable[str], truncate_tables: Iterable[str] = None, staging_info: bool = False) -> TSchemaTables: + def _init_dataset_and_update_schema( + job_client: JobClientBase, + expected_update: TSchemaTables, + update_tables: Iterable[str], + truncate_tables: Iterable[str] = None, + staging_info: bool = False, + ) -> TSchemaTables: staging_text = "for staging dataset" if staging_info else "" - logger.info(f"Client for {job_client.config.destination_name} will start initialize storage {staging_text}") + logger.info( + f"Client for {job_client.config.destination_name} will start initialize storage" + f" {staging_text}" + ) job_client.initialize_storage() - logger.info(f"Client for {job_client.config.destination_name} will update schema to package schema {staging_text}") - applied_update = job_client.update_stored_schema(only_tables=update_tables, expected_update=expected_update) - logger.info(f"Client for {job_client.config.destination_name} will truncate tables {staging_text}") + logger.info( + f"Client for {job_client.config.destination_name} will update schema to package schema" + f" {staging_text}" + ) + applied_update = job_client.update_stored_schema( + only_tables=update_tables, expected_update=expected_update + ) + logger.info( + f"Client for {job_client.config.destination_name} will truncate tables {staging_text}" + ) job_client.initialize_storage(truncate_tables=truncate_tables) return applied_update - - def _init_client(self, job_client: JobClientBase, schema: Schema, expected_update: TSchemaTables, load_id: str, truncate_filter: Callable[[TTableSchema], bool], truncate_staging_filter: Callable[[TTableSchema], bool]) -> TSchemaTables: - + def _init_client( + self, + job_client: JobClientBase, + schema: Schema, + expected_update: TSchemaTables, + load_id: str, + truncate_filter: Callable[[TTableSchema], bool], + truncate_staging_filter: 
Callable[[TTableSchema], bool], + ) -> TSchemaTables: tables_with_jobs = set(job.table_name for job in self.get_new_jobs_info(load_id)) dlt_tables = set(t["name"] for t in schema.dlt_tables()) # update the default dataset - truncate_tables = self._get_table_chain_tables_with_filter(schema, truncate_filter, tables_with_jobs) - applied_update = self._init_dataset_and_update_schema(job_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables) + truncate_tables = self._get_table_chain_tables_with_filter( + schema, truncate_filter, tables_with_jobs + ) + applied_update = self._init_dataset_and_update_schema( + job_client, expected_update, tables_with_jobs | dlt_tables, truncate_tables + ) # update the staging dataset if client supports this if isinstance(job_client, WithStagingDataset): - if staging_tables := self._get_table_chain_tables_with_filter(schema, truncate_staging_filter, tables_with_jobs): + if staging_tables := self._get_table_chain_tables_with_filter( + schema, truncate_staging_filter, tables_with_jobs + ): with job_client.with_staging_dataset(): - self._init_dataset_and_update_schema(job_client, expected_update, staging_tables | {schema.version_table_name}, staging_tables, staging_info=True) + self._init_dataset_and_update_schema( + job_client, + expected_update, + staging_tables | {schema.version_table_name}, + staging_tables, + staging_info=True, + ) return applied_update - def load_single_package(self, load_id: str, schema: Schema) -> None: # initialize analytical storage ie. create dataset required by passed schema with self.get_destination_client(schema) as job_client: - if (expected_update := self.load_storage.begin_schema_update(load_id)) is not None: - # init job client - applied_update = self._init_client(job_client, schema, expected_update, load_id, job_client.should_truncate_table_before_load, job_client.should_load_data_to_staging_dataset if isinstance(job_client, WithStagingDataset) else None) + applied_update = self._init_client( + job_client, + schema, + expected_update, + load_id, + job_client.should_truncate_table_before_load, + ( + job_client.should_load_data_to_staging_dataset + if isinstance(job_client, WithStagingDataset) + else None + ), + ) # init staging client if self.staging_destination and isinstance(job_client, SupportsStagingDestination): with self.get_staging_destination_client(schema) as staging_client: - self._init_client(staging_client, schema, expected_update, load_id, job_client.should_truncate_table_before_load_on_staging_destination, job_client.should_load_data_to_staging_dataset_on_staging_destination) + self._init_client( + staging_client, + schema, + expected_update, + load_id, + job_client.should_truncate_table_before_load_on_staging_destination, + job_client.should_load_data_to_staging_dataset_on_staging_destination, + ) self.load_storage.commit_schema_update(load_id, applied_update) @@ -351,7 +496,9 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: no_completed_jobs = len(package_info.jobs["completed_jobs"]) + no_failed_jobs self.collector.update("Jobs", no_completed_jobs, total_jobs) if no_failed_jobs > 0: - self.collector.update("Jobs", no_failed_jobs, message="WARNING: Some of the jobs failed!", label="Failed") + self.collector.update( + "Jobs", no_failed_jobs, message="WARNING: Some of the jobs failed!", label="Failed" + ) # loop until all jobs are processed while True: try: @@ -363,13 +510,22 @@ def load_single_package(self, load_id: str, schema: Schema) -> None: if 
self.config.raise_on_failed_jobs: if package_info.jobs["failed_jobs"]: failed_job = package_info.jobs["failed_jobs"][0] - raise LoadClientJobFailed(load_id, failed_job.job_file_info.job_id(), failed_job.failed_message) + raise LoadClientJobFailed( + load_id, + failed_job.job_file_info.job_id(), + failed_job.failed_message, + ) # possibly raise on too many retires if self.config.raise_on_max_retries: for new_job in package_info.jobs["new_jobs"]: r_c = new_job.job_file_info.retry_count if r_c > 0 and r_c % self.config.raise_on_max_retries == 0: - raise LoadClientJobRetry(load_id, new_job.job_file_info.job_id(), r_c, self.config.raise_on_max_retries) + raise LoadClientJobRetry( + load_id, + new_job.job_file_info.job_id(), + r_c, + self.config.raise_on_max_retries, + ) break # process remaining jobs again jobs = remaining_jobs @@ -405,7 +561,9 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) - def get_load_info(self, pipeline: SupportsPipeline, started_at: datetime.datetime = None) -> LoadInfo: + def get_load_info( + self, pipeline: SupportsPipeline, started_at: datetime.datetime = None + ) -> LoadInfo: # TODO: LoadInfo should hold many datasets load_ids = list(self._processed_load_ids.keys()) load_packages: List[LoadPackageInfo] = [] @@ -418,12 +576,16 @@ def get_load_info(self, pipeline: SupportsPipeline, started_at: datetime.datetim pipeline, self.initial_client_config.destination_name, str(self.initial_client_config), - self.initial_staging_client_config.destination_name if self.initial_staging_client_config else None, + ( + self.initial_staging_client_config.destination_name + if self.initial_staging_client_config + else None + ), str(self.initial_staging_client_config) if self.initial_staging_client_config else None, self.initial_client_config.fingerprint(), _dataset_name, list(load_ids), load_packages, started_at, - pipeline.first_run + pipeline.first_run, ) diff --git a/dlt/normalize/__init__.py b/dlt/normalize/__init__.py index 25d6a4afd3..b2fba68797 100644 --- a/dlt/normalize/__init__.py +++ b/dlt/normalize/__init__.py @@ -1,3 +1,3 @@ from .normalize import Normalize -__all__ = ['Normalize'] +__all__ = ["Normalize"] diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index 13b408945c..3949a07fa8 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -4,7 +4,11 @@ from dlt.common.configuration.specs import BaseConfiguration from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType -from dlt.common.storages import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration +from dlt.common.storages import ( + LoadStorageConfiguration, + NormalizeStorageConfiguration, + SchemaStorageConfiguration, +) @configspec @@ -15,8 +19,8 @@ class ItemsNormalizerConfiguration(BaseConfiguration): """When true, items to be normalized will have `_dlt_load_id` column added with the current load ID.""" if TYPE_CHECKING: - def __init__(self, add_dlt_id: bool = None, add_dlt_load_id: bool = None) -> None: - ... + + def __init__(self, add_dlt_id: bool = None, add_dlt_load_id: bool = None) -> None: ... 
@configspec @@ -27,20 +31,24 @@ class NormalizeConfiguration(PoolRunnerConfiguration): _normalize_storage_config: NormalizeStorageConfiguration _load_storage_config: LoadStorageConfiguration - json_normalizer: ItemsNormalizerConfiguration = ItemsNormalizerConfiguration(add_dlt_id=True, add_dlt_load_id=True) + json_normalizer: ItemsNormalizerConfiguration = ItemsNormalizerConfiguration( + add_dlt_id=True, add_dlt_load_id=True + ) - parquet_normalizer: ItemsNormalizerConfiguration = ItemsNormalizerConfiguration(add_dlt_id=False, add_dlt_load_id=False) + parquet_normalizer: ItemsNormalizerConfiguration = ItemsNormalizerConfiguration( + add_dlt_id=False, add_dlt_load_id=False + ) def on_resolved(self) -> None: self.pool_type = "none" if self.workers == 1 else "process" if TYPE_CHECKING: + def __init__( self, pool_type: TPoolType = "process", workers: int = None, _schema_storage_config: SchemaStorageConfiguration = None, _normalize_storage_config: NormalizeStorageConfiguration = None, - _load_storage_config: LoadStorageConfiguration = None - ) -> None: - ... + _load_storage_config: LoadStorageConfiguration = None, + ) -> None: ... diff --git a/dlt/normalize/exceptions.py b/dlt/normalize/exceptions.py index 79da16b925..11463e4dd7 100644 --- a/dlt/normalize/exceptions.py +++ b/dlt/normalize/exceptions.py @@ -1,5 +1,6 @@ from dlt.common.exceptions import DltException + class NormalizeException(DltException): def __init__(self, msg: str) -> None: super().__init__(msg) diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index 6146d864b6..0a7be1535c 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -30,7 +30,7 @@ def __init__( normalize_storage: NormalizeStorage, schema: Schema, load_id: str, - config: NormalizeConfiguration + config: NormalizeConfiguration, ) -> None: self.load_storage = load_storage self.normalize_storage = normalize_storage @@ -39,8 +39,9 @@ def __init__( self.config = config @abstractmethod - def __call__(self, extracted_items_file: str, root_table_name: str) -> Tuple[List[TSchemaUpdate], int, TRowCount]: - ... + def __call__( + self, extracted_items_file: str, root_table_name: str + ) -> Tuple[List[TSchemaUpdate], int, TRowCount]: ... 
class JsonLItemsNormalizer(ItemsNormalizer): @@ -50,7 +51,7 @@ def __init__( normalize_storage: NormalizeStorage, schema: Schema, load_id: str, - config: NormalizeConfiguration + config: NormalizeConfiguration, ) -> None: super().__init__(load_storage, normalize_storage, schema, load_id, config) self._table_contracts: Dict[str, TSchemaContractDict] = {} @@ -59,7 +60,9 @@ def __init__( # quick access to column schema for writers below self._column_schemas: Dict[str, TTableSchemaColumns] = {} - def _filter_columns(self, filtered_columns: Dict[str, TSchemaEvolutionMode], row: DictStrAny) -> DictStrAny: + def _filter_columns( + self, filtered_columns: Dict[str, TSchemaEvolutionMode], row: DictStrAny + ) -> DictStrAny: for name, mode in filtered_columns.items(): if name in row: if mode == "discard_row": @@ -68,7 +71,9 @@ def _filter_columns(self, filtered_columns: Dict[str, TSchemaEvolutionMode], row row.pop(name) return row - def _normalize_chunk(self, root_table_name: str, items: List[TDataItem], may_have_pua: bool) -> Tuple[TSchemaUpdate, int, TRowCount]: + def _normalize_chunk( + self, root_table_name: str, items: List[TDataItem], may_have_pua: bool + ) -> Tuple[TSchemaUpdate, int, TRowCount]: column_schemas = self._column_schemas schema_update: TSchemaUpdate = {} schema = self.schema @@ -115,23 +120,27 @@ def _normalize_chunk(self, root_table_name: str, items: List[TDataItem], may_hav row[k] = custom_pua_decode(v) # type: ignore # coerce row of values into schema table, generating partial table with new columns if any - row, partial_table = schema.coerce_row( - table_name, parent_table, row - ) + row, partial_table = schema.coerce_row(table_name, parent_table, row) # if we detect a migration, check schema contract if partial_table: schema_contract = self._table_contracts.setdefault( table_name, - schema.resolve_contract_settings_for_table(parent_table or table_name) # parent_table, if present, exists in the schema + schema.resolve_contract_settings_for_table( + parent_table or table_name + ), # parent_table, if present, exists in the schema + ) + partial_table, filters = schema.apply_schema_contract( + schema_contract, partial_table, data_item=row ) - partial_table, filters = schema.apply_schema_contract(schema_contract, partial_table, data_item=row) if filters: for entity, name, mode in filters: if entity == "tables": self._filtered_tables.add(name) elif entity == "columns": - filtered_columns = self._filtered_tables_columns.setdefault(table_name, {}) + filtered_columns = self._filtered_tables_columns.setdefault( + table_name, {} + ) filtered_columns[name] = mode if partial_table is None: @@ -145,9 +154,7 @@ def _normalize_chunk(self, root_table_name: str, items: List[TDataItem], may_hav table_updates.append(partial_table) # update our columns - column_schemas[table_name] = schema.get_table_columns( - table_name - ) + column_schemas[table_name] = schema.get_table_columns(table_name) # apply new filters if filtered_columns and filters: @@ -190,11 +197,14 @@ def __call__( line: bytes for line_no, line in enumerate(f): items: List[TDataItem] = json.loadb(line) - partial_update, items_count, r_counts = self._normalize_chunk(root_table_name, items, may_have_pua(line)) + partial_update, items_count, r_counts = self._normalize_chunk( + root_table_name, items, may_have_pua(line) + ) schema_updates.append(partial_update) merge_row_count(row_counts, r_counts) logger.debug( - f"Processed {line_no} items from file {extracted_items_file}, items {items_count}" + f"Processed {line_no} items from file 
{extracted_items_file}, items" + f" {items_count}" ) return schema_updates, items_count, row_counts @@ -212,39 +222,71 @@ def _write_with_dlt_columns( schema_update: TSchemaUpdate = {} if add_load_id: - table_update = schema.update_table({"name": root_table_name, "columns": {"_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text", "nullable": False}}}) + table_update = schema.update_table( + { + "name": root_table_name, + "columns": { + "_dlt_load_id": { + "name": "_dlt_load_id", + "data_type": "text", + "nullable": False, + } + }, + } + ) table_updates = schema_update.setdefault(root_table_name, []) table_updates.append(table_update) load_id_type = pa.dictionary(pa.int8(), pa.string()) - new_columns.append(( - -1, - pa.field("_dlt_load_id", load_id_type, nullable=False), - lambda batch: pa.array([load_id] * batch.num_rows, type=load_id_type) - )) + new_columns.append( + ( + -1, + pa.field("_dlt_load_id", load_id_type, nullable=False), + lambda batch: pa.array([load_id] * batch.num_rows, type=load_id_type), + ) + ) if add_dlt_id: - table_update = schema.update_table({"name": root_table_name, "columns": {"_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}}}) + table_update = schema.update_table( + { + "name": root_table_name, + "columns": { + "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False} + }, + } + ) table_updates = schema_update.setdefault(root_table_name, []) table_updates.append(table_update) - new_columns.append(( - -1, - pa.field("_dlt_id", pyarrow.pyarrow.string(), nullable=False), - lambda batch: pa.array(generate_dlt_ids(batch.num_rows)) - )) + new_columns.append( + ( + -1, + pa.field("_dlt_id", pyarrow.pyarrow.string(), nullable=False), + lambda batch: pa.array(generate_dlt_ids(batch.num_rows)), + ) + ) items_count = 0 as_py = self.load_storage.loader_file_format != "arrow" with self.normalize_storage.storage.open_file(extracted_items_file, "rb") as f: - for batch in pyarrow.pq_stream_with_new_columns(f, new_columns, row_groups_per_read=self.REWRITE_ROW_GROUPS): + for batch in pyarrow.pq_stream_with_new_columns( + f, new_columns, row_groups_per_read=self.REWRITE_ROW_GROUPS + ): items_count += batch.num_rows if as_py: # Write python rows to jsonl, insert-values, etc... 
storage self.load_storage.write_data_item( - load_id, schema.name, root_table_name, batch.to_pylist(), schema.get_table_columns(root_table_name) + load_id, + schema.name, + root_table_name, + batch.to_pylist(), + schema.get_table_columns(root_table_name), ) else: self.load_storage.write_data_item( - load_id, schema.name, root_table_name, batch, schema.get_table_columns(root_table_name) + load_id, + schema.name, + root_table_name, + batch, + schema.get_table_columns(root_table_name), ) return [schema_update], items_count @@ -262,10 +304,9 @@ def _fix_schema_precisions(self, root_table_name: str) -> List[TSchemaUpdate]: if not new_cols: return [] - return [{root_table_name: [schema.update_table({ - "name": root_table_name, - "columns": new_cols - })]}] + return [ + {root_table_name: [schema.update_table({"name": root_table_name, "columns": new_cols})]} + ] def __call__( self, extracted_items_file: str, root_table_name: str @@ -277,21 +318,23 @@ def __call__( if add_dlt_id or add_dlt_load_id or self.load_storage.loader_file_format != "arrow": schema_update, items_count = self._write_with_dlt_columns( - extracted_items_file, - root_table_name, - add_dlt_load_id, - add_dlt_id + extracted_items_file, root_table_name, add_dlt_load_id, add_dlt_id ) return base_schema_update + schema_update, items_count, {root_table_name: items_count} from dlt.common.libs.pyarrow import get_row_count + with self.normalize_storage.storage.open_file(extracted_items_file, "rb") as f: items_count = get_row_count(f) - target_folder = self.load_storage.storage.make_full_path(os.path.join(self.load_id, LoadStorage.NEW_JOBS_FOLDER)) + target_folder = self.load_storage.storage.make_full_path( + os.path.join(self.load_id, LoadStorage.NEW_JOBS_FOLDER) + ) parts = NormalizeStorage.parse_normalize_file_name(extracted_items_file) - new_file_name = self.load_storage.build_job_file_name(parts.table_name, parts.file_id, with_extension=True) + new_file_name = self.load_storage.build_job_file_name( + parts.table_name, parts.file_id, with_extension=True + ) FileStorage.link_hard_with_fallback( self.normalize_storage.storage.make_full_path(extracted_items_file), - os.path.join(target_folder, new_file_name) + os.path.join(target_folder, new_file_name), ) return base_schema_update, items_count, {root_table_name: items_count} diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index ab87a5a2a1..fd7c4ed894 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -13,27 +13,45 @@ from dlt.common.schema.typing import TStoredSchema from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages.exceptions import SchemaNotFoundError -from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration +from dlt.common.storages import ( + NormalizeStorage, + SchemaStorage, + LoadStorage, + LoadStorageConfiguration, + NormalizeStorageConfiguration, +) from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import NormalizeInfo from dlt.common.utils import chunks, TRowCount, merge_row_count, increase_row_count from dlt.normalize.configuration import NormalizeConfiguration -from dlt.normalize.items_normalizers import ParquetItemsNormalizer, JsonLItemsNormalizer, ItemsNormalizer +from dlt.normalize.items_normalizers import ( + ParquetItemsNormalizer, + JsonLItemsNormalizer, + ItemsNormalizer, +) # normalize worker wrapping function 
(map_parallel, map_single) return type TMapFuncRV = Tuple[Sequence[TSchemaUpdate], TRowCount] # normalize worker wrapping function signature -TMapFuncType = Callable[[Schema, str, Sequence[str]], TMapFuncRV] # input parameters: (schema name, load_id, list of files to process) +TMapFuncType = Callable[ + [Schema, str, Sequence[str]], TMapFuncRV +] # input parameters: (schema name, load_id, list of files to process) # tuple returned by the worker TWorkerRV = Tuple[List[TSchemaUpdate], int, List[str], TRowCount] class Normalize(Runnable[Executor]): pool: Executor + @with_config(spec=NormalizeConfiguration, sections=(known_sections.NORMALIZE,)) - def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: SchemaStorage = None, config: NormalizeConfiguration = config.value) -> None: + def __init__( + self, + collector: Collector = NULL_COLLECTOR, + schema_storage: SchemaStorage = None, + config: NormalizeConfiguration = config.value, + ) -> None: self.config = config self.collector = collector self.normalize_storage: NormalizeStorage = None @@ -45,17 +63,21 @@ def __init__(self, collector: Collector = NULL_COLLECTOR, schema_storage: Schema # setup storages self.create_storages() # create schema storage with give type - self.schema_storage = schema_storage or SchemaStorage(self.config._schema_storage_config, makedirs=True) + self.schema_storage = schema_storage or SchemaStorage( + self.config._schema_storage_config, makedirs=True + ) def create_storages(self) -> None: # pass initial normalize storage config embedded in normalize config - self.normalize_storage = NormalizeStorage(True, config=self.config._normalize_storage_config) + self.normalize_storage = NormalizeStorage( + True, config=self.config._normalize_storage_config + ) # normalize saves in preferred format but can read all supported formats self.load_storage = LoadStorage( True, self.config.destination_capabilities.preferred_loader_file_format, LoadStorage.ALL_SUPPORTED_FILE_FORMATS, - config=self.config._load_storage_config + config=self.config._load_storage_config, ) @staticmethod @@ -63,7 +85,9 @@ def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Sc try: schema = schema_storage.load_schema(schema_name) schema.update_normalizers() - logger.info(f"Loaded schema with name {schema_name} with version {schema.stored_version}") + logger.info( + f"Loaded schema with name {schema_name} with version {schema.stored_version}" + ) except SchemaNotFoundError: schema = Schema(schema_name) logger.info(f"Created new schema with name {schema_name}") @@ -89,27 +113,41 @@ def _get_load_storage(file_format: TLoaderFileFormat) -> LoadStorage: supported_formats = destination_caps.supported_loader_file_formats or [] if file_format == "parquet": if file_format in supported_formats: - supported_formats.append("arrow") # TODO: Hack to make load storage use the correct writer + supported_formats.append( + "arrow" + ) # TODO: Hack to make load storage use the correct writer file_format = "arrow" else: # Use default storage if parquet is not supported to make normalizer fallback to read rows from the file - file_format = destination_caps.preferred_loader_file_format or destination_caps.preferred_staging_file_format + file_format = ( + destination_caps.preferred_loader_file_format + or destination_caps.preferred_staging_file_format + ) else: - file_format = destination_caps.preferred_loader_file_format or destination_caps.preferred_staging_file_format + file_format = ( + destination_caps.preferred_loader_file_format + or 
destination_caps.preferred_staging_file_format + ) if storage := load_storages.get(file_format): return storage - storage = load_storages[file_format] = LoadStorage(False, file_format, supported_formats, loader_storage_config) + storage = load_storages[file_format] = LoadStorage( + False, file_format, supported_formats, loader_storage_config + ) return storage # process all files with data items and write to buffered item storage with Container().injectable_context(destination_caps): schema = Schema.from_stored_schema(stored_schema) - load_storage = _get_load_storage(destination_caps.preferred_loader_file_format) # Default load storage, used for empty tables when no data + load_storage = _get_load_storage( + destination_caps.preferred_loader_file_format + ) # Default load storage, used for empty tables when no data normalize_storage = NormalizeStorage(False, normalize_storage_config) item_normalizers: Dict[TLoaderFileFormat, ItemsNormalizer] = {} - def _get_items_normalizer(file_format: TLoaderFileFormat) -> Tuple[ItemsNormalizer, LoadStorage]: + def _get_items_normalizer( + file_format: TLoaderFileFormat, + ) -> Tuple[ItemsNormalizer, LoadStorage]: load_storage = _get_load_storage(file_format) if file_format in item_normalizers: return item_normalizers[file_format], load_storage @@ -124,22 +162,34 @@ def _get_items_normalizer(file_format: TLoaderFileFormat) -> Tuple[ItemsNormaliz populated_root_tables: Set[str] = set() for extracted_items_file in extracted_items_files: line_no: int = 0 - parsed_file_name = NormalizeStorage.parse_normalize_file_name(extracted_items_file) + parsed_file_name = NormalizeStorage.parse_normalize_file_name( + extracted_items_file + ) # normalize table name in case the normalization changed # NOTE: this is the best we can do, until a full lineage information is in the schema - root_table_name = schema.naming.normalize_table_identifier(parsed_file_name.table_name) + root_table_name = schema.naming.normalize_table_identifier( + parsed_file_name.table_name + ) root_tables.add(root_table_name) - logger.debug(f"Processing extracted items in {extracted_items_file} in load_id {load_id} with table name {root_table_name} and schema {schema.name}") + logger.debug( + f"Processing extracted items in {extracted_items_file} in load_id" + f" {load_id} with table name {root_table_name} and schema {schema.name}" + ) file_format = parsed_file_name.file_format normalizer, load_storage = _get_items_normalizer(file_format) - partial_updates, items_count, r_counts = normalizer(extracted_items_file, root_table_name) + partial_updates, items_count, r_counts = normalizer( + extracted_items_file, root_table_name + ) schema_updates.extend(partial_updates) total_items += items_count merge_row_count(row_counts, r_counts) if items_count > 0: populated_root_tables.add(root_table_name) - logger.debug(f"Processed total {line_no + 1} lines from file {extracted_items_file}, total items {total_items}") + logger.debug( + f"Processed total {line_no + 1} lines from file {extracted_items_file}," + f" total items {total_items}" + ) # make sure base tables are all covered increase_row_count(row_counts, root_table_name, 0) # write empty jobs for tables without items if table exists in schema @@ -150,7 +200,9 @@ def _get_items_normalizer(file_format: TLoaderFileFormat) -> Tuple[ItemsNormaliz columns = schema.get_table_columns(table_name) load_storage.write_empty_file(load_id, schema.name, table_name, columns) except Exception: - logger.exception(f"Exception when processing file {extracted_items_file}, line 
{line_no}") + logger.exception( + f"Exception when processing file {extracted_items_file}, line {line_no}" + ) raise finally: load_storage.close_writers(load_id) @@ -162,7 +214,9 @@ def _get_items_normalizer(file_format: TLoaderFileFormat) -> Tuple[ItemsNormaliz def update_table(self, schema: Schema, schema_updates: List[TSchemaUpdate]) -> None: for schema_update in schema_updates: for table_name, table_updates in schema_update.items(): - logger.info(f"Updating schema for table {table_name} with {len(table_updates)} deltas") + logger.info( + f"Updating schema for table {table_name} with {len(table_updates)} deltas" + ) for partial_table in table_updates: # merge columns schema.update_table(partial_table) @@ -180,17 +234,25 @@ def group_worker_files(files: Sequence[str], no_groups: int) -> List[Sequence[st while remainder_l > 0: for idx, file in enumerate(reversed(chunk_files.pop())): chunk_files[-l_idx - idx - remainder_l].append(file) # type: ignore - remainder_l -=1 + remainder_l -= 1 l_idx = idx + 1 return chunk_files def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMapFuncRV: - workers: int = getattr(self.pool, '_max_workers', 1) + workers: int = getattr(self.pool, "_max_workers", 1) chunk_files = self.group_worker_files(files, workers) schema_dict: TStoredSchema = schema.to_dict() - param_chunk = [( - self.config, self.normalize_storage.config, self.load_storage.config, schema_dict, load_id, files - ) for files in chunk_files] + param_chunk = [ + ( + self.config, + self.normalize_storage.config, + self.load_storage.config, + schema_dict, + load_id, + files, + ) + for files in chunk_files + ] row_counts: TRowCount = {} # return stats @@ -198,7 +260,8 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM # push all tasks to queue tasks = [ - (self.pool.submit(Normalize.w_normalize_files, *params), params) for params in param_chunk + (self.pool.submit(Normalize.w_normalize_files, *params), params) + for params in param_chunk ] while len(tasks) > 0: @@ -207,7 +270,9 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM for task in list(tasks): pending, params = task if pending.done(): - result: TWorkerRV = pending.result() # Exception in task (if any) is raised here + result: TWorkerRV = ( + pending.result() + ) # Exception in task (if any) is raised here try: # gather schema from all manifests, validate consistency and combine self.update_table(schema, result[0]) @@ -219,7 +284,9 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM merge_row_count(row_counts, result[3]) except CannotCoerceColumnException as exc: # schema conflicts resulting from parallel executing - logger.warning(f"Parallel schema update conflict, retrying task ({str(exc)}") + logger.warning( + f"Parallel schema update conflict, retrying task ({str(exc)}" + ) # delete all files produced by the task for file in result[2]: os.remove(file) @@ -227,7 +294,9 @@ def map_parallel(self, schema: Schema, load_id: str, files: Sequence[str]) -> TM schema_dict = schema.to_dict() # TODO: it's time for a named tuple params = params[:3] + (schema_dict,) + params[4:] - retry_pending: Future[TWorkerRV] = self.pool.submit(Normalize.w_normalize_files, *params) + retry_pending: Future[TWorkerRV] = self.pool.submit( + Normalize.w_normalize_files, *params + ) tasks.append((retry_pending, params)) # remove finished tasks tasks.remove(task) @@ -241,21 +310,25 @@ def map_single(self, schema: Schema, load_id: str, files: 
Sequence[str]) -> TMap self.load_storage.config, schema.to_dict(), load_id, - files + files, ) self.update_table(schema, result[0]) self.collector.update("Files", len(result[2])) self.collector.update("Items", result[1]) return result[0], result[3] - def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files: Sequence[str]) -> None: + def spool_files( + self, schema_name: str, load_id: str, map_f: TMapFuncType, files: Sequence[str] + ) -> None: schema = Normalize.load_or_create_schema(self.schema_storage, schema_name) # process files in parallel or in single thread, depending on map_f schema_updates, row_counts = map_f(schema, load_id, files) # remove normalizer specific info for table in schema.tables.values(): table.pop("x-normalizer", None) # type: ignore[typeddict-item] - logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") + logger.info( + f"Saving schema {schema_name} with version {schema.version}, writing manifest files" + ) # schema is updated, save it to schema volume self.schema_storage.save_schema(schema) # save schema to temp load folder @@ -283,7 +356,9 @@ def spool_schema_files(self, load_id: str, schema_name: str, files: Sequence[str self.spool_files(schema_name, load_id, self.map_parallel, files) except CannotCoerceColumnException as exc: # schema conflicts resulting from parallel executing - logger.warning(f"Parallel schema update conflict, switching to single thread ({str(exc)}") + logger.warning( + f"Parallel schema update conflict, switching to single thread ({str(exc)}" + ) # start from scratch self.load_storage.create_temp_load_package(load_id) self.spool_files(schema_name, load_id, self.map_single, files) @@ -304,7 +379,9 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: for schema_name, files_iter in self.normalize_storage.group_by_schema(files): schema_files = list(files_iter) load_id = str(pendulum.now().timestamp()) - logger.info(f"Found {len(schema_files)} files in schema {schema_name} load_id {load_id}") + logger.info( + f"Found {len(schema_files)} files in schema {schema_name} load_id {load_id}" + ) with self.collector(f"Normalize {schema_name} in {load_id}"): self.collector.update("Files", 0, len(schema_files)) self.collector.update("Items", 0) diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 0f173307a0..ddb7d6d489 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -97,7 +97,7 @@ def pipeline( full_refresh: bool = False, credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, - **kwargs: Any + **kwargs: Any, ) -> Pipeline: ensure_correct_pipeline_kwargs(pipeline, **kwargs) # call without arguments returns current pipeline @@ -120,7 +120,11 @@ def pipeline( pipelines_dir = get_dlt_pipelines_dir() destination = Destination.from_reference(destination or kwargs["destination_name"]) - staging = Destination.from_reference(staging or kwargs.get("staging_name", None)) if staging is not None else None + staging = ( + Destination.from_reference(staging or kwargs.get("staging_name", None)) + if staging is not None + else None + ) progress = collector_from_name(progress) # create new pipeline instance @@ -138,7 +142,8 @@ def pipeline( progress, False, last_config(**kwargs), - kwargs["runtime"]) + kwargs["runtime"], + ) # set it as current pipeline p.activate() return p @@ -152,7 +157,7 @@ def attach( full_refresh: bool = False, credentials: Any = None, progress: TCollectorArg = _NULL_COLLECTOR, - **kwargs: Any + **kwargs: Any, ) -> Pipeline: 
"""Attaches to the working folder of `pipeline_name` in `pipelines_dir` or in default directory. Requires that valid pipeline state exists in working folder.""" ensure_correct_pipeline_kwargs(attach, **kwargs) @@ -161,7 +166,22 @@ def attach( pipelines_dir = get_dlt_pipelines_dir() progress = collector_from_name(progress) # create new pipeline instance - p = Pipeline(pipeline_name, pipelines_dir, pipeline_salt, None, None, None, credentials, None, None, full_refresh, progress, True, last_config(**kwargs), kwargs["runtime"]) + p = Pipeline( + pipeline_name, + pipelines_dir, + pipeline_salt, + None, + None, + None, + credentials, + None, + None, + full_refresh, + progress, + True, + last_config(**kwargs), + kwargs["runtime"], + ) # set it as current pipeline p.activate() return p @@ -241,11 +261,13 @@ def run( columns=columns, schema=schema, loader_file_format=loader_file_format, - schema_contract=schema_contract + schema_contract=schema_contract, ) + # plug default tracking module from dlt.pipeline import trace, track + trace.TRACKING_MODULE = track # setup default pipeline in the container diff --git a/dlt/pipeline/dbt.py b/dlt/pipeline/dbt.py index 70bd425f12..e235fe5270 100644 --- a/dlt/pipeline/dbt.py +++ b/dlt/pipeline/dbt.py @@ -6,11 +6,19 @@ from dlt.common.typing import TSecretValue from dlt.common.schema.utils import normalize_schema_name -from dlt.helpers.dbt import create_venv as _create_venv, package_runner as _package_runner, DBTPackageRunner, DEFAULT_DBT_VERSION as _DEFAULT_DBT_VERSION, restore_venv as _restore_venv +from dlt.helpers.dbt import ( + create_venv as _create_venv, + package_runner as _package_runner, + DBTPackageRunner, + DEFAULT_DBT_VERSION as _DEFAULT_DBT_VERSION, + restore_venv as _restore_venv, +) from dlt.pipeline.pipeline import Pipeline -def get_venv(pipeline: Pipeline, venv_path: str = "dbt", dbt_version: str = _DEFAULT_DBT_VERSION) -> Venv: +def get_venv( + pipeline: Pipeline, venv_path: str = "dbt", dbt_version: str = _DEFAULT_DBT_VERSION +) -> Venv: """Creates or restores a virtual environment in which the `dbt` packages are executed. The recommended way to execute dbt package is to use a separate virtual environment where only the dbt-core @@ -42,12 +50,12 @@ def get_venv(pipeline: Pipeline, venv_path: str = "dbt", dbt_version: str = _DEF def package( - pipeline: Pipeline, - package_location: str, - package_repository_branch: str = None, - package_repository_ssh_key: TSecretValue = TSecretValue(""), # noqa - auto_full_refresh_when_out_of_sync: bool = None, - venv: Venv = None + pipeline: Pipeline, + package_location: str, + package_repository_branch: str = None, + package_repository_ssh_key: TSecretValue = TSecretValue(""), # noqa + auto_full_refresh_when_out_of_sync: bool = None, + venv: Venv = None, ) -> DBTPackageRunner: """Creates a Python wrapper over `dbt` package present at specified location, that allows to control it (ie. run and test) from Python code. 
@@ -70,7 +78,11 @@ def package( Returns: DBTPackageRunner: A configured and authenticated Python `dbt` wrapper """ - schema = pipeline.default_schema if pipeline.default_schema_name else Schema(normalize_schema_name(pipeline.dataset_name)) + schema = ( + pipeline.default_schema + if pipeline.default_schema_name + else Schema(normalize_schema_name(pipeline.dataset_name)) + ) job_client = pipeline._sql_job_client(schema) if not venv: venv = Venv.restore_current() @@ -81,5 +93,5 @@ def package( package_location, package_repository_branch, package_repository_ssh_key, - auto_full_refresh_when_out_of_sync + auto_full_refresh_when_out_of_sync, ) diff --git a/dlt/pipeline/deprecations.py b/dlt/pipeline/deprecations.py index 138167c8d3..4f714e3bea 100644 --- a/dlt/pipeline/deprecations.py +++ b/dlt/pipeline/deprecations.py @@ -13,8 +13,9 @@ def credentials_argument_deprecated( dest_name = Destination.to_name(destination) if destination else "postgres" warnings.warn( - f"The `credentials argument` to {caller_name} is deprecated and will be removed in a future version. " - f"Pass the same credentials to the `destination` instance instead, e.g. {caller_name}(destination=dlt.destinations.{dest_name}(credentials=...))", + f"The `credentials argument` to {caller_name} is deprecated and will be removed in a future" + " version. Pass the same credentials to the `destination` instance instead, e.g." + f" {caller_name}(destination=dlt.destinations.{dest_name}(credentials=...))", DeprecationWarning, stacklevel=2, ) diff --git a/dlt/pipeline/exceptions.py b/dlt/pipeline/exceptions.py index 0289c07158..6c35343015 100644 --- a/dlt/pipeline/exceptions.py +++ b/dlt/pipeline/exceptions.py @@ -6,14 +6,24 @@ class InvalidPipelineName(PipelineException, ValueError): def __init__(self, pipeline_name: str, details: str) -> None: - super().__init__(pipeline_name, f"The pipeline name {pipeline_name} contains invalid characters. The pipeline name is used to create a pipeline working directory and must be a valid directory name. The actual error is: {details}") + super().__init__( + pipeline_name, + f"The pipeline name {pipeline_name} contains invalid characters. The pipeline name is" + " used to create a pipeline working directory and must be a valid directory name. 
The" + f" actual error is: {details}", + ) class PipelineConfigMissing(PipelineException): - def __init__(self, pipeline_name: str, config_elem: str, step: TPipelineStep, _help: str = None) -> None: + def __init__( + self, pipeline_name: str, config_elem: str, step: TPipelineStep, _help: str = None + ) -> None: self.config_elem = config_elem self.step = step - msg = f"Configuration element {config_elem} was not provided and {step} step cannot be executed" + msg = ( + f"Configuration element {config_elem} was not provided and {step} step cannot be" + " executed" + ) if _help: msg += f"\n{_help}\n" super().__init__(pipeline_name, msg) @@ -21,49 +31,77 @@ def __init__(self, pipeline_name: str, config_elem: str, step: TPipelineStep, _h class CannotRestorePipelineException(PipelineException): def __init__(self, pipeline_name: str, pipelines_dir: str, reason: str) -> None: - msg = f"Pipeline with name {pipeline_name} in working directory {pipelines_dir} could not be restored: {reason}" + msg = ( + f"Pipeline with name {pipeline_name} in working directory {pipelines_dir} could not be" + f" restored: {reason}" + ) super().__init__(pipeline_name, msg) class SqlClientNotAvailable(PipelineException): - def __init__(self, pipeline_name: str,destination_name: str) -> None: - super().__init__(pipeline_name, f"SQL Client not available for destination {destination_name} in pipeline {pipeline_name}") + def __init__(self, pipeline_name: str, destination_name: str) -> None: + super().__init__( + pipeline_name, + f"SQL Client not available for destination {destination_name} in pipeline" + f" {pipeline_name}", + ) class PipelineStepFailed(PipelineException): - def __init__(self, pipeline: SupportsPipeline, step: TPipelineStep, exception: BaseException, step_info: Any = None) -> None: + def __init__( + self, + pipeline: SupportsPipeline, + step: TPipelineStep, + exception: BaseException, + step_info: Any = None, + ) -> None: self.pipeline = pipeline self.step = step self.exception = exception self.step_info = step_info - super().__init__(pipeline.pipeline_name, f"Pipeline execution failed at stage {step} with exception:\n\n{type(exception)}\n{exception}") + super().__init__( + pipeline.pipeline_name, + f"Pipeline execution failed at stage {step} with" + f" exception:\n\n{type(exception)}\n{exception}", + ) class PipelineStateEngineNoUpgradePathException(PipelineException): - def __init__(self, pipeline_name: str, init_engine: int, from_engine: int, to_engine: int) -> None: + def __init__( + self, pipeline_name: str, init_engine: int, from_engine: int, to_engine: int + ) -> None: self.init_engine = init_engine self.from_engine = from_engine self.to_engine = to_engine - super().__init__(pipeline_name, f"No engine upgrade path for state in pipeline {pipeline_name} from {init_engine} to {to_engine}, stopped at {from_engine}") + super().__init__( + pipeline_name, + f"No engine upgrade path for state in pipeline {pipeline_name} from {init_engine} to" + f" {to_engine}, stopped at {from_engine}", + ) class PipelineHasPendingDataException(PipelineException): def __init__(self, pipeline_name: str, pipelines_dir: str) -> None: msg = ( - f" Operation failed because pipeline with name {pipeline_name} in working directory {pipelines_dir} contains pending extracted files or load packages. " - "Use `dlt pipeline sync` to reset the local state then run this operation again." 
+ f" Operation failed because pipeline with name {pipeline_name} in working directory" + f" {pipelines_dir} contains pending extracted files or load packages. Use `dlt pipeline" + " sync` to reset the local state then run this operation again." ) super().__init__(pipeline_name, msg) + class PipelineNeverRan(PipelineException): def __init__(self, pipeline_name: str, pipelines_dir: str) -> None: msg = ( - f" Operation failed because pipeline with name {pipeline_name} in working directory {pipelines_dir} was never run or never synced with destination. " - "Use `dlt pipeline sync` to synchronize." + f" Operation failed because pipeline with name {pipeline_name} in working directory" + f" {pipelines_dir} was never run or never synced with destination. Use `dlt pipeline" + " sync` to synchronize." ) super().__init__(pipeline_name, msg) class PipelineNotActive(PipelineException): def __init__(self, pipeline_name: str) -> None: - super().__init__(pipeline_name, f"Pipeline {pipeline_name} is not active so it cannot be deactivated") + super().__init__( + pipeline_name, f"Pipeline {pipeline_name} is not active so it cannot be deactivated" + ) diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py index ebb85f5e23..b71dad3298 100644 --- a/dlt/pipeline/helpers.py +++ b/dlt/pipeline/helpers.py @@ -4,19 +4,35 @@ from dlt.common.jsonpath import resolve_paths, TAnyJsonPath, compile_paths from dlt.common.exceptions import TerminalException -from dlt.common.schema.utils import group_tables_by_resource, compile_simple_regexes, compile_simple_regex +from dlt.common.schema.utils import ( + group_tables_by_resource, + compile_simple_regexes, + compile_simple_regex, +) from dlt.common.schema.typing import TSimpleRegex from dlt.common.typing import REPattern -from dlt.common.pipeline import TSourceState, reset_resource_state, _sources_state, _delete_source_state_keys, _get_matching_resources +from dlt.common.pipeline import ( + TSourceState, + reset_resource_state, + _sources_state, + _delete_source_state_keys, + _get_matching_resources, +) from dlt.common.destination.reference import WithStagingDataset from dlt.destinations.exceptions import DatabaseUndefinedRelation -from dlt.pipeline.exceptions import PipelineNeverRan, PipelineStepFailed, PipelineHasPendingDataException +from dlt.pipeline.exceptions import ( + PipelineNeverRan, + PipelineStepFailed, + PipelineHasPendingDataException, +) from dlt.pipeline.typing import TPipelineStep from dlt.pipeline import Pipeline -def retry_load(retry_on_pipeline_steps: Sequence[TPipelineStep] = ("load",)) -> Callable[[BaseException], bool]: +def retry_load( + retry_on_pipeline_steps: Sequence[TPipelineStep] = ("load",) +) -> Callable[[BaseException], bool]: """A retry strategy for Tenacity that, with default setting, will repeat `load` step for all exceptions that are not terminal Use this condition with tenacity `retry_if_exception`. Terminal exceptions are exceptions that will not go away when operations is repeated. @@ -31,12 +47,15 @@ def retry_load(retry_on_pipeline_steps: Sequence[TPipelineStep] = ("load",)) -> retry_on_pipeline_steps (Tuple[TPipelineStep, ...], optional): which pipeline steps are allowed to be repeated. 
Default: "load" """ + def _retry_load(ex: BaseException) -> bool: # do not retry in normalize or extract stages if isinstance(ex, PipelineStepFailed) and ex.step not in retry_on_pipeline_steps: return False # do not retry on terminal exceptions - if isinstance(ex, TerminalException) or (ex.__context__ is not None and isinstance(ex.__context__, TerminalException)): + if isinstance(ex, TerminalException) or ( + ex.__context__ is not None and isinstance(ex.__context__, TerminalException) + ): return False return True @@ -83,14 +102,16 @@ def __init__( resources = set(resources) resource_names = [] if drop_all: - self.resource_pattern = compile_simple_regex(TSimpleRegex('re:.*')) # Match everything + self.resource_pattern = compile_simple_regex(TSimpleRegex("re:.*")) # Match everything elif resources: self.resource_pattern = compile_simple_regexes(TSimpleRegex(r) for r in resources) else: self.resource_pattern = None if self.resource_pattern: - data_tables = {t["name"]: t for t in self.schema.data_tables()} # Don't remove _dlt tables + data_tables = { + t["name"]: t for t in self.schema.data_tables() + } # Don't remove _dlt tables resource_tables = group_tables_by_resource(data_tables, pattern=self.resource_pattern) if self.drop_tables: self.tables_to_drop = list(chain.from_iterable(resource_tables.values())) @@ -105,25 +126,34 @@ def __init__( self.drop_all = drop_all self.info: _DropInfo = dict( - tables=[t['name'] for t in self.tables_to_drop], resource_states=[], state_paths=[], + tables=[t["name"] for t in self.tables_to_drop], + resource_states=[], + state_paths=[], resource_names=resource_names, - schema_name=self.schema.name, dataset_name=self.pipeline.dataset_name, + schema_name=self.schema.name, + dataset_name=self.pipeline.dataset_name, drop_all=drop_all, resource_pattern=self.resource_pattern, - warnings=[] + warnings=[], ) if self.resource_pattern and not resource_tables: - self.info['warnings'].append( - f"Specified resource(s) {str(resources)} did not select any table(s) in schema {self.schema.name}. Possible resources are: {list(group_tables_by_resource(data_tables).keys())}" + self.info["warnings"].append( + f"Specified resource(s) {str(resources)} did not select any table(s) in schema" + f" {self.schema.name}. 
Possible resources are:" + f" {list(group_tables_by_resource(data_tables).keys())}" ) self._new_state = self._create_modified_state() @property def is_empty(self) -> bool: - return len(self.info['tables']) == 0 and len(self.info["state_paths"]) == 0 and len(self.info["resource_states"]) == 0 + return ( + len(self.info["tables"]) == 0 + and len(self.info["state_paths"]) == 0 + and len(self.info["resource_states"]) == 0 + ) def _drop_destination_tables(self) -> None: - table_names = [tbl['name'] for tbl in self.tables_to_drop] + table_names = [tbl["name"] for tbl in self.tables_to_drop] with self.pipeline._sql_job_client(self.schema) as client: client.drop_tables(*table_names) # also delete staging but ignore if staging does not exist @@ -134,7 +164,7 @@ def _drop_destination_tables(self) -> None: def _delete_pipeline_tables(self) -> None: for tbl in self.tables_to_drop: - del self.schema_tables[tbl['name']] + del self.schema_tables[tbl["name"]] self.schema.bump_version() def _list_state_paths(self, source_state: Dict[str, Any]) -> List[str]: @@ -149,14 +179,17 @@ def _create_modified_state(self) -> Dict[str, Any]: # drop table states if self.drop_state and self.resource_pattern: for key in _get_matching_resources(self.resource_pattern, source_state): - self.info['resource_states'].append(key) + self.info["resource_states"].append(key) reset_resource_state(key, source_state) # drop additional state paths resolved_paths = resolve_paths(self.state_paths_to_drop, source_state) if self.state_paths_to_drop and not resolved_paths: - self.info['warnings'].append(f"State paths {self.state_paths_to_drop} did not select any paths in source {source_name}") + self.info["warnings"].append( + f"State paths {self.state_paths_to_drop} did not select any paths in source" + f" {source_name}" + ) _delete_source_state_keys(resolved_paths, source_state) - self.info['state_paths'].extend(f"{source_name}.{p}" for p in resolved_paths) + self.info["state_paths"].extend(f"{source_name}.{p}" for p in resolved_paths) return state # type: ignore[return-value] def _drop_state_keys(self) -> None: @@ -166,8 +199,12 @@ def _drop_state_keys(self) -> None: state.update(self._new_state) def __call__(self) -> None: - if self.pipeline.has_pending_data: # Raise when there are pending extracted/load files to prevent conflicts - raise PipelineHasPendingDataException(self.pipeline.pipeline_name, self.pipeline.pipelines_dir) + if ( + self.pipeline.has_pending_data + ): # Raise when there are pending extracted/load files to prevent conflicts + raise PipelineHasPendingDataException( + self.pipeline.pipeline_name, self.pipeline.pipelines_dir + ) self.pipeline.sync_destination() if not self.drop_state and not self.drop_tables: @@ -198,6 +235,6 @@ def drop( schema_name: str = None, state_paths: TAnyJsonPath = (), drop_all: bool = False, - state_only: bool = False + state_only: bool = False, ) -> None: return DropCommand(pipeline, resources, schema_name, state_paths, drop_all, state_only)() diff --git a/dlt/pipeline/mark.py b/dlt/pipeline/mark.py index 14a7108683..3b9b3ccfc7 100644 --- a/dlt/pipeline/mark.py +++ b/dlt/pipeline/mark.py @@ -1,2 +1,2 @@ """Module with market functions that make data to be specially processed""" -from dlt.extract import with_table_name \ No newline at end of file +from dlt.extract import with_table_name diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index c893fd4e75..4c45f0e486 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -4,32 +4,86 @@ from contextlib import 
contextmanager from functools import wraps from collections.abc import Sequence as C_Sequence -from typing import Any, Callable, ClassVar, List, Iterator, Optional, Sequence, Tuple, cast, get_type_hints, ContextManager +from typing import ( + Any, + Callable, + ClassVar, + List, + Iterator, + Optional, + Sequence, + Tuple, + cast, + get_type_hints, + ContextManager, +) from dlt import version from dlt.common import json, logger, pendulum from dlt.common.configuration import inject_section, known_sections from dlt.common.configuration.specs import RunConfiguration, CredentialsConfiguration from dlt.common.configuration.container import Container -from dlt.common.configuration.exceptions import ConfigFieldMissingException, ContextDefaultCannotBeCreated +from dlt.common.configuration.exceptions import ( + ConfigFieldMissingException, + ContextDefaultCannotBeCreated, +) from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.resolve import initialize_credentials -from dlt.common.exceptions import (DestinationLoadingViaStagingNotSupported, DestinationLoadingWithoutStagingNotSupported, DestinationNoStagingMode, - MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException) +from dlt.common.exceptions import ( + DestinationLoadingViaStagingNotSupported, + DestinationLoadingWithoutStagingNotSupported, + DestinationNoStagingMode, + MissingDependencyException, + DestinationUndefinedEntity, + DestinationIncompatibleLoaderFileFormatException, +) from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract +from dlt.common.schema.typing import ( + TColumnNames, + TColumnSchema, + TSchemaTables, + TWriteDisposition, + TAnySchemaColumns, + TSchemaContract, +) from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner -from dlt.common.storages import LiveSchemaStorage, NormalizeStorage, LoadStorage, SchemaStorage, FileStorage, NormalizeStorageConfiguration, SchemaStorageConfiguration, LoadStorageConfiguration +from dlt.common.storages import ( + LiveSchemaStorage, + NormalizeStorage, + LoadStorage, + SchemaStorage, + FileStorage, + NormalizeStorageConfiguration, + SchemaStorageConfiguration, + LoadStorageConfiguration, +) from dlt.common.destination import DestinationCapabilitiesContext, TDestination -from dlt.common.destination.reference import (DestinationClientDwhConfiguration, WithStateSync, Destination, JobClientBase, DestinationClientConfiguration, - TDestinationReferenceArg, DestinationClientStagingConfiguration, DestinationClientStagingConfiguration, - DestinationClientDwhWithStagingConfiguration) +from dlt.common.destination.reference import ( + DestinationClientDwhConfiguration, + WithStateSync, + Destination, + JobClientBase, + DestinationClientConfiguration, + TDestinationReferenceArg, + DestinationClientStagingConfiguration, + DestinationClientStagingConfiguration, + DestinationClientDwhWithStagingConfiguration, +) from dlt.common.destination.capabilities import INTERNAL_LOADER_FILE_FORMATS -from dlt.common.pipeline import ExtractInfo, LoadInfo, NormalizeInfo, PipelineContext, 
SupportsPipeline, TPipelineLocalState, TPipelineState, StateInjectableContext +from dlt.common.pipeline import ( + ExtractInfo, + LoadInfo, + NormalizeInfo, + PipelineContext, + SupportsPipeline, + TPipelineLocalState, + TPipelineState, + StateInjectableContext, +) from dlt.common.schema import Schema from dlt.common.utils import is_interactive from dlt.common.data_writers import TLoaderFileFormat @@ -46,15 +100,39 @@ from dlt.pipeline.configuration import PipelineConfiguration from dlt.pipeline.progress import _Collector, _NULL_COLLECTOR -from dlt.pipeline.exceptions import CannotRestorePipelineException, InvalidPipelineName, PipelineConfigMissing, PipelineNotActive, PipelineStepFailed, SqlClientNotAvailable -from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, load_trace, merge_traces, start_trace, start_trace_step, end_trace_step, end_trace, describe_extract_data +from dlt.pipeline.exceptions import ( + CannotRestorePipelineException, + InvalidPipelineName, + PipelineConfigMissing, + PipelineNotActive, + PipelineStepFailed, + SqlClientNotAvailable, +) +from dlt.pipeline.trace import ( + PipelineTrace, + PipelineStepTrace, + load_trace, + merge_traces, + start_trace, + start_trace_step, + end_trace_step, + end_trace, + describe_extract_data, +) from dlt.pipeline.typing import TPipelineStep -from dlt.pipeline.state_sync import STATE_ENGINE_VERSION, load_state_from_destination, merge_state_if_changed, migrate_state, state_resource, json_encode_state, json_decode_state +from dlt.pipeline.state_sync import ( + STATE_ENGINE_VERSION, + load_state_from_destination, + merge_state_if_changed, + migrate_state, + state_resource, + json_encode_state, + json_decode_state, +) from dlt.pipeline.deprecations import credentials_argument_deprecated def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]: - def decorator(f: TFun) -> TFun: @wraps(f) def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: @@ -73,7 +151,6 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: def with_schemas_sync(f: TFun) -> TFun: - @wraps(f) def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: for name in self._schema_storage.live_schemas: @@ -91,7 +168,6 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: def with_runtime_trace(f: TFun) -> TFun: - @wraps(f) def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: trace: PipelineTrace = self._trace @@ -119,12 +195,16 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # if there was a step, finish it end_trace_step(self._trace, trace_step, self, step_info) if is_new_trace: - assert trace is self._trace, f"Messed up trace reference {id(self._trace)} vs {id(trace)}" + assert ( + trace is self._trace + ), f"Messed up trace reference {id(self._trace)} vs {id(trace)}" end_trace(trace, self, self._pipeline_storage.storage_path) finally: # always end trace if is_new_trace: - assert self._trace == trace, f"Messed up trace reference {id(self._trace)} vs {id(trace)}" + assert ( + self._trace == trace + ), f"Messed up trace reference {id(self._trace)} vs {id(trace)}" # if we end new trace that had only 1 step, add it to previous trace # this way we combine several separate calls to extract, normalize, load as single trace # the trace of "run" has many steps and will not be merged @@ -135,13 +215,13 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: def with_config_section(sections: Tuple[str, ...]) -> Callable[[TFun], TFun]: - def decorator(f: TFun) -> TFun: - @wraps(f) def 
_wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # add section context to the container to be used by all configuration without explicit sections resolution - with inject_section(ConfigSectionContext(pipeline_name=self.pipeline_name, sections=sections)): + with inject_section( + ConfigSectionContext(pipeline_name=self.pipeline_name, sections=sections) + ): return f(self, *args, **kwargs) return _wrap # type: ignore @@ -150,9 +230,10 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: class Pipeline(SupportsPipeline): - STATE_FILE: ClassVar[str] = "state.json" - STATE_PROPS: ClassVar[List[str]] = list(set(get_type_hints(TPipelineState).keys()) - {"sources"}) + STATE_PROPS: ClassVar[List[str]] = list( + set(get_type_hints(TPipelineState).keys()) - {"sources"} + ) LOCAL_STATE_PROPS: ClassVar[List[str]] = list(get_type_hints(TPipelineLocalState).keys()) DEFAULT_DATASET_SUFFIX: ClassVar[str] = "_dataset" @@ -181,22 +262,22 @@ class Pipeline(SupportsPipeline): runtime_config: RunConfiguration def __init__( - self, - pipeline_name: str, - pipelines_dir: str, - pipeline_salt: TSecretValue, - destination: TDestination, - staging: TDestination, - dataset_name: str, - credentials: Any, - import_schema_path: str, - export_schema_path: str, - full_refresh: bool, - progress: _Collector, - must_attach_to_local_pipeline: bool, - config: PipelineConfiguration, - runtime: RunConfiguration, - ) -> None: + self, + pipeline_name: str, + pipelines_dir: str, + pipeline_salt: TSecretValue, + destination: TDestination, + staging: TDestination, + dataset_name: str, + credentials: Any, + import_schema_path: str, + export_schema_path: str, + full_refresh: bool, + progress: _Collector, + must_attach_to_local_pipeline: bool, + config: PipelineConfiguration, + runtime: RunConfiguration, + ) -> None: """Initializes the Pipeline class which implements `dlt` pipeline. Please use `pipeline` function in `dlt` module to create a new Pipeline instance.""" self.pipeline_salt = pipeline_salt self.config = config @@ -251,7 +332,7 @@ def drop(self) -> "Pipeline": self.collector, False, self.config, - self.runtime_config + self.runtime_config, ) @with_runtime_trace @@ -270,7 +351,7 @@ def extract( schema: Schema = None, max_parallel_items: int = None, workers: int = None, - schema_contract: TSchemaContract = None + schema_contract: TSchemaContract = None, ) -> ExtractInfo: """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. 
See `run` method for the arguments' description.""" # create extract storage to which all the sources will be extracted @@ -279,7 +360,16 @@ def extract( try: with self._maybe_destination_capabilities(): # extract all sources - for source in self._data_to_sources(data, schema, table_name, parent_table_name, write_disposition, columns, primary_key, schema_contract): + for source in self._data_to_sources( + data, + schema, + table_name, + parent_table_name, + write_disposition, + columns, + primary_key, + schema_contract, + ): if source.exhausted: raise SourceExhausted(source.name) # TODO: merge infos for all the sources @@ -294,12 +384,16 @@ def extract( return ExtractInfo(describe_extract_data(data)) except Exception as exc: # TODO: provide metrics from extractor - raise PipelineStepFailed(self, "extract", exc, ExtractInfo(describe_extract_data(data))) from exc + raise PipelineStepFailed( + self, "extract", exc, ExtractInfo(describe_extract_data(data)) + ) from exc @with_runtime_trace @with_schemas_sync @with_config_section((known_sections.NORMALIZE,)) - def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = None) -> NormalizeInfo: + def normalize( + self, workers: int = 1, loader_file_format: TLoaderFileFormat = None + ) -> NormalizeInfo: """Normalizes the data prepared with `extract` method, infers the schema and creates load packages for the `load` method. Requires `destination` to be known.""" if is_interactive(): workers = 1 @@ -316,18 +410,24 @@ def normalize(self, workers: int = 1, loader_file_format: TLoaderFileFormat = No workers=workers, _schema_storage_config=self._schema_storage_config, _normalize_storage_config=self._normalize_storage_config, - _load_storage_config=self._load_storage_config + _load_storage_config=self._load_storage_config, ) # run with destination context with self._maybe_destination_capabilities(loader_file_format=loader_file_format): # shares schema storage with the pipeline so we do not need to install - normalize = Normalize(collector=self.collector, config=normalize_config, schema_storage=self._schema_storage) + normalize = Normalize( + collector=self.collector, + config=normalize_config, + schema_storage=self._schema_storage, + ) try: with signals.delayed_signals(): runner.run_pool(normalize.config, normalize) return normalize.get_normalize_info() except Exception as n_ex: - raise PipelineStepFailed(self, "normalize", n_ex, normalize.get_normalize_info()) from n_ex + raise PipelineStepFailed( + self, "normalize", n_ex, normalize.get_normalize_info() + ) from n_ex @with_runtime_trace @with_schemas_sync @@ -340,7 +440,7 @@ def load( credentials: Any = None, *, workers: int = 20, - raise_on_failed_jobs: bool = False + raise_on_failed_jobs: bool = False, ) -> LoadInfo: """Loads the packages prepared by `normalize` method into the `dataset_name` at `destination`, using provided `credentials`""" # set destination and default dataset if provided @@ -362,7 +462,7 @@ def load( load_config = LoaderConfiguration( workers=workers, raise_on_failed_jobs=raise_on_failed_jobs, - _load_storage_config=self._load_storage_config + _load_storage_config=self._load_storage_config, ) load = Load( self.destination, @@ -371,7 +471,7 @@ def load( is_storage_owner=False, config=load_config, initial_client_config=client.config, - initial_staging_client_config=staging_client.config if staging_client else None + initial_staging_client_config=staging_client.config if staging_client else None, ) try: with signals.delayed_signals(): @@ -398,7 +498,7 @@ def run( 
primary_key: TColumnNames = None, schema: Schema = None, loader_file_format: TLoaderFileFormat = None, - schema_contract: TSchemaContract = None + schema_contract: TSchemaContract = None, ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -462,7 +562,12 @@ def run( credentials_argument_deprecated("pipeline.run", credentials, self.destination) # sync state with destination - if self.config.restore_from_destination and not self.full_refresh and not self._state_restored and (self.destination or destination): + if ( + self.config.restore_from_destination + and not self.full_refresh + and not self._state_restored + and (self.destination or destination) + ): self.sync_destination(destination, staging, dataset_name) # sync only once self._state_restored = True @@ -473,20 +578,36 @@ def run( if self.list_normalized_load_packages(): # if there were any pending loads, load them and **exit** if data is not None: - logger.warn("The pipeline `run` method will now load the pending load packages. The data you passed to the run function will not be loaded. In order to do that you must run the pipeline again") + logger.warn( + "The pipeline `run` method will now load the pending load packages. The data" + " you passed to the run function will not be loaded. In order to do that you" + " must run the pipeline again" + ) return self.load(destination, dataset_name, credentials=credentials) - # extract from the source if data is not None: - self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema, schema_contract=schema_contract) + self.extract( + data, + table_name=table_name, + write_disposition=write_disposition, + columns=columns, + primary_key=primary_key, + schema=schema, + schema_contract=schema_contract, + ) self.normalize(loader_file_format=loader_file_format) return self.load(destination, dataset_name, credentials=credentials) else: return None @with_schemas_sync - def sync_destination(self, destination: TDestinationReferenceArg = None, staging: TDestinationReferenceArg = None, dataset_name: str = None) -> None: + def sync_destination( + self, + destination: TDestinationReferenceArg = None, + staging: TDestinationReferenceArg = None, + dataset_name: str = None, + ) -> None: """Synchronizes pipeline state with the `destination`'s state kept in `dataset_name` #### Note: @@ -513,7 +634,9 @@ def sync_destination(self, destination: TDestinationReferenceArg = None, staging # print(f'REMOTE STATE: {(remote_state or {}).get("_state_version")} >= {state["_state_version"]}') if remote_state and remote_state["_state_version"] >= state["_state_version"]: # compare changes and updates local state - merged_state = merge_state_if_changed(state, remote_state, increase_version=False) + merged_state = merge_state_if_changed( + state, remote_state, increase_version=False + ) # print(f"MERGED STATE: {bool(merged_state)}") if merged_state: # see if state didn't change the pipeline name @@ -521,15 +644,20 @@ def sync_destination(self, destination: TDestinationReferenceArg = None, staging raise CannotRestorePipelineException( state["pipeline_name"], self.pipelines_dir, - f"destination state contains state for pipeline with name {remote_state['pipeline_name']}" + "destination state contains state for pipeline with name" + f" {remote_state['pipeline_name']}", ) # if state was modified force get all schemas - restored_schemas = 
self._get_schemas_from_destination(merged_state["schema_names"], always_download=True) + restored_schemas = self._get_schemas_from_destination( + merged_state["schema_names"], always_download=True + ) # TODO: we should probably wipe out pipeline here # if we didn't full refresh schemas, get only missing schemas if restored_schemas is None: - restored_schemas = self._get_schemas_from_destination(state["schema_names"], always_download=False) + restored_schemas = self._get_schemas_from_destination( + state["schema_names"], always_download=False + ) # commit all the changes locally if merged_state: # set the pipeline props from merged state @@ -559,8 +687,11 @@ def sync_destination(self, destination: TDestinationReferenceArg = None, staging # reset pipeline self._wipe_working_folder() state = self._get_state() - self._configure(self._schema_storage_config.export_schema_path, self._schema_storage_config.import_schema_path, False) - + self._configure( + self._schema_storage_config.export_schema_path, + self._schema_storage_config.import_schema_path, + False, + ) # write the state back state = merged_state or state @@ -603,12 +734,20 @@ def deactivate(self) -> None: @property def has_data(self) -> bool: """Tells if the pipeline contains any data: schemas, extracted files, load packages or loaded packages in the destination""" - return not self.first_run or bool(self.schema_names) or len(self.list_extracted_resources()) > 0 or len(self.list_normalized_load_packages()) > 0 + return ( + not self.first_run + or bool(self.schema_names) + or len(self.list_extracted_resources()) > 0 + or len(self.list_normalized_load_packages()) > 0 + ) @property def has_pending_data(self) -> bool: """Tells if the pipeline contains any extracted files or pending load packages""" - return len(self.list_normalized_load_packages()) > 0 or len(self.list_extracted_resources()) > 0 + return ( + len(self.list_normalized_load_packages()) > 0 + or len(self.list_extracted_resources()) > 0 + ) @property def schemas(self) -> SchemaStorage: @@ -668,7 +807,13 @@ def drop_pending_packages(self, with_partial_loads: bool = True) -> None: def sync_schema(self, schema_name: str = None, credentials: Any = None) -> TSchemaTables: """Synchronizes the schema `schema_name` with the destination. If no name is provided, the default schema will be synchronized.""" if not schema_name and not self.default_schema_name: - raise PipelineConfigMissing(self.pipeline_name, "default_schema_name", "load", "Pipeline contains no schemas. Please extract any data with `extract` or `run` methods.") + raise PipelineConfigMissing( + self.pipeline_name, + "default_schema_name", + "load", + "Pipeline contains no schemas. Please extract any data with `extract` or `run`" + " methods.", + ) schema = self.schemas[schema_name] if schema_name else self.default_schema client_config = self._get_destination_client_initial_config(credentials) @@ -694,19 +839,19 @@ def get_local_state_val(self, key: str) -> Any: state = self._container[StateInjectableContext].state except ContextDefaultCannotBeCreated: state = self._get_state() - return state["_local"][key] # type: ignore + return state["_local"][key] # type: ignore def sql_client(self, schema_name: str = None, credentials: Any = None) -> SqlClientBase[Any]: """Returns a sql client configured to query/change the destination and dataset that were used to load the data. 
- Use the client with `with` statement to manage opening and closing connection to the destination: - >>> with pipeline.sql_client() as client: - >>> with client.execute_query( - >>> "SELECT id, name, email FROM customers WHERE id = %s", 10 - >>> ) as cursor: - >>> print(cursor.fetchall()) - - The client is authenticated and defaults all queries to dataset_name used by the pipeline. You can provide alternative - `schema_name` which will be used to normalize dataset name and alternative `credentials`. + Use the client with `with` statement to manage opening and closing connection to the destination: + >>> with pipeline.sql_client() as client: + >>> with client.execute_query( + >>> "SELECT id, name, email FROM customers WHERE id = %s", 10 + >>> ) as cursor: + >>> print(cursor.fetchall()) + + The client is authenticated and defaults all queries to dataset_name used by the pipeline. You can provide alternative + `schema_name` which will be used to normalize dataset name and alternative `credentials`. """ # if not self.default_schema_name and not schema_name: # raise PipelineConfigMissing( @@ -720,12 +865,12 @@ def sql_client(self, schema_name: str = None, credentials: Any = None) -> SqlCli def destination_client(self, schema_name: str = None, credentials: Any = None) -> JobClientBase: """Get the destination job client for the configured destination - Use the client with `with` statement to manage opening and closing connection to the destination: - >>> with pipeline.destination_client() as client: - >>> client.drop_storage() # removes storage which typically wipes all data in it + Use the client with `with` statement to manage opening and closing connection to the destination: + >>> with pipeline.destination_client() as client: + >>> client.drop_storage() # removes storage which typically wipes all data in it - The client is authenticated. You can provide alternative `schema_name` which will be used to normalize dataset name and alternative `credentials`. - If no schema name is provided and no default schema is present in the pipeline, and ad hoc schema will be created and discarded after use. + The client is authenticated. You can provide alternative `schema_name` which will be used to normalize dataset name and alternative `credentials`. + If no schema name is provided and no default schema is present in the pipeline, and ad hoc schema will be created and discarded after use. 
""" schema = self._get_schema_or_create(schema_name) client_config = self._get_destination_client_initial_config(credentials) @@ -752,7 +897,12 @@ def _get_normalize_storage(self) -> NormalizeStorage: def _get_load_storage(self) -> LoadStorage: caps = self._get_destination_capabilities() - return LoadStorage(True, caps.preferred_loader_file_format, caps.supported_loader_file_formats, self._load_storage_config) + return LoadStorage( + True, + caps.preferred_loader_file_format, + caps.supported_loader_file_formats, + self._load_storage_config, + ) def _init_working_dir(self, pipeline_name: str, pipelines_dir: str) -> None: self.pipeline_name = pipeline_name @@ -766,21 +916,31 @@ def _init_working_dir(self, pipeline_name: str, pipelines_dir: str) -> None: if self.full_refresh: self._wipe_working_folder() - def _configure(self, import_schema_path: str, export_schema_path: str, must_attach_to_local_pipeline: bool) -> None: + def _configure( + self, import_schema_path: str, export_schema_path: str, must_attach_to_local_pipeline: bool + ) -> None: # create schema storage and folders self._schema_storage_config = SchemaStorageConfiguration( schema_volume_path=os.path.join(self.working_dir, "schemas"), import_schema_path=import_schema_path, - export_schema_path=export_schema_path + export_schema_path=export_schema_path, ) # create default configs - self._normalize_storage_config = NormalizeStorageConfiguration(normalize_volume_path=os.path.join(self.working_dir, "normalize")) - self._load_storage_config = LoadStorageConfiguration(load_volume_path=os.path.join(self.working_dir, "load"),) + self._normalize_storage_config = NormalizeStorageConfiguration( + normalize_volume_path=os.path.join(self.working_dir, "normalize") + ) + self._load_storage_config = LoadStorageConfiguration( + load_volume_path=os.path.join(self.working_dir, "load"), + ) # are we running again? 
has_state = self._pipeline_storage.has_file(Pipeline.STATE_FILE) if must_attach_to_local_pipeline and not has_state: - raise CannotRestorePipelineException(self.pipeline_name, self.pipelines_dir, f"the pipeline was not found in {self.working_dir}.") + raise CannotRestorePipelineException( + self.pipeline_name, + self.pipelines_dir, + f"the pipeline was not found in {self.working_dir}.", + ) self.must_attach_to_local_pipeline = must_attach_to_local_pipeline # attach to pipeline if folder exists and contains state @@ -808,7 +968,8 @@ def _wipe_working_folder(self) -> None: def _attach_pipeline(self) -> None: pass - def _data_to_sources(self, + def _data_to_sources( + self, data: Any, schema: Schema, table_name: str = None, @@ -816,9 +977,8 @@ def _data_to_sources(self, write_disposition: TWriteDisposition = None, columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, - schema_contract: TSchemaContract = None + schema_contract: TSchemaContract = None, ) -> List[DltSource]: - def apply_hint_args(resource: DltResource) -> None: resource.apply_hints( table_name, @@ -826,7 +986,7 @@ def apply_hint_args(resource: DltResource) -> None: write_disposition, columns, primary_key, - schema_contract=schema_contract + schema_contract=schema_contract, ) def apply_settings(source_: DltSource) -> None: @@ -837,7 +997,7 @@ def apply_settings(source_: DltSource) -> None: def choose_schema() -> Schema: """Except of explicitly passed schema, use a clone that will get discarded if extraction fails""" if schema: - schema_ = schema + schema_ = schema elif self.default_schema_name: schema_ = self.default_schema.clone() else: @@ -859,9 +1019,7 @@ def append_data(data_item: Any) -> None: elif isinstance(data_item, DltResource): # do not set section to prevent source that represent a standalone resource # to overwrite other standalone resources (ie. 
parents) in that source - sources.append( - DltSource(effective_schema, "", [data_item]) - ) + sources.append(DltSource(effective_schema, "", [data_item])) else: # iterator/iterable/generator # create resource first without table template @@ -891,7 +1049,9 @@ def append_data(data_item: Any) -> None: return sources - def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int) -> str: + def _extract_source( + self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int + ) -> str: # discover the existing pipeline schema if source.schema.name in self.schemas: # use clone until extraction complete @@ -903,7 +1063,9 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para source.schema = pipeline_schema # extract into pipeline schema - extract_id = extract_with_schema(storage, source, self.collector, max_parallel_items, workers) + extract_id = extract_with_schema( + storage, source, self.collector, max_parallel_items, workers + ) # save import with fully discovered schema self._schema_storage.save_import_schema_if_not_exists(source.schema) @@ -918,14 +1080,17 @@ def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_para return extract_id - def _get_destination_client_initial_config(self, destination: TDestination = None, credentials: Any = None, as_staging: bool = False) -> DestinationClientConfiguration: + def _get_destination_client_initial_config( + self, destination: TDestination = None, credentials: Any = None, as_staging: bool = False + ) -> DestinationClientConfiguration: destination = destination or self.destination if not destination: raise PipelineConfigMissing( self.pipeline_name, "destination", "load", - "Please provide `destination` argument to `pipeline`, `run` or `load` method directly or via .dlt config.toml file or environment variable." + "Please provide `destination` argument to `pipeline`, `run` or `load` method" + " directly or via .dlt config.toml file or environment variable.", ) # create initial destination client config client_spec = destination.spec @@ -936,27 +1101,41 @@ def _get_destination_client_initial_config(self, destination: TDestination = Non if credentials is not None and not isinstance(credentials, CredentialsConfiguration): # use passed credentials as initial value. initial value may resolve credentials credentials = initialize_credentials( - client_spec.get_resolvable_fields()["credentials"], - credentials + client_spec.get_resolvable_fields()["credentials"], credentials ) # this client support many schemas and datasets if issubclass(client_spec, DestinationClientDwhConfiguration): if not self.dataset_name and self.full_refresh: - logger.warning("Full refresh may not work if dataset name is not set. Please set the dataset_name argument in dlt.pipeline or run method") + logger.warning( + "Full refresh may not work if dataset name is not set. 
Please set the" + " dataset_name argument in dlt.pipeline or run method" + ) # set default schema name to load all incoming data to a single dataset, no matter what is the current schema name - default_schema_name = None if self.config.use_single_dataset else self.default_schema_name + default_schema_name = ( + None if self.config.use_single_dataset else self.default_schema_name + ) if issubclass(client_spec, DestinationClientStagingConfiguration): - return client_spec(dataset_name=self.dataset_name, default_schema_name=default_schema_name, credentials=credentials, as_staging=as_staging) - return client_spec(dataset_name=self.dataset_name, default_schema_name=default_schema_name, credentials=credentials) + return client_spec( + dataset_name=self.dataset_name, + default_schema_name=default_schema_name, + credentials=credentials, + as_staging=as_staging, + ) + return client_spec( + dataset_name=self.dataset_name, + default_schema_name=default_schema_name, + credentials=credentials, + ) return client_spec(credentials=credentials) - def _get_destination_clients(self, + def _get_destination_clients( + self, schema: Schema, initial_config: DestinationClientConfiguration = None, - initial_staging_config: DestinationClientConfiguration = None + initial_staging_config: DestinationClientConfiguration = None, ) -> Tuple[JobClientBase, JobClientBase]: try: # resolve staging config in order to pass it to destination client config @@ -964,14 +1143,20 @@ def _get_destination_clients(self, if self.staging: if not initial_staging_config: # this is just initial config - without user configuration injected - initial_staging_config = self._get_destination_client_initial_config(self.staging, as_staging=True) + initial_staging_config = self._get_destination_client_initial_config( + self.staging, as_staging=True + ) # create the client - that will also resolve the config staging_client = self.staging.client(schema, initial_staging_config) if not initial_config: # config is not provided then get it with injected credentials initial_config = self._get_destination_client_initial_config(self.destination) # attach the staging client config to destination client config - if its type supports it - if self.staging and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) and isinstance(staging_client.config ,DestinationClientStagingConfiguration): + if ( + self.staging + and isinstance(initial_config, DestinationClientDwhWithStagingConfiguration) + and isinstance(staging_client.config, DestinationClientStagingConfiguration) + ): initial_config.staging_config = staging_client.config # create instance with initial_config properly set client = self.destination.client(schema, initial_config) @@ -981,17 +1166,18 @@ def _get_destination_clients(self, raise MissingDependencyException( f"{client_spec.destination_name} destination", [f"{version.DLT_PKG_NAME}[{client_spec.destination_name}]"], - "Dependencies for specific destinations are available as extras of dlt" + "Dependencies for specific destinations are available as extras of dlt", ) def _get_destination_capabilities(self) -> DestinationCapabilitiesContext: if not self.destination: - raise PipelineConfigMissing( - self.pipeline_name, - "destination", - "normalize", - "Please provide `destination` argument to `pipeline`, `run` or `load` method directly or via .dlt config.toml file or environment variable." 
- ) + raise PipelineConfigMissing( + self.pipeline_name, + "destination", + "normalize", + "Please provide `destination` argument to `pipeline`, `run` or `load` method" + " directly or via .dlt config.toml file or environment variable.", + ) return self.destination.capabilities() def _get_staging_capabilities(self) -> Optional[DestinationCapabilitiesContext]: @@ -1017,25 +1203,38 @@ def _set_context(self, is_active: bool) -> None: # set destination context on activation if self.destination: # inject capabilities context - self._container[DestinationCapabilitiesContext] = self._get_destination_capabilities() + self._container[DestinationCapabilitiesContext] = ( + self._get_destination_capabilities() + ) else: # remove destination context on deactivation if DestinationCapabilitiesContext in self._container: del self._container[DestinationCapabilitiesContext] - def _set_destinations(self, destination: TDestinationReferenceArg, staging: TDestinationReferenceArg) -> None: + def _set_destinations( + self, destination: TDestinationReferenceArg, staging: TDestinationReferenceArg + ) -> None: # destination_mod = DestinationReference.from_name(destination) if destination: self.destination = Destination.from_reference(destination) - if destination and not self.destination.capabilities().supported_loader_file_formats and not staging: - logger.warning(f"The destination {self.destination.name} requires the filesystem staging destination to be set, but it was not provided. Setting it to 'filesystem'.") + if ( + destination + and not self.destination.capabilities().supported_loader_file_formats + and not staging + ): + logger.warning( + f"The destination {self.destination.name} requires the filesystem staging" + " destination to be set, but it was not provided. Setting it to 'filesystem'." 
+ ) staging = "filesystem" if staging: # staging_module = DestinationReference.from_name(staging) staging_module = Destination.from_reference(staging) - if staging_module and not issubclass(staging_module.spec, DestinationClientStagingConfiguration): + if staging_module and not issubclass( + staging_module.spec, DestinationClientStagingConfiguration + ): raise DestinationNoStagingMode(staging_module.name) self.staging = staging_module or self.staging @@ -1044,7 +1243,9 @@ def _set_destinations(self, destination: TDestinationReferenceArg, staging: TDes self._set_default_normalizers() @contextmanager - def _maybe_destination_capabilities(self, loader_file_format: TLoaderFileFormat = None) -> Iterator[DestinationCapabilitiesContext]: + def _maybe_destination_capabilities( + self, loader_file_format: TLoaderFileFormat = None + ) -> Iterator[DestinationCapabilitiesContext]: try: caps: DestinationCapabilitiesContext = None injected_caps: ContextManager[DestinationCapabilitiesContext] = None @@ -1056,10 +1257,17 @@ def _maybe_destination_capabilities(self, loader_file_format: TLoaderFileFormat caps.preferred_loader_file_format = self._resolve_loader_file_format( self.destination.name, - # DestinationReference.to_name(self.destination), - self.staging.name if self.staging else None, + ( + # DestinationReference.to_name(self.destination), + self.staging.name + if self.staging + else None + ), # DestinationReference.to_name(self.staging) if self.staging else None, - destination_caps, stage_caps, loader_file_format) + destination_caps, + stage_caps, + loader_file_format, + ) caps.supported_loader_file_formats = ( destination_caps.supported_staging_file_formats if stage_caps else None ) or destination_caps.supported_loader_file_formats @@ -1070,17 +1278,21 @@ def _maybe_destination_capabilities(self, loader_file_format: TLoaderFileFormat @staticmethod def _resolve_loader_file_format( - destination: str, - staging: str, - dest_caps: DestinationCapabilitiesContext, - stage_caps: DestinationCapabilitiesContext, - file_format: TLoaderFileFormat) -> TLoaderFileFormat: - + destination: str, + staging: str, + dest_caps: DestinationCapabilitiesContext, + stage_caps: DestinationCapabilitiesContext, + file_format: TLoaderFileFormat, + ) -> TLoaderFileFormat: possible_file_formats = dest_caps.supported_loader_file_formats if stage_caps: if not dest_caps.supported_staging_file_formats: raise DestinationLoadingViaStagingNotSupported(destination) - possible_file_formats = [f for f in dest_caps.supported_staging_file_formats if f in stage_caps.supported_loader_file_formats] + possible_file_formats = [ + f + for f in dest_caps.supported_staging_file_formats + if f in stage_caps.supported_loader_file_formats + ] if not file_format: if not stage_caps: if not dest_caps.preferred_loader_file_format: @@ -1091,7 +1303,12 @@ def _resolve_loader_file_format( else: file_format = possible_file_formats[0] if len(possible_file_formats) > 0 else None if file_format not in possible_file_formats: - raise DestinationIncompatibleLoaderFileFormatException(destination, staging, file_format, set(possible_file_formats) - INTERNAL_LOADER_FILE_FORMATS) + raise DestinationIncompatibleLoaderFileFormatException( + destination, + staging, + file_format, + set(possible_file_formats) - INTERNAL_LOADER_FILE_FORMATS, + ) return file_format def _set_default_normalizers(self) -> None: @@ -1105,7 +1322,9 @@ def _set_dataset_name(self, new_dataset_name: str) -> None: fields = self.destination.spec().get_resolvable_fields() dataset_name_type = 
fields.get("dataset_name") # if dataset is required (default!) we create a default dataset name - destination_needs_dataset = dataset_name_type is not None and not is_optional_type(dataset_name_type) + destination_needs_dataset = dataset_name_type is not None and not is_optional_type( + dataset_name_type + ) # if destination is not specified - generate dataset if not self.destination or destination_needs_dataset: new_dataset_name = self.pipeline_name + self.DEFAULT_DATASET_SUFFIX @@ -1148,14 +1367,14 @@ def _get_load_info(self, load: Load) -> LoadInfo: def _get_state(self) -> TPipelineState: try: state = json_decode_state(self._pipeline_storage.load(Pipeline.STATE_FILE)) - return migrate_state(self.pipeline_name, state, state["_state_engine_version"], STATE_ENGINE_VERSION) + return migrate_state( + self.pipeline_name, state, state["_state_engine_version"], STATE_ENGINE_VERSION + ) except FileNotFoundError: return { "_state_version": 0, "_state_engine_version": STATE_ENGINE_VERSION, - "_local": { - "first_run": True - } + "_local": {"first_run": True}, } def _optional_sql_job_client(self, schema_name: str) -> Optional[SqlJobClientBase]: @@ -1186,18 +1405,29 @@ def _restore_state_from_destination(self) -> Optional[TPipelineState]: if isinstance(job_client, WithStateSync): state = load_state_from_destination(self.pipeline_name, job_client) if state is None: - logger.info(f"The state was not found in the destination {self.destination.name}:{dataset_name}") + logger.info( + "The state was not found in the destination" + f" {self.destination.name}:{dataset_name}" + ) else: - logger.info(f"The state was restored from the destination {self.destination.name}:{dataset_name}") + logger.info( + "The state was restored from the destination" + f" {self.destination.name}:{dataset_name}" + ) else: state = None - logger.info(f"Destination does not support metadata storage {self.destination.name}:{dataset_name}") + logger.info( + "Destination does not support metadata storage" + f" {self.destination.name}:{dataset_name}" + ) return state finally: # restore the use_single_dataset option self.config.use_single_dataset = use_single_dataset - def _get_schemas_from_destination(self, schema_names: Sequence[str], always_download: bool = False) -> Sequence[Schema]: + def _get_schemas_from_destination( + self, schema_names: Sequence[str], always_download: bool = False + ) -> Sequence[Schema]: # check which schemas are present in the pipeline and restore missing schemas restored_schemas: List[Schema] = [] for schema_name in schema_names: @@ -1206,17 +1436,26 @@ def _get_schemas_from_destination(self, schema_names: Sequence[str], always_down if not self._schema_storage.has_schema(schema.name) or always_download: with self._get_destination_clients(schema)[0] as job_client: if not isinstance(job_client, WithStateSync): - logger.info(f"Destination does not support metadata storage {self.destination.name}") + logger.info( + f"Destination does not support metadata storage {self.destination.name}" + ) return restored_schemas schema_info = job_client.get_stored_schema() if schema_info is None: - logger.info(f"The schema {schema.name} was not found in the destination {self.destination.name}:{self.dataset_name}") + logger.info( + f"The schema {schema.name} was not found in the destination" + f" {self.destination.name}:{self.dataset_name}" + ) # try to import schema with contextlib.suppress(FileNotFoundError): self._schema_storage.load_schema(schema.name) else: schema = Schema.from_dict(json.loads(schema_info.schema)) - 
logger.info(f"The schema {schema.name} version {schema.version} hash {schema.stored_version_hash} was restored from the destination {self.destination.name}:{self.dataset_name}") + logger.info( + f"The schema {schema.name} version {schema.version} hash" + f" {schema.stored_version_hash} was restored from the destination" + f" {self.destination.name}:{self.dataset_name}" + ) restored_schemas.append(schema) return restored_schemas @@ -1267,7 +1506,7 @@ def _state_to_props(self, state: TPipelineState) -> None: if prop in state["_local"] and not prop.startswith("_"): setattr(self, prop, state["_local"][prop]) # type: ignore if "destination" in state: - self._set_destinations(self.destination, self.staging if "staging" in state else None ) + self._set_destinations(self.destination, self.staging if "staging" in state else None) def _props_to_state(self, state: TPipelineState) -> None: """Write pipeline props to `state`""" diff --git a/dlt/pipeline/progress.py b/dlt/pipeline/progress.py index 90fc192bb1..89eda4cac5 100644 --- a/dlt/pipeline/progress.py +++ b/dlt/pipeline/progress.py @@ -1,12 +1,18 @@ """Measure the extract, normalize and load progress""" from typing import Union, Literal -from dlt.common.runtime.collector import TqdmCollector as tqdm, LogCollector as log, EnlightenCollector as enlighten, AliveCollector as alive_progress +from dlt.common.runtime.collector import ( + TqdmCollector as tqdm, + LogCollector as log, + EnlightenCollector as enlighten, + AliveCollector as alive_progress, +) from dlt.common.runtime.collector import Collector as _Collector, NULL_COLLECTOR as _NULL_COLLECTOR TSupportedCollectors = Literal["tqdm", "enlighten", "log", "alive_progress"] TCollectorArg = Union[_Collector, TSupportedCollectors] + def _from_name(collector: TCollectorArg) -> _Collector: """Create default collector by name""" if collector is None: diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index a9603b8f66..29f1e9515c 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -23,31 +23,11 @@ # state table columns STATE_TABLE_COLUMNS: TTableSchemaColumns = { - "version": { - "name": "version", - "data_type": "bigint", - "nullable": False - }, - "engine_version": { - "name": "engine_version", - "data_type": "bigint", - "nullable": False - }, - "pipeline_name": { - "name": "pipeline_name", - "data_type": "text", - "nullable": False - }, - "state": { - "name": "state", - "data_type": "text", - "nullable": False - }, - "created_at": { - "name": "created_at", - "data_type": "timestamp", - "nullable": False - } + "version": {"name": "version", "data_type": "bigint", "nullable": False}, + "engine_version": {"name": "engine_version", "data_type": "bigint", "nullable": False}, + "pipeline_name": {"name": "pipeline_name", "data_type": "text", "nullable": False}, + "state": {"name": "state", "data_type": "text", "nullable": False}, + "created_at": {"name": "created_at", "data_type": "timestamp", "nullable": False}, } @@ -72,7 +52,9 @@ def decompress_state(state_str: str) -> DictStrAny: return json.typed_loadb(state_bytes) # type: ignore[no-any-return] -def merge_state_if_changed(old_state: TPipelineState, new_state: TPipelineState, increase_version: bool = True) -> Optional[TPipelineState]: +def merge_state_if_changed( + old_state: TPipelineState, new_state: TPipelineState, increase_version: bool = True +) -> Optional[TPipelineState]: # we may want to compare hashes like we do with schemas if json.dumps(old_state, sort_keys=True) == json.dumps(new_state, 
sort_keys=True): return None @@ -89,10 +71,12 @@ def state_resource(state: TPipelineState) -> DltResource: "version": state["_state_version"], "engine_version": state["_state_engine_version"], "pipeline_name": state["pipeline_name"], - "state": state_str, - "created_at": pendulum.now() + "state": state_str, + "created_at": pendulum.now(), } - return dlt.resource([state_doc], name=STATE_TABLE_NAME, write_disposition="append", columns=STATE_TABLE_COLUMNS) + return dlt.resource( + [state_doc], name=STATE_TABLE_NAME, write_disposition="append", columns=STATE_TABLE_COLUMNS + ) def load_state_from_destination(pipeline_name: str, client: WithStateSync) -> TPipelineState: @@ -104,7 +88,9 @@ def load_state_from_destination(pipeline_name: str, client: WithStateSync) -> TP return migrate_state(pipeline_name, s, s["_state_engine_version"], STATE_ENGINE_VERSION) -def migrate_state(pipeline_name: str, state: DictStrAny, from_engine: int, to_engine: int) -> TPipelineState: +def migrate_state( + pipeline_name: str, state: DictStrAny, from_engine: int, to_engine: int +) -> TPipelineState: if from_engine == to_engine: return cast(TPipelineState, state) if from_engine == 1 and to_engine > 1: @@ -114,6 +100,8 @@ def migrate_state(pipeline_name: str, state: DictStrAny, from_engine: int, to_en # check state engine state["_state_engine_version"] = from_engine if from_engine != to_engine: - raise PipelineStateEngineNoUpgradePathException(pipeline_name, state["_state_engine_version"], from_engine, to_engine) + raise PipelineStateEngineNoUpgradePathException( + pipeline_name, state["_state_engine_version"], from_engine, to_engine + ) return cast(TPipelineState, state) diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 46ab524aa1..e725a2f726 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -3,14 +3,20 @@ import datetime # noqa: 251 import dataclasses from collections.abc import Sequence as C_Sequence -from typing import Any, List, NamedTuple, Optional, Protocol, Sequence +from typing import Any, List, NamedTuple, Optional, Protocol, Sequence import humanize from dlt.common import pendulum from dlt.common.runtime.logger import suppress_and_warn from dlt.common.configuration import is_secret_hint from dlt.common.configuration.utils import _RESOLVED_TRACES -from dlt.common.pipeline import ExtractDataInfo, ExtractInfo, LoadInfo, NormalizeInfo, SupportsPipeline +from dlt.common.pipeline import ( + ExtractDataInfo, + ExtractInfo, + LoadInfo, + NormalizeInfo, + SupportsPipeline, +) from dlt.common.typing import DictStrAny, StrAny from dlt.common.utils import uniq_id @@ -22,9 +28,11 @@ TRACE_ENGINE_VERSION = 1 TRACE_FILE_NAME = "trace.pickle" + # @dataclasses.dataclass(init=True) class SerializableResolvedValueTrace(NamedTuple): """Information on resolved secret and config values""" + key: str value: Any default_value: Any @@ -35,7 +43,7 @@ class SerializableResolvedValueTrace(NamedTuple): def asdict(self) -> StrAny: """A dictionary representation that is safe to load.""" - return {k:v for k,v in self._asdict().items() if k not in ("value", "default_value")} + return {k: v for k, v in self._asdict().items() if k not in ("value", "default_value")} def asstr(self, verbosity: int = 0) -> str: return f"{self.key}->{self.value} in {'.'.join(self.sections)} by {self.provider_name}" @@ -79,6 +87,7 @@ def __str__(self) -> str: class PipelineStepTrace(_PipelineStepTrace): """Trace of particular pipeline step, contains timing information, the step outcome info or exception in case of failing step with 
custom asdict()""" + def asdict(self) -> DictStrAny: """A dictionary representation of PipelineStepTrace that can be loaded with `dlt`""" d = dataclasses.asdict(self) @@ -91,6 +100,7 @@ def asdict(self) -> DictStrAny: @dataclasses.dataclass(init=True) class PipelineTrace: """Pipeline runtime trace containing data on "extract", "normalize" and "load" steps and resolved config and secret values.""" + transaction_id: str started_at: datetime.datetime steps: List[PipelineStepTrace] @@ -108,7 +118,10 @@ def asstr(self, verbosity: int = 0) -> str: elapsed_str = humanize.precisedelta(elapsed) else: elapsed_str = "---" - msg = f"Run started at {self.started_at} and {completed_str} in {elapsed_str} with {len(self.steps)} steps." + msg = ( + f"Run started at {self.started_at} and {completed_str} in {elapsed_str} with" + f" {len(self.steps)} steps." + ) if verbosity > 0 and len(self.resolved_config_values) > 0: msg += "\nFollowing config and secret values were resolved:\n" msg += "\n".join([s.asstr(verbosity) for s in self.resolved_config_values]) @@ -149,17 +162,23 @@ def __str__(self) -> str: class SupportsTracking(Protocol): - def on_start_trace(self, trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline) -> None: - ... + def on_start_trace( + self, trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline + ) -> None: ... - def on_start_trace_step(self, trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline) -> None: - ... + def on_start_trace_step( + self, trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline + ) -> None: ... - def on_end_trace_step(self, trace: PipelineTrace, step: PipelineStepTrace, pipeline: SupportsPipeline, step_info: Any) -> None: - ... + def on_end_trace_step( + self, + trace: PipelineTrace, + step: PipelineStepTrace, + pipeline: SupportsPipeline, + step_info: Any, + ) -> None: ... - def on_end_trace(self, trace: PipelineTrace, pipeline: SupportsPipeline) -> None: - ... + def on_end_trace(self, trace: PipelineTrace, pipeline: SupportsPipeline) -> None: ... 
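Any object (or module) exposing these four callbacks satisfies the protocol and can be plugged in as the tracking module mentioned below; a minimal sketch, with the class name and messages purely illustrative:
>>> class PrintTracking:
>>>     def on_start_trace(self, trace, step, pipeline):
>>>         print(f"starting {step} trace for pipeline {pipeline.pipeline_name}")
>>>     def on_start_trace_step(self, trace, step, pipeline):
>>>         print(f"step {step} started")
>>>     def on_end_trace_step(self, trace, step, pipeline, step_info):
>>>         print(f"step {step.step} finished, exception: {step.step_exception}")
>>>     def on_end_trace(self, trace, pipeline):
>>>         print(f"trace {trace.transaction_id} finished with {len(trace.steps)} steps")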
# plug in your own tracking module here @@ -174,14 +193,18 @@ def start_trace(step: TPipelineStep, pipeline: SupportsPipeline) -> PipelineTrac return trace -def start_trace_step(trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline) -> PipelineStepTrace: +def start_trace_step( + trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline +) -> PipelineStepTrace: trace_step = PipelineStepTrace(uniq_id(), step, pendulum.now()) with suppress_and_warn(): TRACKING_MODULE.on_start_trace_step(trace, step, pipeline) return trace_step -def end_trace_step(trace: PipelineTrace, step: PipelineStepTrace, pipeline: SupportsPipeline, step_info: Any) -> None: +def end_trace_step( + trace: PipelineTrace, step: PipelineStepTrace, pipeline: SupportsPipeline, step_info: Any +) -> None: # saves runtime trace of the pipeline if isinstance(step_info, PipelineStepFailed): step_exception = str(step_info) @@ -199,15 +222,18 @@ def end_trace_step(trace: PipelineTrace, step: PipelineStepTrace, pipeline: Supp step.step_exception = step_exception step.step_info = step_info - resolved_values = map(lambda v: SerializableResolvedValueTrace( + resolved_values = map( + lambda v: SerializableResolvedValueTrace( v.key, v.value, v.default_value, is_secret_hint(v.hint), v.sections, v.provider_name, - str(type(v.config).__qualname__) - ) , _RESOLVED_TRACES.values()) + str(type(v.config).__qualname__), + ), + _RESOLVED_TRACES.values(), + ) trace.resolved_config_values = list(resolved_values) trace.steps.append(step) @@ -259,17 +285,16 @@ def describe_extract_data(data: Any) -> List[ExtractDataInfo]: def add_item(item: Any) -> bool: if isinstance(item, (DltResource, DltSource)): # record names of sources/resources - data_info.append({ - "name": item.name, - "data_type": "resource" if isinstance(item, DltResource) else "source" - }) + data_info.append( + { + "name": item.name, + "data_type": "resource" if isinstance(item, DltResource) else "source", + } + ) return False else: # anything else - data_info.append({ - "name": "", - "data_type": type(item).__name__ - }) + data_info.append({"name": "", "data_type": type(item).__name__}) return True item: Any = data diff --git a/dlt/pipeline/track.py b/dlt/pipeline/track.py index 07e9a2d137..7670c95163 100644 --- a/dlt/pipeline/track.py +++ b/dlt/pipeline/track.py @@ -24,6 +24,7 @@ def _add_sentry_tags(span: Span, pipeline: SupportsPipeline) -> None: span.set_tag("destination", pipeline.destination.name) if pipeline.dataset_name: span.set_tag("dataset_name", pipeline.dataset_name) + except ImportError: # sentry is optional dependency and enabled only when RuntimeConfiguration.sentry_dsn is set pass @@ -67,7 +68,9 @@ def on_start_trace(trace: PipelineTrace, step: TPipelineStep, pipeline: Supports transaction.__enter__() -def on_start_trace_step(trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline) -> None: +def on_start_trace_step( + trace: PipelineTrace, step: TPipelineStep, pipeline: SupportsPipeline +) -> None: if pipeline.runtime_config.sentry_dsn: # print(f"START SENTRY SPAN {trace.transaction_id}:{trace_step.span_id} SCOPE: {Hub.current.scope}") span = Hub.current.scope.span.start_child(description=step, op=step).__enter__() @@ -75,7 +78,9 @@ def on_start_trace_step(trace: PipelineTrace, step: TPipelineStep, pipeline: Sup _add_sentry_tags(span, pipeline) -def on_end_trace_step(trace: PipelineTrace, step: PipelineStepTrace, pipeline: SupportsPipeline, step_info: Any) -> None: +def on_end_trace_step( + trace: PipelineTrace, step: 
PipelineStepTrace, pipeline: SupportsPipeline, step_info: Any +) -> None: if pipeline.runtime_config.sentry_dsn: # print(f"---END SENTRY SPAN {trace.transaction_id}:{step.span_id}: {step} SCOPE: {Hub.current.scope}") with contextlib.suppress(Exception): @@ -90,8 +95,10 @@ def on_end_trace_step(trace: PipelineTrace, step: PipelineStepTrace, pipeline: S "destination_name": pipeline.destination.name if pipeline.destination else None, "pipeline_name_hash": digest128(pipeline.pipeline_name), "dataset_name_hash": digest128(pipeline.dataset_name) if pipeline.dataset_name else None, - "default_schema_name_hash": digest128(pipeline.default_schema_name) if pipeline.default_schema_name else None, - "transaction_id": trace.transaction_id + "default_schema_name_hash": ( + digest128(pipeline.default_schema_name) if pipeline.default_schema_name else None + ), + "transaction_id": trace.transaction_id, } # disable automatic slack messaging until we can configure messages themselves if step.step == "extract" and step_info: diff --git a/dlt/reflection/names.py b/dlt/reflection/names.py index 1aee6df52b..dad7bdce92 100644 --- a/dlt/reflection/names.py +++ b/dlt/reflection/names.py @@ -18,5 +18,5 @@ ATTACH: inspect.signature(attach), RUN: inspect.signature(run), SOURCE: inspect.signature(source), - RESOURCE: inspect.signature(resource) -} \ No newline at end of file + RESOURCE: inspect.signature(resource), +} diff --git a/dlt/reflection/script_inspector.py b/dlt/reflection/script_inspector.py index 9899e2b157..d8d96804c8 100644 --- a/dlt/reflection/script_inspector.py +++ b/dlt/reflection/script_inspector.py @@ -22,6 +22,7 @@ def patch__init__(self: Any, *args: Any, **kwargs: Any) -> None: class DummyModule(ModuleType): """A dummy module from which you can import anything""" + def __getattr__(self, key: str) -> Any: if key[0].isupper(): # if imported name is capitalized, import type @@ -29,13 +30,20 @@ def __getattr__(self, key: str) -> Any: else: # otherwise import instance return SimpleNamespace() - __all__: List[Any] = [] # support wildcard imports + + __all__: List[Any] = [] # support wildcard imports def _import_module(name: str, missing_modules: Tuple[str, ...] = ()) -> ModuleType: """Module importer that ignores missing modules by importing a dummy module""" - def _try_import(name: str, _globals: Mapping[str, Any] = None, _locals: Mapping[str, Any] = None, fromlist: Sequence[str] = (), level:int = 0) -> ModuleType: + def _try_import( + name: str, + _globals: Mapping[str, Any] = None, + _locals: Mapping[str, Any] = None, + fromlist: Sequence[str] = (), + level: int = 0, + ) -> ModuleType: """This function works as follows: on ImportError it raises. This import error is then next caught in the main function body and the name is added to exceptions. 
Next time if the name is on exception list or name is a package on exception list we return DummyModule and do not reraise This excepts only the modules that bubble up ImportError up until our code so any handled import errors are not excepted @@ -63,7 +71,7 @@ def _try_import(name: str, _globals: Mapping[str, Any] = None, _locals: Mapping[ # print(f"ADD {ie.name} {ie.path} vs {name} vs {str(ie)}") if ie.name in missing_modules: raise - missing_modules += (ie.name, ) + missing_modules += (ie.name,) except MissingDependencyException as me: if isinstance(me.__context__, ImportError): if me.__context__.name is None: @@ -72,14 +80,16 @@ def _try_import(name: str, _globals: Mapping[str, Any] = None, _locals: Mapping[ # print(f"{me.__context__.name} IN :/") raise # print(f"ADD {me.__context__.name}") - missing_modules += (me.__context__.name, ) + missing_modules += (me.__context__.name,) else: raise finally: builtins.__import__ = real_import -def load_script_module(module_path:str, script_relative_path: str, ignore_missing_imports: bool = False) -> ModuleType: +def load_script_module( + module_path: str, script_relative_path: str, ignore_missing_imports: bool = False +) -> ModuleType: """Loads a module in `script_relative_path` by splitting it into a script module (file part) and package (folders). `module_path` is added to sys.path Optionally, missing imports will be ignored by importing a dummy module instead. """ @@ -111,12 +121,24 @@ def load_script_module(module_path:str, script_relative_path: str, ignore_missin sys.path.remove(sys_path) -def inspect_pipeline_script(module_path:str, script_relative_path: str, ignore_missing_imports: bool = False) -> ModuleType: +def inspect_pipeline_script( + module_path: str, script_relative_path: str, ignore_missing_imports: bool = False +) -> ModuleType: # patch entry points to pipeline, sources and resources to prevent pipeline from running - with patch.object(Pipeline, '__init__', patch__init__), patch.object(DltSource, '__init__', patch__init__), patch.object(ManagedPipeIterator, '__init__', patch__init__): - return load_script_module(module_path, script_relative_path, ignore_missing_imports=ignore_missing_imports) + with patch.object(Pipeline, "__init__", patch__init__), patch.object( + DltSource, "__init__", patch__init__ + ), patch.object(ManagedPipeIterator, "__init__", patch__init__): + return load_script_module( + module_path, script_relative_path, ignore_missing_imports=ignore_missing_imports + ) class PipelineIsRunning(DltException): def __init__(self, obj: object, args: Tuple[str, ...], kwargs: DictStrAny) -> None: - super().__init__(f"The pipeline script instantiates the pipeline on import. Did you forget to use if __name__ == 'main':? in {obj.__class__.__name__}", obj, args, kwargs) + super().__init__( + "The pipeline script instantiates the pipeline on import. Did you forget to use if" + f" __name__ == 'main':? 
in {obj.__class__.__name__}", + obj, + args, + kwargs, + ) diff --git a/dlt/reflection/script_visitor.py b/dlt/reflection/script_visitor.py index 7d4e0ea2cd..52b19fe031 100644 --- a/dlt/reflection/script_visitor.py +++ b/dlt/reflection/script_visitor.py @@ -10,7 +10,6 @@ class PipelineScriptVisitor(NodeVisitor): - def __init__(self, source: str): self.source = source self.source_lines: List[str] = ast._splitlines_no_ff(source) # type: ignore @@ -73,7 +72,9 @@ def visit_FunctionDef(self, node: ast.FunctionDef) -> Any: elif isinstance(deco, ast.Call): alias_name = astunparse.unparse(deco.func).strip() else: - raise ValueError(self.source_segment(deco), type(deco), "Unknown decorator form") + raise ValueError( + self.source_segment(deco), type(deco), "Unknown decorator form" + ) fn = self.func_aliases.get(alias_name) if fn == n.SOURCE: self.known_sources[str(node.name)] = node @@ -96,7 +97,9 @@ def visit_Call(self, node: ast.Call) -> Any: sig = n.SIGNATURES[fn] try: # bind the signature where the argument values are the corresponding ast nodes - bound_args = sig.bind(*node.args, **{str(kwd.arg):kwd.value for kwd in node.keywords}) + bound_args = sig.bind( + *node.args, **{str(kwd.arg): kwd.value for kwd in node.keywords} + ) bound_args.apply_defaults() # print(f"ALIAS: {alias_name} of {self.func_aliases.get(alias_name)} with {bound_args}") fun_calls = self.known_calls.setdefault(fn, []) diff --git a/dlt/sources/config.py b/dlt/sources/config.py index d58c210ab6..796a338c02 100644 --- a/dlt/sources/config.py +++ b/dlt/sources/config.py @@ -1,2 +1,2 @@ from dlt.common.configuration.specs import configspec -from dlt.common.configuration.inject import with_config \ No newline at end of file +from dlt.common.configuration.inject import with_config diff --git a/dlt/sources/credentials.py b/dlt/sources/credentials.py index 5815324d56..a7663a857b 100644 --- a/dlt/sources/credentials.py +++ b/dlt/sources/credentials.py @@ -1,4 +1,8 @@ -from dlt.common.configuration.specs import GcpServiceAccountCredentials, GcpOAuthCredentials, GcpCredentials +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentials, + GcpOAuthCredentials, + GcpCredentials, +) from dlt.common.configuration.specs import ConnectionStringCredentials from dlt.common.configuration.specs import OAuth2Credentials from dlt.common.configuration.specs import CredentialsConfiguration, configspec @@ -16,4 +20,3 @@ "FileSystemCredentials", "FilesystemConfiguration", ] - diff --git a/dlt/sources/filesystem.py b/dlt/sources/filesystem.py index bb18a15f20..23fb6a9cf3 100644 --- a/dlt/sources/filesystem.py +++ b/dlt/sources/filesystem.py @@ -1,3 +1,8 @@ -from dlt.common.storages.fsspec_filesystem import FileItem, FileItemDict, fsspec_filesystem, glob_files +from dlt.common.storages.fsspec_filesystem import ( + FileItem, + FileItemDict, + fsspec_filesystem, + glob_files, +) __all__ = ["FileItem", "FileItemDict", "fsspec_filesystem", "glob_files"] diff --git a/dlt/sources/helpers/requests/__init__.py b/dlt/sources/helpers/requests/__init__.py index 39d286e29d..3e29a2cf52 100644 --- a/dlt/sources/helpers/requests/__init__.py +++ b/dlt/sources/helpers/requests/__init__.py @@ -1,6 +1,7 @@ from tenacity import RetryError from requests import ( - Request, Response, + Request, + Response, ConnectionError, ConnectTimeout, FileModeWarning, @@ -19,7 +20,14 @@ client = Client() get, post, put, patch, delete, options, head, request = ( - client.get, client.post, client.put, client.patch, client.delete, client.options, client.head, client.request 
+ client.get, + client.post, + client.put, + client.patch, + client.delete, + client.options, + client.head, + client.request, ) @@ -40,8 +48,18 @@ def init(config: RunConfiguration) -> None: "request", "init", "Session", - "Request", "Response", "ConnectionError", "ConnectTimeout", "FileModeWarning", "HTTPError", "ReadTimeout", - "RequestException", "Timeout", "TooManyRedirects", "URLRequired", "ChunkedEncodingError", "RetryError" - "Client", - "RetryError" + "Request", + "Response", + "ConnectionError", + "ConnectTimeout", + "FileModeWarning", + "HTTPError", + "ReadTimeout", + "RequestException", + "Timeout", + "TooManyRedirects", + "URLRequired", + "ChunkedEncodingError", + "RetryErrorClient", + "RetryError", ] diff --git a/dlt/sources/helpers/requests/retry.py b/dlt/sources/helpers/requests/retry.py index 8f824e0c4f..c9a813598f 100644 --- a/dlt/sources/helpers/requests/retry.py +++ b/dlt/sources/helpers/requests/retry.py @@ -1,13 +1,32 @@ from email.utils import parsedate_tz, mktime_tz import re import time -from typing import Optional, cast, Callable, Type, Union, Sequence, Tuple, List, TYPE_CHECKING, Any, Dict +from typing import ( + Optional, + cast, + Callable, + Type, + Union, + Sequence, + Tuple, + List, + TYPE_CHECKING, + Any, + Dict, +) from threading import local from requests import Response, HTTPError, Session as BaseSession from requests.exceptions import ConnectionError, Timeout, ChunkedEncodingError from requests.adapters import HTTPAdapter -from tenacity import Retrying, retry_if_exception_type, stop_after_attempt, RetryCallState, retry_any, wait_exponential +from tenacity import ( + Retrying, + retry_if_exception_type, + stop_after_attempt, + RetryCallState, + retry_any, + wait_exponential, +) from tenacity.retry import retry_base from dlt.sources.helpers.requests.session import Session, DEFAULT_TIMEOUT @@ -96,7 +115,7 @@ def _make_retry( backoff_factor: float, respect_retry_after_header: bool, max_delay: TimedeltaSeconds, -)-> Retrying: +) -> Retrying: retry_conds = [retry_if_status(status_codes), retry_if_exception_type(tuple(exceptions))] if condition is not None: if callable(condition): @@ -148,12 +167,15 @@ class Client: respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay session_attrs: Extra attributes that will be set on the session instance, e.g. 
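A usage sketch (the URL is illustrative): requests issued through a configured client are retried and time-limited according to the settings above
>>> from dlt.sources.helpers.requests import Client
>>> http_client = Client(request_timeout=30, raise_for_status=False)
>>> response = http_client.get("https://api.example.com/items")
>>> response.status_code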
`{headers: {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes) """ + _session_attrs: Dict[str, Any] @with_config(spec=RunConfiguration) def __init__( self, - request_timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT, + request_timeout: Optional[ + Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]] + ] = DEFAULT_TIMEOUT, max_connections: int = 50, raise_for_status: bool = True, status_codes: Sequence[int] = DEFAULT_RETRY_STATUS, @@ -175,7 +197,7 @@ def __init__( condition=retry_condition, backoff_factor=request_backoff_factor, respect_retry_after_header=respect_retry_after_header, - max_delay=request_max_retry_delay + max_delay=request_max_retry_delay, ) self._session_attrs = session_attrs or {} @@ -198,29 +220,31 @@ def __init__( self.options = lambda *a, **kw: self.session.options(*a, **kw) self.request = lambda *a, **kw: self.session.request(*a, **kw) - self._config_version: int = 0 # Incrementing marker to ensure per-thread sessions are recreated on config changes + self._config_version: int = ( + 0 # Incrementing marker to ensure per-thread sessions are recreated on config changes + ) def update_from_config(self, config: RunConfiguration) -> None: """Update session/retry settings from RunConfiguration""" - self._session_kwargs['timeout'] = config.request_timeout - self._retry_kwargs['backoff_factor'] = config.request_backoff_factor - self._retry_kwargs['max_delay'] = config.request_max_retry_delay - self._retry_kwargs['max_attempts'] = config.request_max_attempts + self._session_kwargs["timeout"] = config.request_timeout + self._retry_kwargs["backoff_factor"] = config.request_backoff_factor + self._retry_kwargs["max_delay"] = config.request_max_retry_delay + self._retry_kwargs["max_attempts"] = config.request_max_attempts self._config_version += 1 def _make_session(self) -> Session: session = Session(**self._session_kwargs) # type: ignore[arg-type] for key, value in self._session_attrs.items(): setattr(session, key, value) - session.mount('http://', self._adapter) - session.mount('https://', self._adapter) + session.mount("http://", self._adapter) + session.mount("https://", self._adapter) retry = _make_retry(**self._retry_kwargs) session.request = retry.wraps(session.request) # type: ignore[method-assign] return session @property def session(self) -> Session: - session: Optional[Session] = getattr(self._local, 'session', None) + session: Optional[Session] = getattr(self._local, "session", None) version = self._config_version if session is not None: version = self._local.config_version diff --git a/dlt/sources/helpers/requests/session.py b/dlt/sources/helpers/requests/session.py index b12d8da73f..0a4d277848 100644 --- a/dlt/sources/helpers/requests/session.py +++ b/dlt/sources/helpers/requests/session.py @@ -15,7 +15,11 @@ def _timeout_to_seconds(timeout: TRequestTimeout) -> Optional[Union[Tuple[float, float], float]]: - return (to_seconds(timeout[0]), to_seconds(timeout[1])) if isinstance(timeout, tuple) else to_seconds(timeout) + return ( + (to_seconds(timeout[0]), to_seconds(timeout[1])) + if isinstance(timeout, tuple) + else to_seconds(timeout) + ) class Session(BaseSession): @@ -26,23 +30,28 @@ class Session(BaseSession): May be a single value or a tuple for separate (connect, read) timeout. 
raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`) """ + def __init__( self, - timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT, + timeout: Optional[ + Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]] + ] = DEFAULT_TIMEOUT, raise_for_status: bool = True, ) -> None: super().__init__() self.timeout = _timeout_to_seconds(timeout) self.raise_for_status = raise_for_status - self.headers.update({ - "User-Agent": f"dlt/{__version__}", - }) + self.headers.update( + { + "User-Agent": f"dlt/{__version__}", + } + ) if TYPE_CHECKING: request = BaseSession.request def request(self, *args, **kwargs): # type: ignore[no-untyped-def,no-redef] - kwargs.setdefault('timeout', self.timeout) + kwargs.setdefault("timeout", self.timeout) resp = super().request(*args, **kwargs) if self.raise_for_status: resp.raise_for_status() diff --git a/dlt/sources/helpers/transform.py b/dlt/sources/helpers/transform.py index 0c2f7c5e39..1975c20586 100644 --- a/dlt/sources/helpers/transform.py +++ b/dlt/sources/helpers/transform.py @@ -5,18 +5,22 @@ def take_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]: """A filter that takes only first `max_items` from a resource""" count: int = 0 + def _filter(_: TDataItem) -> bool: nonlocal count count += 1 return count <= max_items + return _filter def skip_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]: """A filter that skips first `max_items` from a resource""" count: int = 0 + def _filter(_: TDataItem) -> bool: nonlocal count count += 1 return count > max_items + return _filter diff --git a/docs/examples/archive/_helpers.py b/docs/examples/archive/_helpers.py index 95913d1be1..0f490ff85f 100644 --- a/docs/examples/archive/_helpers.py +++ b/docs/examples/archive/_helpers.py @@ -10,6 +10,12 @@ } # we do not want to have this key verbatim in repo so we decode it here -_bigquery_credentials["private_key"] = bytes([_a ^ _b for _a, _b in zip(base64.b64decode(_bigquery_credentials["private_key"]), b"quickstart-sv"*150)]).decode("utf-8") +_bigquery_credentials["private_key"] = bytes( + [ + _a ^ _b + for _a, _b in zip( + base64.b64decode(_bigquery_credentials["private_key"]), b"quickstart-sv" * 150 + ) + ] +).decode("utf-8") pub_bigquery_credentials = _bigquery_credentials - diff --git a/docs/examples/archive/credentials/explicit.py b/docs/examples/archive/credentials/explicit.py index 6233140459..b1bc25fce6 100644 --- a/docs/examples/archive/credentials/explicit.py +++ b/docs/examples/archive/credentials/explicit.py @@ -4,7 +4,9 @@ @dlt.resource -def simple_data(api_url: str = dlt.config.value, api_secret: dlt.TSecretValue = dlt.secrets.value) -> Iterator[str]: +def simple_data( + api_url: str = dlt.config.value, api_secret: dlt.TSecretValue = dlt.secrets.value +) -> Iterator[str]: # just yield api_url and api_secret to show what was configured in the example yield api_url yield api_secret @@ -29,13 +31,17 @@ def simple_data(api_url: str = dlt.config.value, api_secret: dlt.TSecretValue = print(list(data)) # you are free to pass credentials from custom location to destination -pipeline = dlt.pipeline(destination="postgres", credentials=dlt.secrets["custom.destination.credentials"]) +pipeline = dlt.pipeline( + destination="postgres", credentials=dlt.secrets["custom.destination.credentials"] +) # see nice credentials object print(pipeline.credentials) # you can also pass credentials partially, only the password comes from the secrets or environment 
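For the call below, the missing password would then be resolved from configuration, for example from `.dlt/secrets.toml` or an environment variable (a sketch; the key layout is the usual dlt convention, values are illustrative):

    # .dlt/secrets.toml
    [destination.postgres.credentials]
    password = "<postgres password>"

    # or, equivalently, via environment:
    # DESTINATION__POSTGRES__CREDENTIALS__PASSWORD=<postgres password>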
-pipeline = dlt.pipeline(destination="postgres", credentials="postgres://loader@localhost:5432/dlt_data") +pipeline = dlt.pipeline( + destination="postgres", credentials="postgres://loader@localhost:5432/dlt_data" +) # now lets compare it with default location for config and credentials data = simple_data() -print(list(data)) \ No newline at end of file +print(list(data)) diff --git a/docs/examples/archive/dbt_run_jaffle.py b/docs/examples/archive/dbt_run_jaffle.py index ad059dcd6d..098b35fff8 100644 --- a/docs/examples/archive/dbt_run_jaffle.py +++ b/docs/examples/archive/dbt_run_jaffle.py @@ -2,7 +2,9 @@ pipeline = dlt.pipeline(destination="duckdb", dataset_name="jaffle_jaffle") -print("create or restore virtual environment in which dbt is installed, use the newest version of dbt") +print( + "create or restore virtual environment in which dbt is installed, use the newest version of dbt" +) venv = dlt.dbt.get_venv(pipeline) print("get runner, optionally pass the venv") @@ -11,13 +13,18 @@ print("run the package (clone/pull repo, deps, seed, source tests, run)") models = dbt.run_all() for m in models: - print(f"Model {m.model_name} materialized in {m.time} with status {m.status} and message {m.message}") + print( + f"Model {m.model_name} materialized in {m.time} with status {m.status} and message" + f" {m.message}" + ) print("") print("test the model") models = dbt.test() for m in models: - print(f"Test {m.model_name} executed in {m.time} with status {m.status} and message {m.message}") + print( + f"Test {m.model_name} executed in {m.time} with status {m.status} and message {m.message}" + ) print("") print("get and display data frame with customers") diff --git a/docs/examples/archive/discord_iterator.py b/docs/examples/archive/discord_iterator.py index a3c59ed2c5..44cbe3b5b1 100644 --- a/docs/examples/archive/discord_iterator.py +++ b/docs/examples/archive/discord_iterator.py @@ -1,4 +1,3 @@ - # from dlt.common import json # from dlt.common.schema import Schema # from dlt.common.typing import DictStrAny diff --git a/docs/examples/archive/google_sheets.py b/docs/examples/archive/google_sheets.py index 93c5658233..26c3d30b54 100644 --- a/docs/examples/archive/google_sheets.py +++ b/docs/examples/archive/google_sheets.py @@ -6,5 +6,7 @@ # see example.secrets.toml to where to put credentials # "2022-05", "model_metadata" -info = google_spreadsheet("11G95oVZjieRhyGqtQMQqlqpxyvWkRXowKE8CtdLtFaU", ["named range", "Second_Copy!1:2"]) +info = google_spreadsheet( + "11G95oVZjieRhyGqtQMQqlqpxyvWkRXowKE8CtdLtFaU", ["named range", "Second_Copy!1:2"] +) print(list(info)) diff --git a/docs/examples/archive/quickstart.py b/docs/examples/archive/quickstart.py index e55e9f6049..6e49f1af7a 100644 --- a/docs/examples/archive/quickstart.py +++ b/docs/examples/archive/quickstart.py @@ -9,9 +9,9 @@ """ # 1. 
configuration: name your dataset, table, pass credentials -dataset_name = 'dlt_quickstart' -pipeline_name = 'dlt_quickstart' -table_name = 'my_json_doc' +dataset_name = "dlt_quickstart" +pipeline_name = "dlt_quickstart" +table_name = "my_json_doc" gcp_credentials_json = { "type": "service_account", @@ -24,7 +24,14 @@ destination_name = "duckdb" if destination_name == "bigquery": # we do not want to have this key verbatim in repo so we decode it here - gcp_credentials_json["private_key"] = bytes([_a ^ _b for _a, _b in zip(base64.b64decode(gcp_credentials_json["private_key"]), b"quickstart-sv"*150)]).decode("utf-8") + gcp_credentials_json["private_key"] = bytes( + [ + _a ^ _b + for _a, _b in zip( + base64.b64decode(gcp_credentials_json["private_key"]), b"quickstart-sv" * 150 + ) + ] + ).decode("utf-8") credentials: Any = gcp_credentials_json elif destination_name == "redshift": credentials = db_dsn @@ -41,20 +48,26 @@ dataset_name=dataset_name, credentials=credentials, export_schema_path=export_schema_path, - full_refresh=True + full_refresh=True, ) # 3. Pass the data to the pipeline and give it a table name. Optionally normalize and handle schema. -rows = [{"name": "Ana", "age": 30, "id": 456, "children": [{"name": "Bill", "id": 625}, - {"name": "Elli", "id": 591} - ]}, - - {"name": "Bob", "age": 30, "id": 455, "children": [{"name": "Bill", "id": 625}, - {"name": "Dave", "id": 621} - ]} - ] +rows = [ + { + "name": "Ana", + "age": 30, + "id": 456, + "children": [{"name": "Bill", "id": 625}, {"name": "Elli", "id": 591}], + }, + { + "name": "Bob", + "age": 30, + "id": 455, + "children": [{"name": "Bill", "id": 625}, {"name": "Dave", "id": 621}], + }, +] load_info = pipeline.run(rows, table_name=table_name, write_disposition="replace") diff --git a/docs/examples/archive/rasa_example.py b/docs/examples/archive/rasa_example.py index d438ce5e8b..e83e6c61f7 100644 --- a/docs/examples/archive/rasa_example.py +++ b/docs/examples/archive/rasa_example.py @@ -24,9 +24,11 @@ destination=postgres, # export_schema_path=... 
# uncomment to see the final schema in the folder you want ).run( - rasa(event_files, store_last_timestamp=True), # also store last timestamp so we have no duplicate events - credentials=credentials # if you skip this parameter, the credentials will be injected by the config providers - ) + rasa( + event_files, store_last_timestamp=True + ), # also store last timestamp so we have no duplicate events + credentials=credentials, # if you skip this parameter, the credentials will be injected by the config providers +) print(info) diff --git a/docs/examples/archive/read_table.py b/docs/examples/archive/read_table.py index 291c27bde4..6cccf0efdb 100644 --- a/docs/examples/archive/read_table.py +++ b/docs/examples/archive/read_table.py @@ -9,7 +9,9 @@ source_dsn = "redshift+redshift_connector://loader@chat-analytics.czwteevq7bpe.eu-central-1.redshift.amazonaws.com:5439/chat_analytics_rasa" # get data from table, we preserve method signature from pandas -items = query_table("blocks__transactions", source_dsn, table_schema_name="mainnet_2_ethereum", coerce_float=False) +items = query_table( + "blocks__transactions", source_dsn, table_schema_name="mainnet_2_ethereum", coerce_float=False +) # the data is also an iterator for i in items: @@ -25,5 +27,7 @@ # you can find a docker compose file that spins up required instance in tests/load/postgres # note: run the script without required env variables to see info on possible secret configurations that were tried -info = dlt.pipeline().run(items, destination=postgres, dataset_name="ethereum", table_name="transactions") +info = dlt.pipeline().run( + items, destination=postgres, dataset_name="ethereum", table_name="transactions" +) print(info) diff --git a/docs/examples/archive/restore_pipeline.py b/docs/examples/archive/restore_pipeline.py index f3c013e85b..fc1f92a4c0 100644 --- a/docs/examples/archive/restore_pipeline.py +++ b/docs/examples/archive/restore_pipeline.py @@ -18,4 +18,4 @@ # print(pipeline.list_extracted_loads()) # # just finalize -# pipeline.flush() \ No newline at end of file +# pipeline.flush() diff --git a/docs/examples/archive/singer_tap_example.py b/docs/examples/archive/singer_tap_example.py index d03182339c..a9b105fe93 100644 --- a/docs/examples/archive/singer_tap_example.py +++ b/docs/examples/archive/singer_tap_example.py @@ -11,7 +11,10 @@ # here we use context manager to automatically delete venv after example was run # the dependency is meltano version of csv tap -print("Spawning virtual environment to run singer and installing csv tap from git+https://github.com/MeltanoLabs/tap-csv.git") +print( + "Spawning virtual environment to run singer and installing csv tap from" + " git+https://github.com/MeltanoLabs/tap-csv.git" +) # WARNING: on MACOS you need to have working gcc to use tap-csv, otherwise dependency will not be installed with Venv.create(mkdtemp(), ["git+https://github.com/MeltanoLabs/tap-csv.git"]) as venv: # prep singer config for tap-csv @@ -20,13 +23,13 @@ { "entity": "annotations_202205", "path": os.path.abspath("examples/data/singer_taps/model_annotations.csv"), - "keys": [ - "message id" - ] + "keys": ["message id"], } ] } print("running tap-csv") tap_source = tap(venv, "tap-csv", csv_tap_config, "examples/data/singer_taps/csv_catalog.json") - info = dlt.pipeline("meltano_csv", destination="postgres").run(tap_source, credentials="postgres://loader@localhost:5432/dlt_data") + info = dlt.pipeline("meltano_csv", destination="postgres").run( + tap_source, credentials="postgres://loader@localhost:5432/dlt_data" + ) 
print(info) diff --git a/docs/examples/archive/singer_tap_jsonl_example.py b/docs/examples/archive/singer_tap_jsonl_example.py index fff64bdb1d..c926a9f153 100644 --- a/docs/examples/archive/singer_tap_jsonl_example.py +++ b/docs/examples/archive/singer_tap_jsonl_example.py @@ -7,11 +7,11 @@ # load hubspot schema stub - it converts all field names with `timestamp` into timestamp type -schema = SchemaStorage.load_schema_file("docs/examples/schemas/", "hubspot", ("yaml", )) +schema = SchemaStorage.load_schema_file("docs/examples/schemas/", "hubspot", ("yaml",)) p = dlt.pipeline(destination="postgres", full_refresh=True) # now load a pipeline created from jsonl resource that feeds messages into singer tap transformer pipe = jsonl_file("docs/examples/data/singer_taps/tap_hubspot.jsonl") | singer_raw_stream() # provide hubspot schema info = p.run(pipe, schema=schema, credentials="postgres://loader@localhost:5432/dlt_data") -print(info) \ No newline at end of file +print(info) diff --git a/docs/examples/archive/sources/google_sheets.py b/docs/examples/archive/sources/google_sheets.py index 8a3d6b1d1c..69855154ae 100644 --- a/docs/examples/archive/sources/google_sheets.py +++ b/docs/examples/archive/sources/google_sheets.py @@ -16,38 +16,52 @@ # TODO: consider using https://github.com/burnash/gspread for spreadsheet discovery -def _initialize_sheets(credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials]) -> Any: +def _initialize_sheets( + credentials: Union[GcpOAuthCredentials, GcpServiceAccountCredentials] +) -> Any: # Build the service object. - service = build('sheets', 'v4', credentials=credentials.to_native_credentials()) + service = build("sheets", "v4", credentials=credentials.to_native_credentials()) return service @dlt.source -def google_spreadsheet(spreadsheet_id: str, sheet_names: Sequence[str], credentials: Union[GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny] = dlt.secrets.value) -> Any: - +def google_spreadsheet( + spreadsheet_id: str, + sheet_names: Sequence[str], + credentials: Union[ + GcpServiceAccountCredentials, GcpOAuthCredentials, str, StrAny + ] = dlt.secrets.value, +) -> Any: sheets = _initialize_sheets(cast(GcpServiceAccountCredentials, credentials)) # import pprint # meta = sheets.spreadsheets().get(spreadsheetId=spreadsheet_id, ranges=sheet_names, includeGridData=True).execute() # pprint.pprint(meta) def get_sheet(sheet_name: str) -> Iterator[DictStrAny]: - # get list of list of typed values - result = sheets.spreadsheets().values().get( - spreadsheetId=spreadsheet_id, - range=sheet_name, - # unformatted returns typed values - valueRenderOption="UNFORMATTED_VALUE", - # will return formatted dates - dateTimeRenderOption="FORMATTED_STRING" - ).execute() + result = ( + sheets.spreadsheets() + .values() + .get( + spreadsheetId=spreadsheet_id, + range=sheet_name, + # unformatted returns typed values + valueRenderOption="UNFORMATTED_VALUE", + # will return formatted dates + dateTimeRenderOption="FORMATTED_STRING", + ) + .execute() + ) # pprint.pprint(result) - values = result.get('values') + values = result.get("values") # yield dicts assuming row 0 contains headers and following rows values and all rows have identical length for v in values[1:]: yield {h: v for h, v in zip(values[0], v)} # create resources from supplied sheet names - return [dlt.resource(get_sheet(name), name=name, write_disposition="replace") for name in sheet_names] + return [ + dlt.resource(get_sheet(name), name=name, write_disposition="replace") + for name in sheet_names + 
] diff --git a/docs/examples/archive/sources/jsonl.py b/docs/examples/archive/sources/jsonl.py index 282966d00a..5989d2054f 100644 --- a/docs/examples/archive/sources/jsonl.py +++ b/docs/examples/archive/sources/jsonl.py @@ -7,8 +7,9 @@ from dlt.common.typing import StrAny, StrOrBytesPath -def chunk_jsonl(path: StrOrBytesPath, chunk_size: int = 20) -> Union[Iterator[StrAny], Iterator[List[StrAny]]]: - +def chunk_jsonl( + path: StrOrBytesPath, chunk_size: int = 20 +) -> Union[Iterator[StrAny], Iterator[List[StrAny]]]: with open(path, "rb") as f: def _iter() -> Iterator[StrAny]: @@ -24,9 +25,13 @@ def _iter() -> Iterator[StrAny]: else: break + jsonl_file = dlt.resource(chunk_jsonl, name="jsonl", spec=BaseConfiguration) + @dlt.resource(name="jsonl") -def jsonl_files(paths: Sequence[StrOrBytesPath], chunk_size: int = 20) -> Union[Iterator[StrAny], Iterator[List[StrAny]]]: +def jsonl_files( + paths: Sequence[StrOrBytesPath], chunk_size: int = 20 +) -> Union[Iterator[StrAny], Iterator[List[StrAny]]]: for path in paths: yield from chunk_jsonl(path, chunk_size) diff --git a/docs/examples/archive/sources/rasa/__init__.py b/docs/examples/archive/sources/rasa/__init__.py index acd214368a..3a274af671 100644 --- a/docs/examples/archive/sources/rasa/__init__.py +++ b/docs/examples/archive/sources/rasa/__init__.py @@ -1 +1 @@ -from .rasa import rasa \ No newline at end of file +from .rasa import rasa diff --git a/docs/examples/archive/sources/rasa/rasa.py b/docs/examples/archive/sources/rasa/rasa.py index b498f9c3de..60643fe17e 100644 --- a/docs/examples/archive/sources/rasa/rasa.py +++ b/docs/examples/archive/sources/rasa/rasa.py @@ -13,7 +13,7 @@ def rasa( source_env: str = None, initial_timestamp: float = None, end_timestamp: float = None, - store_last_timestamp: bool = True + store_last_timestamp: bool = True, ) -> Any: """Transforms the base resource provided in `data_from` into a rasa tracker store raw dataset where each event type get it's own table. The resource is a stream resource and it generates tables dynamically from data. 
The source uses `rasa.schema.yaml` file to initialize the schema @@ -34,7 +34,9 @@ def rasa( def events(source_events: TDataItems) -> Iterator[TDataItem]: # recover start_timestamp from state if given if store_last_timestamp: - start_timestamp = max(initial_timestamp or 0, dlt.current.source_state().get("start_timestamp", 0)) + start_timestamp = max( + initial_timestamp or 0, dlt.current.source_state().get("start_timestamp", 0) + ) # we expect tracker store events here last_timestamp: int = None @@ -51,7 +53,7 @@ def _proc_event(source_event: TDataItem) -> Iterator[TDataItem]: event = { "sender_id": source_event["sender_id"], "timestamp": last_timestamp, - "event": event_type + "event": event_type, } if source_env: event["source"] = source_env diff --git a/docs/examples/archive/sources/singer_tap.py b/docs/examples/archive/sources/singer_tap.py index 41db2c09f5..3c733c33f1 100644 --- a/docs/examples/archive/sources/singer_tap.py +++ b/docs/examples/archive/sources/singer_tap.py @@ -12,6 +12,7 @@ FilePathOrDict = Union[StrAny, StrOrBytesPath] + class SingerMessage(TypedDict): type: str # noqa: A003 @@ -24,6 +25,7 @@ class SingerRecord(SingerMessage): class SingerState(SingerMessage): value: DictStrAny + # try: # from singer import parse_message_from_obj, Message, RecordMessage, StateMessage # except ImportError: @@ -33,7 +35,9 @@ class SingerState(SingerMessage): # pip install ../singer/singer-python # https://github.com/datamill-co/singer-runner/tree/master/singer_runner # https://techgaun.github.io/active-forks/index.html#singer-io/singer-python -def get_source_from_stream(singer_messages: Iterator[SingerMessage], state: DictStrAny = None) -> Iterator[TDataItem]: +def get_source_from_stream( + singer_messages: Iterator[SingerMessage], state: DictStrAny = None +) -> Iterator[TDataItem]: last_state = {} for msg in singer_messages: if msg["type"] == "RECORD": @@ -57,7 +61,13 @@ def singer_raw_stream(singer_messages: TDataItems, use_state: bool = True) -> It @dlt.source(spec=BaseConfiguration) # use BaseConfiguration spec to prevent injections -def tap(venv: Venv, tap_name: str, config_file: FilePathOrDict, catalog_file: FilePathOrDict, use_state: bool = True) -> Any: +def tap( + venv: Venv, + tap_name: str, + config_file: FilePathOrDict, + catalog_file: FilePathOrDict, + use_state: bool = True, +) -> Any: # TODO: generate append/replace dispositions and some table/column hints from catalog files def as_config_file(config: FilePathOrDict) -> StrOrBytesPath: @@ -87,14 +97,15 @@ def singer_messages() -> Iterator[TDataItem]: else: state_params = () # type: ignore - pipe_iterator = singer_process_pipe(venv, - tap_name, - "--config", - os.path.abspath(config_file_path), - "--catalog", - os.path.abspath(catalog_file_path), - *state_params - ) + pipe_iterator = singer_process_pipe( + venv, + tap_name, + "--config", + os.path.abspath(config_file_path), + "--catalog", + os.path.abspath(catalog_file_path), + *state_params + ) yield from get_source_from_stream(pipe_iterator, state) return singer_messages diff --git a/docs/examples/archive/sources/sql_query.py b/docs/examples/archive/sources/sql_query.py index effa8740d5..8cd60992b2 100644 --- a/docs/examples/archive/sources/sql_query.py +++ b/docs/examples/archive/sources/sql_query.py @@ -12,23 +12,30 @@ # import gracefully and produce nice exception that explains the user what to do import pandas except ImportError: - raise MissingDependencyException("SQL Query Source", ["pandas"], "SQL Query Source temporarily uses pandas as DB interface") + raise 
MissingDependencyException( + "SQL Query Source", ["pandas"], "SQL Query Source temporarily uses pandas as DB interface" + ) try: from sqlalchemy.exc import NoSuchModuleError except ImportError: - raise MissingDependencyException("SQL Query Source", ["sqlalchemy"], "SQL Query Source temporarily uses pandas as DB interface") + raise MissingDependencyException( + "SQL Query Source", + ["sqlalchemy"], + "SQL Query Source temporarily uses pandas as DB interface", + ) -def _query_data( - f: AnyFun -) -> Iterator[DictStrAny]: - +def _query_data(f: AnyFun) -> Iterator[DictStrAny]: try: items = f() except NoSuchModuleError as m_exc: if "redshift.redshift_connector" in str(m_exc): - raise MissingDependencyException("SQL Query Source", ["sqlalchemy-redshift", "redshift_connector"], "Redshift dialect support for SqlAlchemy") + raise MissingDependencyException( + "SQL Query Source", + ["sqlalchemy-redshift", "redshift_connector"], + "Redshift dialect support for SqlAlchemy", + ) raise for i in items: @@ -46,11 +53,21 @@ def query_table( coerce_float: bool = True, parse_dates: Any = None, columns: List[str] = None, - chunk_size: int = 1000 + chunk_size: int = 1000, ) -> Any: print(credentials) assert isinstance(credentials, ConnectionStringCredentials) - f = partial(pandas.read_sql_table, table_name, credentials.to_native_representation(), table_schema_name, None, coerce_float, parse_dates, columns, chunksize=chunk_size) + f = partial( + pandas.read_sql_table, + table_name, + credentials.to_native_representation(), + table_schema_name, + None, + coerce_float, + parse_dates, + columns, + chunksize=chunk_size, + ) # if resource is returned from decorator function, it will override the hints from decorator return dlt.resource(_query_data(f), name=table_name) @@ -62,8 +79,18 @@ def query_sql( coerce_float: bool = True, parse_dates: Any = None, chunk_size: int = 1000, - dtype: Any = None + dtype: Any = None, ) -> Iterator[TDataItem]: assert isinstance(credentials, ConnectionStringCredentials) - f = partial(pandas.read_sql_query, sql, credentials.to_native_representation(), None, coerce_float, None, parse_dates, chunk_size, dtype) + f = partial( + pandas.read_sql_query, + sql, + credentials.to_native_representation(), + None, + coerce_float, + None, + parse_dates, + chunk_size, + dtype, + ) yield from _query_data(f) diff --git a/docs/examples/chess/chess.py b/docs/examples/chess/chess.py index f136e49a0a..84fbf3cb07 100644 --- a/docs/examples/chess/chess.py +++ b/docs/examples/chess/chess.py @@ -10,8 +10,13 @@ @dlt.source -def chess(chess_url: str = dlt.config.value, title: str = "GM", max_players: int = 2, year: int = 2022, month: int = 10) -> Any: - +def chess( + chess_url: str = dlt.config.value, + title: str = "GM", + max_players: int = 2, + year: int = 2022, + month: int = 10, +) -> Any: def _get_data_with_retry(path: str) -> StrAny: r = client.get(f"{chess_url}{path}") return r.json() # type: ignore @@ -29,7 +34,7 @@ def players() -> Iterator[TDataItems]: @dlt.defer def players_profiles(username: Any) -> TDataItems: print(f"getting {username} profile via thread {threading.current_thread().name}") - sleep(1) # add some latency to show parallel runs + sleep(1) # add some latency to show parallel runs return _get_data_with_retry(f"player/{username}") # this resource takes data from players and returns games for the last month if not specified otherwise @@ -41,6 +46,7 @@ def players_games(username: Any) -> Iterator[TDataItems]: return players(), players_profiles, players_games + if __name__ == "__main__": 
print("You must run this from the docs/examples/chess folder") assert os.getcwd().endswith("chess") @@ -48,12 +54,7 @@ def players_games(username: Any) -> Iterator[TDataItems]: # look for parallel run configuration in `config.toml`! # mind the full_refresh: it makes the pipeline to load to a distinct dataset each time it is run and always is resetting the schema and state info = dlt.pipeline( - pipeline_name="chess_games", - destination="postgres", - dataset_name="chess", - full_refresh=True - ).run( - chess(max_players=5, month=9) - ) + pipeline_name="chess_games", destination="postgres", dataset_name="chess", full_refresh=True + ).run(chess(max_players=5, month=9)) # display where the data went print(info) diff --git a/docs/examples/chess/chess_dbt.py b/docs/examples/chess/chess_dbt.py index 4ee51f6b50..f453e53a38 100644 --- a/docs/examples/chess/chess_dbt.py +++ b/docs/examples/chess/chess_dbt.py @@ -21,4 +21,3 @@ # run all the tests tests = transforms.test() print(tests) - diff --git a/docs/examples/chess_production/chess.py b/docs/examples/chess_production/chess.py index 79b573fe43..5b767f0eb6 100644 --- a/docs/examples/chess_production/chess.py +++ b/docs/examples/chess_production/chess.py @@ -6,6 +6,7 @@ from dlt.common.typing import StrAny, TDataItems from dlt.sources.helpers.requests import client + @dlt.source def chess( chess_url: str = dlt.config.value, @@ -31,9 +32,7 @@ def players() -> Iterator[TDataItems]: @dlt.transformer(data_from=players, write_disposition="replace") @dlt.defer def players_profiles(username: Any) -> TDataItems: - print( - f"getting {username} profile via thread {threading.current_thread().name}" - ) + print(f"getting {username} profile via thread {threading.current_thread().name}") sleep(1) # add some latency to show parallel runs return _get_data_with_retry(f"player/{username}") @@ -61,6 +60,7 @@ def players_games(username: Any) -> Iterator[TDataItems]: MAX_PLAYERS = 5 + def load_data_with_retry(pipeline, data): try: for attempt in Retrying( @@ -70,9 +70,7 @@ def load_data_with_retry(pipeline, data): reraise=True, ): with attempt: - logger.info( - f"Running the pipeline, attempt={attempt.retry_state.attempt_number}" - ) + logger.info(f"Running the pipeline, attempt={attempt.retry_state.attempt_number}") load_info = pipeline.run(data) logger.info(str(load_info)) @@ -80,16 +78,12 @@ def load_data_with_retry(pipeline, data): load_info.raise_on_failed_jobs() # send notification send_slack_message( - pipeline.runtime_config.slack_incoming_hook, - "Data was successfully loaded!" + pipeline.runtime_config.slack_incoming_hook, "Data was successfully loaded!" ) except Exception: # we get here after all the failed retries # send notification - send_slack_message( - pipeline.runtime_config.slack_incoming_hook, - "Something went wrong!" 
- ) + send_slack_message(pipeline.runtime_config.slack_incoming_hook, "Something went wrong!") raise # we get here after a successful attempt @@ -98,18 +92,14 @@ def load_data_with_retry(pipeline, data): # print the information on the first load package and all jobs inside logger.info(f"First load package info: {load_info.load_packages[0]}") # print the information on the first completed job in first load package - logger.info( - f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}" - ) + logger.info(f"First completed job info: {load_info.load_packages[0].jobs['completed_jobs'][0]}") # check for schema updates: schema_updates = [p.schema_update for p in load_info.load_packages] # send notifications if there are schema updates if schema_updates: # send notification - send_slack_message( - pipeline.runtime_config.slack_incoming_hook, "Schema was updated!" - ) + send_slack_message(pipeline.runtime_config.slack_incoming_hook, "Schema was updated!") # To run simple tests with `sql_client`, such as checking table counts and # warning if there is no data, you can use the `execute_query` method @@ -160,4 +150,4 @@ def load_data_with_retry(pipeline, data): ) # get data for a few famous players data = chess(chess_url="https://api.chess.com/pub/", max_players=MAX_PLAYERS) - load_data_with_retry(pipeline, data) \ No newline at end of file + load_data_with_retry(pipeline, data) diff --git a/docs/examples/incremental_loading/zendesk.py b/docs/examples/incremental_loading/zendesk.py index 3f433e3fef..6113f98793 100644 --- a/docs/examples/incremental_loading/zendesk.py +++ b/docs/examples/incremental_loading/zendesk.py @@ -9,7 +9,7 @@ @dlt.source(max_table_nesting=2) def zendesk_support( - credentials: Dict[str, str]=dlt.secrets.value, + credentials: Dict[str, str] = dlt.secrets.value, start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008 end_date: Optional[TAnyDateTime] = None, ): @@ -101,9 +101,7 @@ def get_pages( # make request and keep looping until there is no next page get_url = f"{url}{endpoint}" while get_url: - response = client.get( - get_url, headers=headers, auth=auth, params=params - ) + response = client.get(get_url, headers=headers, auth=auth, params=params) response.raise_for_status() response_json = response.json() result = response_json[data_point_name] @@ -122,4 +120,4 @@ def get_pages( ) load_info = pipeline.run(zendesk_support()) - print(load_info) \ No newline at end of file + print(load_info) diff --git a/docs/examples/nested_data/nested_data.py b/docs/examples/nested_data/nested_data.py index 3e4a1295c3..7f85f0522e 100644 --- a/docs/examples/nested_data/nested_data.py +++ b/docs/examples/nested_data/nested_data.py @@ -13,6 +13,7 @@ CHUNK_SIZE = 10000 + # You can limit how deep dlt goes when generating child tables. # By default, the library will descend and generate child tables # for all nested lists, without a limit. 
@@ -81,6 +82,7 @@ def load_documents(self) -> Iterator[TDataItem]: while docs_slice := list(islice(cursor, CHUNK_SIZE)): yield map_nested_in_place(convert_mongo_objs, docs_slice) + def convert_mongo_objs(value: Any) -> Any: if isinstance(value, (ObjectId, Decimal128)): return str(value) @@ -98,9 +100,7 @@ def convert_mongo_objs(value: Any) -> Any: destination="duckdb", dataset_name="unpacked_data", ) - source_data = mongodb_collection( - collection="movies", write_disposition="replace" - ) + source_data = mongodb_collection(collection="movies", write_disposition="replace") load_info = pipeline.run(source_data) print(load_info) @@ -114,9 +114,7 @@ def convert_mongo_objs(value: Any) -> Any: destination="duckdb", dataset_name="not_unpacked_data", ) - source_data = mongodb_collection( - collection="movies", write_disposition="replace" - ) + source_data = mongodb_collection(collection="movies", write_disposition="replace") source_data.max_table_nesting = 0 load_info = pipeline.run(source_data) print(load_info) @@ -130,9 +128,7 @@ def convert_mongo_objs(value: Any) -> Any: destination="duckdb", dataset_name="unpacked_data_without_cast", ) - source_data = mongodb_collection( - collection="movies", write_disposition="replace" - ) + source_data = mongodb_collection(collection="movies", write_disposition="replace") source_data.movies.apply_hints(columns={"cast": {"data_type": "complex"}}) load_info = pipeline.run(source_data) print(load_info) diff --git a/docs/examples/transformers/pokemon.py b/docs/examples/transformers/pokemon.py index ce8cc0142c..97b9a98b11 100644 --- a/docs/examples/transformers/pokemon.py +++ b/docs/examples/transformers/pokemon.py @@ -45,10 +45,8 @@ def species(pokemon_details): # 2. send pokemon details into `species` transformer to get species details # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once - return ( - pokemon_list | pokemon, - pokemon_list | pokemon | species - ) + return (pokemon_list | pokemon, pokemon_list | pokemon | species) + if __name__ == "__main__": # build duck db pipeline @@ -58,4 +56,4 @@ def species(pokemon_details): # the pokemon_list resource does not need to be loaded load_info = pipeline.run(source("https://pokeapi.co/api/v2/pokemon")) - print(load_info) \ No newline at end of file + print(load_info) diff --git a/docs/website/docs/conftest.py b/docs/website/docs/conftest.py index c97ce2700b..d108089342 100644 --- a/docs/website/docs/conftest.py +++ b/docs/website/docs/conftest.py @@ -7,30 +7,52 @@ from dlt.common.configuration.container import Container # patch which providers to enable -from dlt.common.configuration.providers import StringTomlProvider, EnvironProvider, SecretsTomlProvider, ConfigTomlProvider -from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext, ConfigProvidersConfiguration - -from tests.utils import patch_home_dir, autouse_test_storage, preserve_environ, duckdb_pipeline_location, wipe_pipeline +from dlt.common.configuration.providers import ( + StringTomlProvider, + EnvironProvider, + SecretsTomlProvider, + ConfigTomlProvider, +) +from dlt.common.configuration.specs.config_providers_context import ( + ConfigProvidersContext, + ConfigProvidersConfiguration, +) + +from tests.utils import ( + patch_home_dir, + autouse_test_storage, + preserve_environ, + duckdb_pipeline_location, + wipe_pipeline, +) @pytest.fixture(autouse=True) def setup_tests(request): # always set working dir to main website folder - dname = os.path.dirname(request.module.__file__) + dname = 
os.path.dirname(request.module.__file__) config_dir = dname + "/.dlt" # inject provider context so the original providers are restored at the end def _initial_providers(): - return [EnvironProvider(), SecretsTomlProvider(project_dir=config_dir, add_global_config=False), ConfigTomlProvider(project_dir=config_dir, add_global_config=False)] + return [ + EnvironProvider(), + SecretsTomlProvider(project_dir=config_dir, add_global_config=False), + ConfigTomlProvider(project_dir=config_dir, add_global_config=False), + ] glob_ctx = ConfigProvidersContext() glob_ctx.providers = _initial_providers() - with set_working_dir(dname), Container().injectable_context(glob_ctx), patch("dlt.common.configuration.specs.config_providers_context.ConfigProvidersContext.initial_providers", _initial_providers): + with set_working_dir(dname), Container().injectable_context(glob_ctx), patch( + "dlt.common.configuration.specs.config_providers_context.ConfigProvidersContext.initial_providers", + _initial_providers, + ): yield - def pytest_configure(config): # push sentry to ci - os.environ["RUNTIME__SENTRY_DSN"] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + os.environ["RUNTIME__SENTRY_DSN"] = ( + "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + ) diff --git a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py index beb1c862cc..4cb960b19f 100644 --- a/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py +++ b/docs/website/docs/dlt-ecosystem/transformations/dbt/dbt-snippets.py @@ -10,11 +10,14 @@ def run_dbt_standalone_snippet() -> None: working_dir=".", # the package below will be cloned to current dir package_location="https://github.com/dbt-labs/jaffle_shop.git", package_profiles_dir=os.path.abspath("."), # profiles.yml must be placed in this dir - package_profile_name="duckdb_dlt_dbt_test" # name of the profile + package_profile_name="duckdb_dlt_dbt_test", # name of the profile ) models = runner.run_all() # @@@DLT_SNIPPET_END run_dbt_standalone for m in models: - print(f"Model {m.model_name} materialized in {m.time} with status {m.status} and message {m.message}") + print( + f"Model {m.model_name} materialized in {m.time} with status {m.status} and message" + f" {m.message}" + ) diff --git a/docs/website/docs/examples/chess_production/code/chess-snippets.py b/docs/website/docs/examples/chess_production/code/chess-snippets.py index 1cd1c86aed..f6d752c911 100644 --- a/docs/website/docs/examples/chess_production/code/chess-snippets.py +++ b/docs/website/docs/examples/chess_production/code/chess-snippets.py @@ -38,9 +38,7 @@ def players() -> Iterator[TDataItems]: @dlt.transformer(data_from=players, write_disposition="replace") @dlt.defer def players_profiles(username: Any) -> TDataItems: - print( - f"getting {username} profile via thread {threading.current_thread().name}" - ) + print(f"getting {username} profile via thread {threading.current_thread().name}") sleep(1) # add some latency to show parallel runs return _get_data_with_retry(f"player/{username}") @@ -89,16 +87,12 @@ def load_data_with_retry(pipeline, data): load_info.raise_on_failed_jobs() # send notification send_slack_message( - pipeline.runtime_config.slack_incoming_hook, - "Data was successfully loaded!" + pipeline.runtime_config.slack_incoming_hook, "Data was successfully loaded!" 
) except Exception: # we get here after all the failed retries # send notification - send_slack_message( - pipeline.runtime_config.slack_incoming_hook, - "Something went wrong!" - ) + send_slack_message(pipeline.runtime_config.slack_incoming_hook, "Something went wrong!") raise # we get here after a successful attempt @@ -116,9 +110,7 @@ def load_data_with_retry(pipeline, data): # send notifications if there are schema updates if schema_updates: # send notification - send_slack_message( - pipeline.runtime_config.slack_incoming_hook, "Schema was updated!" - ) + send_slack_message(pipeline.runtime_config.slack_incoming_hook, "Schema was updated!") # To run simple tests with `sql_client`, such as checking table counts and # warning if there is no data, you can use the `execute_query` method diff --git a/docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py b/docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py index 488cb8a7cb..e86d9e5560 100644 --- a/docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py +++ b/docs/website/docs/examples/connector_x_arrow/code/load_arrow-snippets.py @@ -1,5 +1,4 @@ def connector_x_snippet() -> None: - # @@@DLT_SNIPPET_START markdown_source import connectorx as cx @@ -8,26 +7,24 @@ def connector_x_snippet() -> None: from dlt.sources.credentials import ConnectionStringCredentials def read_sql_x( - conn_str: ConnectionStringCredentials = dlt.secrets.value, - query: str = dlt.config.value + conn_str: ConnectionStringCredentials = dlt.secrets.value, query: str = dlt.config.value ): - yield cx.read_sql(conn_str.to_native_representation(), query, return_type="arrow2", protocol="binary") + yield cx.read_sql( + conn_str.to_native_representation(), query, return_type="arrow2", protocol="binary" + ) # create genome resource with merge on `upid` primary key genome = dlt.resource( - name="genome", - write_disposition="merge", - primary_key="upid", - standalone=True + name="genome", write_disposition="merge", primary_key="upid", standalone=True )(read_sql_x)( "mysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam", # type: ignore[arg-type] - "SELECT * FROM genome ORDER BY created LIMIT 1000" + "SELECT * FROM genome ORDER BY created LIMIT 1000", ) # add incremental on created at genome.apply_hints(incremental=dlt.sources.incremental("created")) # @@@DLT_SNIPPET_END markdown_source # @@@DLT_SNIPPET_START markdown_pipeline - __name__ = "__main__" # @@@DLT_REMOVE + __name__ = "__main__" # @@@DLT_REMOVE if __name__ == "__main__": pipeline = dlt.pipeline(destination="duckdb") print(pipeline.run(genome)) @@ -38,4 +35,3 @@ def read_sql_x( # check that stuff was loaded row_counts = pipeline.last_trace.last_normalize_info.row_counts assert row_counts["genome"] == 1000 - diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index 4c3d3f0b3a..569b554f16 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -7,7 +7,6 @@ @skipifgithubfork def incremental_snippet() -> None: - # @@@DLT_SNIPPET_START example # @@@DLT_SNIPPET_START markdown_source from typing import Optional, Dict, Any, Tuple @@ -18,11 +17,12 @@ def incremental_snippet() -> None: from dlt.common.typing import TAnyDateTime from dlt.sources.helpers.requests import client - @dlt.source(max_table_nesting=2) def zendesk_support( - credentials: Dict[str, 
str]=dlt.secrets.value, - start_date: Optional[TAnyDateTime] = pendulum.datetime(year=2000, month=1, day=1), # noqa: B008 + credentials: Dict[str, str] = dlt.secrets.value, + start_date: Optional[TAnyDateTime] = pendulum.datetime( + year=2000, month=1, day=1 + ), # noqa: B008 end_date: Optional[TAnyDateTime] = None, ): """ @@ -114,9 +114,7 @@ def get_pages( # make request and keep looping until there is no next page get_url = f"{url}{endpoint}" while get_url: - response = client.get( - get_url, headers=headers, auth=auth, params=params - ) + response = client.get(get_url, headers=headers, auth=auth, params=params) response.raise_for_status() response_json = response.json() result = response_json[data_point_name] @@ -127,9 +125,8 @@ def get_pages( if not response_json["end_of_stream"]: get_url = response_json["next_page"] - # @@@DLT_SNIPPET_START markdown_pipeline - __name__ = "__main__" # @@@DLT_REMOVE + __name__ = "__main__" # @@@DLT_REMOVE if __name__ == "__main__": # create dlt pipeline pipeline = dlt.pipeline( @@ -144,4 +141,3 @@ def get_pages( # check that stuff was loaded row_counts = pipeline.last_trace.last_normalize_info.row_counts assert row_counts["ticket_events"] == 24 - diff --git a/docs/website/docs/examples/nested_data/code/nested_data-snippets.py b/docs/website/docs/examples/nested_data/code/nested_data-snippets.py index d6328fc2c5..e360dc4534 100644 --- a/docs/website/docs/examples/nested_data/code/nested_data-snippets.py +++ b/docs/website/docs/examples/nested_data/code/nested_data-snippets.py @@ -104,14 +104,12 @@ def convert_mongo_objs(value: Any) -> Any: destination="duckdb", dataset_name="unpacked_data", ) - source_data = mongodb_collection( - collection="movies", write_disposition="replace" - ) + source_data = mongodb_collection(collection="movies", write_disposition="replace") load_info = pipeline.run(source_data) print(load_info) tables = pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE tables.pop("_dlt_pipeline_state") # @@@DLT_REMOVE - assert (len(tables) == 7), pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE + assert len(tables) == 7, pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE # The second method involves setting the max_table_nesting attribute directly # on the source data object. @@ -123,15 +121,13 @@ def convert_mongo_objs(value: Any) -> Any: destination="duckdb", dataset_name="not_unpacked_data", ) - source_data = mongodb_collection( - collection="movies", write_disposition="replace" - ) + source_data = mongodb_collection(collection="movies", write_disposition="replace") source_data.max_table_nesting = 0 load_info = pipeline.run(source_data) print(load_info) tables = pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE tables.pop("_dlt_pipeline_state") # @@@DLT_REMOVE - assert (len(tables) == 1), pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE + assert len(tables) == 1, pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE # The third method involves applying data type hints to specific columns in the data. 
# In this case, we tell dlt that column 'cast' (containing a list of actors) @@ -142,15 +138,13 @@ def convert_mongo_objs(value: Any) -> Any: destination="duckdb", dataset_name="unpacked_data_without_cast", ) - source_data = mongodb_collection( - collection="movies", write_disposition="replace" - ) + source_data = mongodb_collection(collection="movies", write_disposition="replace") source_data.movies.apply_hints(columns={"cast": {"data_type": "complex"}}) load_info = pipeline.run(source_data) print(load_info) tables = pipeline.last_trace.last_normalize_info.row_counts # @@@DLT_REMOVE tables.pop("_dlt_pipeline_state") # @@@DLT_REMOVE - assert (len(tables) == 6), pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE + assert len(tables) == 6, pipeline.last_trace.last_normalize_info # @@@DLT_REMOVE # @@@DLT_SNIPPET_END nested_data_run # @@@DLT_SNIPPET_END example diff --git a/docs/website/docs/examples/transformers/code/pokemon-snippets.py b/docs/website/docs/examples/transformers/code/pokemon-snippets.py index 726bcf7e2e..d8fe4f41ba 100644 --- a/docs/website/docs/examples/transformers/code/pokemon-snippets.py +++ b/docs/website/docs/examples/transformers/code/pokemon-snippets.py @@ -1,11 +1,8 @@ - def transformers_snippet() -> None: - # @@@DLT_SNIPPET_START example import dlt from dlt.sources.helpers import requests - @dlt.source(max_table_nesting=2) def source(pokemon_api_url: str): """""" @@ -49,12 +46,9 @@ def species(pokemon_details): # 2. send pokemon details into `species` transformer to get species details # NOTE: dlt is smart enough to get data from pokemon_list and pokemon details once - return ( - pokemon_list | pokemon, - pokemon_list | pokemon | species - ) + return (pokemon_list | pokemon, pokemon_list | pokemon | species) - __name__ = "__main__" # @@@DLT_REMOVE + __name__ = "__main__" # @@@DLT_REMOVE if __name__ == "__main__": # build duck db pipeline pipeline = dlt.pipeline( diff --git a/docs/website/docs/getting-started-snippets.py b/docs/website/docs/getting-started-snippets.py index be21a7f757..618ba1a406 100644 --- a/docs/website/docs/getting-started-snippets.py +++ b/docs/website/docs/getting-started-snippets.py @@ -3,19 +3,13 @@ def start_snippet() -> None: - # @@@DLT_SNIPPET_START start import dlt - data = [ - {'id': 1, 'name': 'Alice'}, - {'id': 2, 'name': 'Bob'} - ] + data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] pipeline = dlt.pipeline( - pipeline_name='quick_start', - destination='duckdb', - dataset_name='mydata' + pipeline_name="quick_start", destination="duckdb", dataset_name="mydata" ) load_info = pipeline.run(data, table_name="users") @@ -26,19 +20,18 @@ def start_snippet() -> None: def json_snippet() -> None: - # @@@DLT_SNIPPET_START json import dlt from dlt.common import json - with open("./assets/json_file.json", 'rb') as file: + with open("./assets/json_file.json", "rb") as file: data = json.load(file) pipeline = dlt.pipeline( - pipeline_name='from_json', - destination='duckdb', - dataset_name='mydata', + pipeline_name="from_json", + destination="duckdb", + dataset_name="mydata", ) # NOTE: test data that we load is just a dictionary so we enclose it in a list @@ -52,19 +45,18 @@ def json_snippet() -> None: def csv_snippet() -> None: - # @@@DLT_SNIPPET_START csv import dlt import pandas as pd owid_disasters_csv = "https://raw.githubusercontent.com/owid/owid-datasets/master/datasets/Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020)/Natural%20disasters%20from%201900%20to%202019%20-%20EMDAT%20(2020).csv" df = 
pd.read_csv(owid_disasters_csv) - data = df.to_dict(orient='records') + data = df.to_dict(orient="records") pipeline = dlt.pipeline( - pipeline_name='from_csv', - destination='duckdb', - dataset_name='mydata', + pipeline_name="from_csv", + destination="duckdb", + dataset_name="mydata", ) load_info = pipeline.run(data, table_name="natural_disasters") @@ -75,7 +67,6 @@ def csv_snippet() -> None: def api_snippet() -> None: - # @@@DLT_SNIPPET_START api import dlt from dlt.sources.helpers import requests @@ -87,9 +78,9 @@ def api_snippet() -> None: response.raise_for_status() pipeline = dlt.pipeline( - pipeline_name='from_api', - destination='duckdb', - dataset_name='github_data', + pipeline_name="from_api", + destination="duckdb", + dataset_name="github_data", ) # the response contains a list of issues load_info = pipeline.run(response.json(), table_name="issues") @@ -101,7 +92,6 @@ def api_snippet() -> None: def db_snippet() -> None: - # @@@DLT_SNIPPET_START db import dlt from sqlalchemy import create_engine @@ -112,19 +102,18 @@ def db_snippet() -> None: engine = create_engine("mysql+pymysql://rfamro@mysql-rfam-public.ebi.ac.uk:4497/Rfam") with engine.connect() as conn: # select genome table, stream data in batches of 100 elements - rows = conn.execution_options(yield_per=100).exec_driver_sql("SELECT * FROM genome LIMIT 1000") + rows = conn.execution_options(yield_per=100).exec_driver_sql( + "SELECT * FROM genome LIMIT 1000" + ) pipeline = dlt.pipeline( - pipeline_name='from_database', - destination='duckdb', - dataset_name='genome_data', + pipeline_name="from_database", + destination="duckdb", + dataset_name="genome_data", ) # here we convert the rows into dictionaries on the fly with a map function - load_info = pipeline.run( - map(lambda row: dict(row._mapping), rows), - table_name="genome" - ) + load_info = pipeline.run(map(lambda row: dict(row._mapping), rows), table_name="genome") print(load_info) # @@@DLT_SNIPPET_END db @@ -133,19 +122,15 @@ def db_snippet() -> None: def replace_snippet() -> None: - # @@@DLT_SNIPPET_START replace import dlt - data = [ - {'id': 1, 'name': 'Alice'}, - {'id': 2, 'name': 'Bob'} - ] + data = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] pipeline = dlt.pipeline( - pipeline_name='replace_data', - destination='duckdb', - dataset_name='mydata', + pipeline_name="replace_data", + destination="duckdb", + dataset_name="mydata", ) load_info = pipeline.run(data, table_name="users", write_disposition="replace") @@ -156,7 +141,6 @@ def replace_snippet() -> None: def incremental_snippet() -> None: - # @@@DLT_SNIPPET_START incremental import dlt from dlt.sources.helpers import requests @@ -183,11 +167,10 @@ def get_issues( break url = response.links["next"]["url"] - pipeline = dlt.pipeline( - pipeline_name='github_issues_incremental', - destination='duckdb', - dataset_name='github_data_append', + pipeline_name="github_issues_incremental", + destination="duckdb", + dataset_name="github_data_append", ) load_info = pipeline.run(get_issues) row_counts = pipeline.last_trace.last_normalize_info @@ -201,7 +184,6 @@ def get_issues( def incremental_merge_snippet() -> None: - # @@@DLT_SNIPPET_START incremental_merge import dlt from dlt.sources.helpers import requests @@ -212,7 +194,7 @@ def incremental_merge_snippet() -> None: primary_key="id", ) def get_issues( - updated_at = dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") + updated_at=dlt.sources.incremental("updated_at", initial_value="1970-01-01T00:00:00Z") ): # NOTE: we read only open 
issues to minimize number of calls to the API. There's a limit of ~50 calls for not authenticated Github users url = f"https://api.github.com/repos/dlt-hub/dlt/issues?since={updated_at.last_value}&per_page=100&sort=updated&directions=desc&state=open" @@ -228,9 +210,9 @@ def get_issues( url = response.links["next"]["url"] pipeline = dlt.pipeline( - pipeline_name='github_issues_merge', - destination='duckdb', - dataset_name='github_data_merge', + pipeline_name="github_issues_merge", + destination="duckdb", + dataset_name="github_data_merge", ) load_info = pipeline.run(get_issues) row_counts = pipeline.last_trace.last_normalize_info @@ -244,15 +226,12 @@ def get_issues( def table_dispatch_snippet() -> None: - # @@@DLT_SNIPPET_START table_dispatch import dlt from dlt.sources.helpers import requests @dlt.resource(primary_key="id", table_name=lambda i: i["type"], write_disposition="append") - def repo_events( - last_created_at = dlt.sources.incremental("created_at") - ): + def repo_events(last_created_at=dlt.sources.incremental("created_at")): url = "https://api.github.com/repos/dlt-hub/dlt/events?per_page=100" while True: @@ -271,9 +250,9 @@ def repo_events( url = response.links["next"]["url"] pipeline = dlt.pipeline( - pipeline_name='github_events', - destination='duckdb', - dataset_name='github_events_data', + pipeline_name="github_events", + destination="duckdb", + dataset_name="github_events_data", ) load_info = pipeline.run(repo_events) row_counts = pipeline.last_trace.last_normalize_info @@ -285,6 +264,7 @@ def repo_events( assert_load_info(load_info) + def pdf_to_weaviate_snippet() -> None: # @@@DLT_SNIPPET_START pdf_to_weaviate import os @@ -293,7 +273,6 @@ def pdf_to_weaviate_snippet() -> None: from dlt.destinations.impl.weaviate import weaviate_adapter from PyPDF2 import PdfReader - @dlt.resource(selected=False) def list_files(folder_path: str): folder_path = os.path.abspath(folder_path) @@ -302,10 +281,9 @@ def list_files(folder_path: str): yield { "file_name": filename, "file_path": file_path, - "mtime": os.path.getmtime(file_path) + "mtime": os.path.getmtime(file_path), } - @dlt.transformer(primary_key="page_id", write_disposition="merge") def pdf_to_text(file_item, separate_pages: bool = False): if not separate_pages: @@ -319,10 +297,7 @@ def pdf_to_text(file_item, separate_pages: bool = False): page_item["page_id"] = file_item["file_name"] + "_" + str(page_no) yield page_item - pipeline = dlt.pipeline( - pipeline_name='pdf_to_text', - destination='weaviate' - ) + pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate") # this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf" # (3) sends them to pdf_to_text transformer with pipe (|) operator @@ -335,9 +310,7 @@ def pdf_to_text(file_item, separate_pages: bool = False): pdf_pipeline.table_name = "InvoiceText" # use weaviate_adapter to tell destination to vectorize "text" column - load_info = pipeline.run( - weaviate_adapter(pdf_pipeline, vectorize="text") - ) + load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text")) row_counts = pipeline.last_trace.last_normalize_info print(row_counts) print("------") @@ -353,4 +326,3 @@ def pdf_to_text(file_item, separate_pages: bool = False): # get text of all the invoices in InvoiceText class we just created above print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do()) # @@@DLT_SNIPPET_END pdf_to_weaviate_read - diff --git a/docs/website/docs/intro-snippets.py 
b/docs/website/docs/intro-snippets.py index 2924cd34de..bef758d0aa 100644 --- a/docs/website/docs/intro-snippets.py +++ b/docs/website/docs/intro-snippets.py @@ -1,25 +1,24 @@ from tests.pipeline.utils import assert_load_info -def intro_snippet() -> None: +def intro_snippet() -> None: # @@@DLT_SNIPPET_START index import dlt from dlt.sources.helpers import requests + # Create a dlt pipeline that will load # chess player data to the DuckDB destination pipeline = dlt.pipeline( - pipeline_name='chess_pipeline', - destination='duckdb', - dataset_name='player_data' + pipeline_name="chess_pipeline", destination="duckdb", dataset_name="player_data" ) # Grab some player data from Chess.com API data = [] - for player in ['magnuscarlsen', 'rpragchess']: - response = requests.get(f'https://api.chess.com/pub/player/{player}') + for player in ["magnuscarlsen", "rpragchess"]: + response = requests.get(f"https://api.chess.com/pub/player/{player}") response.raise_for_status() data.append(response.json()) # Extract, normalize, and load the data - load_info = pipeline.run(data, table_name='player') + load_info = pipeline.run(data, table_name="player") # @@@DLT_SNIPPET_END index assert_load_info(load_info) diff --git a/docs/website/docs/reference/performance_snippets/performance-snippets.py b/docs/website/docs/reference/performance_snippets/performance-snippets.py index d0c2c46acd..621cf21032 100644 --- a/docs/website/docs/reference/performance_snippets/performance-snippets.py +++ b/docs/website/docs/reference/performance_snippets/performance-snippets.py @@ -1,7 +1,7 @@ from utils import parse_toml_file -def parallel_config_snippet() -> None: +def parallel_config_snippet() -> None: # @@@DLT_SNIPPET_START parallel_config import os import dlt @@ -13,7 +13,10 @@ def read_table(limit): rows = iter(range(limit)) while item_slice := list(islice(rows, 1000)): now = pendulum.now().isoformat() - yield [{"row": _id, "description": "this is row with id {_id}", "timestamp": now} for _id in item_slice] + yield [ + {"row": _id, "description": "this is row with id {_id}", "timestamp": now} + for _id in item_slice + ] # this prevents process pool to run the initialization code again if __name__ == "__main__" or "PYTEST_CURRENT_TEST" in os.environ: @@ -55,7 +58,6 @@ def get_details(item_id): # just return the results, if you yield, generator will be evaluated in main thread return {"row": item_id} - # evaluate the pipeline and print all the items # resources are iterators and they are evaluated in the same way in the pipeline.run print(list(list_items(0, 10) | get_details)) @@ -72,7 +74,6 @@ async def a_get_details(item_id): # just return the results, if you yield, generator will be evaluated in main thread return {"row": item_id} - print(list(list_items(0, 10) | a_get_details)) # @@@DLT_SNIPPET_END parallel_extract_awaitables @@ -88,6 +89,7 @@ def get_rows(limit): def database_cursor(): # here we yield each row returned from database separately yield from get_rows(10000) + # @@@DLT_SNIPPET_END performance_chunking # @@@DLT_SNIPPET_START performance_chunking_chunk @@ -100,6 +102,7 @@ def database_cursor_chunked(): while item_slice := list(islice(rows, 1000)): print(f"got chunk of length {len(item_slice)}") yield item_slice + # @@@DLT_SNIPPET_END performance_chunking_chunk assert len(list(database_cursor())) == 10000 @@ -108,6 +111,3 @@ def database_cursor_chunked(): def test_toml_snippets() -> None: parse_toml_file("./toml-snippets.toml") - - - diff --git a/docs/website/docs/utils.py b/docs/website/docs/utils.py index 
36ae49ca65..ce609a61c2 100644 --- a/docs/website/docs/utils.py +++ b/docs/website/docs/utils.py @@ -3,6 +3,7 @@ DLT_MARKER = "@@@DLT_" + def parse_toml_file(filename: str) -> None: # test toml file by going snippet by snippet with open(filename, "r", encoding="utf-8") as f: @@ -17,8 +18,10 @@ def parse_toml_file(filename: str) -> None: try: tomlkit.loads(toml_snippet) except Exception as e: - print(f"Error while testing snippet bewteen: {current_marker} and {line.strip()}") + print( + f"Error while testing snippet bewteen: {current_marker} and {line.strip()}" + ) raise e current_lines = [] current_marker = line.strip() - current_lines.append(line) \ No newline at end of file + current_lines.append(line) diff --git a/docs/website/pydoc_markdown_dlt.py b/docs/website/pydoc_markdown_dlt.py index ff970ef3a2..ed30189dbc 100644 --- a/docs/website/pydoc_markdown_dlt.py +++ b/docs/website/pydoc_markdown_dlt.py @@ -22,4 +22,4 @@ def _process(self, node): c = sub(r"(\n\s*)(>>> ?)", r"\1", c) node.docstring.content = c - return super()._process(node) \ No newline at end of file + return super()._process(node) diff --git a/tests/cases.py b/tests/cases.py index 70c20d74af..62db7ba2b8 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -9,7 +9,12 @@ from dlt.common.data_types import TDataType from dlt.common.typing import StrAny from dlt.common.wei import Wei -from dlt.common.time import ensure_pendulum_datetime, reduce_pendulum_datetime_precision, ensure_pendulum_time, ensure_pendulum_date +from dlt.common.time import ( + ensure_pendulum_datetime, + reduce_pendulum_datetime_precision, + ensure_pendulum_time, + ensure_pendulum_date, +) from dlt.common.schema import TColumnSchema, TTableSchemaColumns @@ -20,14 +25,16 @@ JSON_TYPED_DICT: StrAny = { "str": "string", "decimal": Decimal("21.37"), - "big_decimal": Decimal("115792089237316195423570985008687907853269984665640564039457584007913129639935.1"), + "big_decimal": Decimal( + "115792089237316195423570985008687907853269984665640564039457584007913129639935.1" + ), "datetime": pendulum.parse("2005-04-02T20:37:37.358236Z"), "date": ensure_pendulum_date("2022-02-02"), # "uuid": UUID(_UUID), "hexbytes": HexBytes("0x2137"), - "bytes": b'2137', + "bytes": b"2137", "wei": Wei.from_int256(2137, decimals=2), - "time": ensure_pendulum_time("20:37:37.358236") + "time": ensure_pendulum_time("20:37:37.358236"), } # TODO: a version after PUA decoder (time is not yet implemented end to end) JSON_TYPED_DICT_DECODED = dict(JSON_TYPED_DICT) @@ -42,185 +49,76 @@ "hexbytes": "binary", "bytes": "binary", "wei": "wei", - "time": "time" + "time": "time", } JSON_TYPED_DICT_NESTED = { "dict": dict(JSON_TYPED_DICT), "list_dicts": [dict(JSON_TYPED_DICT), dict(JSON_TYPED_DICT)], "list": list(JSON_TYPED_DICT.values()), - **JSON_TYPED_DICT + **JSON_TYPED_DICT, } JSON_TYPED_DICT_NESTED_DECODED = { "dict": dict(JSON_TYPED_DICT_DECODED), "list_dicts": [dict(JSON_TYPED_DICT_DECODED), dict(JSON_TYPED_DICT_DECODED)], "list": list(JSON_TYPED_DICT_DECODED.values()), - **JSON_TYPED_DICT_DECODED + **JSON_TYPED_DICT_DECODED, } TABLE_UPDATE: List[TColumnSchema] = [ - { - "name": "col1", - "data_type": "bigint", - "nullable": False - }, - { - "name": "col2", - "data_type": "double", - "nullable": False - }, - { - "name": "col3", - "data_type": "bool", - "nullable": False - }, - { - "name": "col4", - "data_type": "timestamp", - "nullable": False - }, - { - "name": "col5", - "data_type": "text", - "nullable": False - }, - { - "name": "col6", - "data_type": "decimal", - "nullable": False - }, - { - 
"name": "col7", - "data_type": "binary", - "nullable": False - }, - { - "name": "col8", - "data_type": "wei", - "nullable": False - }, - { - "name": "col9", - "data_type": "complex", - "nullable": False, - "variant": True - }, - { - "name": "col10", - "data_type": "date", - "nullable": False - }, - { - "name": "col11", - "data_type": "time", - "nullable": False - }, - { - "name": "col1_null", - "data_type": "bigint", - "nullable": True - }, - { - "name": "col2_null", - "data_type": "double", - "nullable": True - }, - { - "name": "col3_null", - "data_type": "bool", - "nullable": True - }, - { - "name": "col4_null", - "data_type": "timestamp", - "nullable": True - }, - { - "name": "col5_null", - "data_type": "text", - "nullable": True - }, - { - "name": "col6_null", - "data_type": "decimal", - "nullable": True - }, - { - "name": "col7_null", - "data_type": "binary", - "nullable": True - }, - { - "name": "col8_null", - "data_type": "wei", - "nullable": True - }, - { - "name": "col9_null", - "data_type": "complex", - "nullable": True, - "variant": True - }, - { - "name": "col10_null", - "data_type": "date", - "nullable": True - }, - { - "name": "col11_null", - "data_type": "time", - "nullable": True - }, - { - "name": "col1_precision", - "data_type": "bigint", - "precision": 16, - "nullable": False - }, - { - "name": "col4_precision", - "data_type": "timestamp", - "precision": 3, - "nullable": False - }, - { - "name": "col5_precision", - "data_type": "text", - "precision": 25, - "nullable": False - }, + {"name": "col1", "data_type": "bigint", "nullable": False}, + {"name": "col2", "data_type": "double", "nullable": False}, + {"name": "col3", "data_type": "bool", "nullable": False}, + {"name": "col4", "data_type": "timestamp", "nullable": False}, + {"name": "col5", "data_type": "text", "nullable": False}, + {"name": "col6", "data_type": "decimal", "nullable": False}, + {"name": "col7", "data_type": "binary", "nullable": False}, + {"name": "col8", "data_type": "wei", "nullable": False}, + {"name": "col9", "data_type": "complex", "nullable": False, "variant": True}, + {"name": "col10", "data_type": "date", "nullable": False}, + {"name": "col11", "data_type": "time", "nullable": False}, + {"name": "col1_null", "data_type": "bigint", "nullable": True}, + {"name": "col2_null", "data_type": "double", "nullable": True}, + {"name": "col3_null", "data_type": "bool", "nullable": True}, + {"name": "col4_null", "data_type": "timestamp", "nullable": True}, + {"name": "col5_null", "data_type": "text", "nullable": True}, + {"name": "col6_null", "data_type": "decimal", "nullable": True}, + {"name": "col7_null", "data_type": "binary", "nullable": True}, + {"name": "col8_null", "data_type": "wei", "nullable": True}, + {"name": "col9_null", "data_type": "complex", "nullable": True, "variant": True}, + {"name": "col10_null", "data_type": "date", "nullable": True}, + {"name": "col11_null", "data_type": "time", "nullable": True}, + {"name": "col1_precision", "data_type": "bigint", "precision": 16, "nullable": False}, + {"name": "col4_precision", "data_type": "timestamp", "precision": 3, "nullable": False}, + {"name": "col5_precision", "data_type": "text", "precision": 25, "nullable": False}, { "name": "col6_precision", "data_type": "decimal", "precision": 6, "scale": 2, - "nullable": False - }, - { - "name": "col7_precision", - "data_type": "binary", - "precision": 19, - "nullable": False - }, - { - "name": "col11_precision", - "data_type": "time", - "precision": 3, - "nullable": False + "nullable": False, }, + 
{"name": "col7_precision", "data_type": "binary", "precision": 19, "nullable": False}, + {"name": "col11_precision", "data_type": "time", "precision": 3, "nullable": False}, ] -TABLE_UPDATE_COLUMNS_SCHEMA: TTableSchemaColumns = {t["name"]:t for t in TABLE_UPDATE} +TABLE_UPDATE_COLUMNS_SCHEMA: TTableSchemaColumns = {t["name"]: t for t in TABLE_UPDATE} -TABLE_ROW_ALL_DATA_TYPES = { +TABLE_ROW_ALL_DATA_TYPES = { "col1": 989127831, "col2": 898912.821982, "col3": True, "col4": "2022-05-23T13:26:45.176451+00:00", "col5": "string data \n \r \x8e 🦆", "col6": Decimal("2323.34"), - "col7": b'binary data \n \r \x8e', + "col7": b"binary data \n \r \x8e", "col8": 2**56 + 92093890840, - "col9": {"complex":[1,2,3,"a"], "link": "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6 \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085"}, + "col9": { + "complex": [1, 2, 3, "a"], + "link": ( + "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\012 \6" + " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" + ), + }, "col10": "2023-02-27", "col11": "13:26:45.176451", "col1_null": None, @@ -238,12 +136,14 @@ "col4_precision": "2022-05-23T13:26:46.167231+00:00", "col5_precision": "string data 2 \n \r \x8e 🦆", "col6_precision": Decimal("2323.34"), - "col7_precision": b'binary data 2 \n \r \x8e', + "col7_precision": b"binary data 2 \n \r \x8e", "col11_precision": "13:26:45.176451", } -def table_update_and_row(exclude_types: Sequence[TDataType] = None, exclude_columns: Sequence[str] = None) -> Tuple[TTableSchemaColumns, StrAny]: +def table_update_and_row( + exclude_types: Sequence[TDataType] = None, exclude_columns: Sequence[str] = None +) -> Tuple[TTableSchemaColumns, StrAny]: """Get a table schema and a row with all possible data types. Optionally exclude some data types from the schema and row. 
""" @@ -251,7 +151,9 @@ def table_update_and_row(exclude_types: Sequence[TDataType] = None, exclude_colu data_row = deepcopy(TABLE_ROW_ALL_DATA_TYPES) exclude_col_names = list(exclude_columns or []) if exclude_types: - exclude_col_names.extend([key for key, value in column_schemas.items() if value["data_type"] in exclude_types]) + exclude_col_names.extend( + [key for key, value in column_schemas.items() if value["data_type"] in exclude_types] + ) for col_name in set(exclude_col_names): del column_schemas[col_name] del data_row[col_name] @@ -262,7 +164,7 @@ def assert_all_data_types_row( db_row: List[Any], parse_complex_strings: bool = False, allow_base64_binary: bool = False, - timestamp_precision:int = 6, + timestamp_precision: int = 6, schema: TTableSchemaColumns = None, ) -> None: # content must equal @@ -276,24 +178,22 @@ def assert_all_data_types_row( if "col4" in expected_rows: parsed_date = pendulum.instance(db_mapping["col4"]) db_mapping["col4"] = reduce_pendulum_datetime_precision(parsed_date, timestamp_precision) - expected_rows['col4'] = reduce_pendulum_datetime_precision( + expected_rows["col4"] = reduce_pendulum_datetime_precision( ensure_pendulum_datetime(expected_rows["col4"]), # type: ignore[arg-type] - timestamp_precision + timestamp_precision, ) if "col4_precision" in expected_rows: parsed_date = pendulum.instance(db_mapping["col4_precision"]) db_mapping["col4_precision"] = reduce_pendulum_datetime_precision(parsed_date, 3) - expected_rows['col4_precision'] = reduce_pendulum_datetime_precision( - ensure_pendulum_datetime(expected_rows["col4_precision"]), # type: ignore[arg-type] - 3 + expected_rows["col4_precision"] = reduce_pendulum_datetime_precision( + ensure_pendulum_datetime(expected_rows["col4_precision"]), 3 # type: ignore[arg-type] ) if "col11_precision" in expected_rows: parsed_time = ensure_pendulum_time(db_mapping["col11_precision"]) db_mapping["col11_precision"] = reduce_pendulum_datetime_precision(parsed_time, 3) - expected_rows['col11_precision'] = reduce_pendulum_datetime_precision( - ensure_pendulum_time(expected_rows["col11_precision"]), # type: ignore[arg-type] - 3 + expected_rows["col11_precision"] = reduce_pendulum_datetime_precision( + ensure_pendulum_time(expected_rows["col11_precision"]), 3 # type: ignore[arg-type] ) # redshift and bigquery return strings from structured fields @@ -307,9 +207,7 @@ def assert_all_data_types_row( except ValueError: if not allow_base64_binary: raise - db_mapping[binary_col] = base64.b64decode( - db_mapping[binary_col], validate=True - ) + db_mapping[binary_col] = base64.b64decode(db_mapping[binary_col], validate=True) else: db_mapping[binary_col] = bytes(db_mapping[binary_col]) @@ -339,7 +237,7 @@ def arrow_table_all_data_types( include_time: bool = True, include_not_normalized_name: bool = True, include_name_clash: bool = False, - num_rows: int = 3 + num_rows: int = 3, ) -> Tuple[Any, List[Dict[str, Any]]]: """Create an arrow object or pandas dataframe with all supported data types. 
@@ -358,7 +256,7 @@ def arrow_table_all_data_types( "decimal": [Decimal(str(round(random.uniform(0, 100), 4))) for _ in range(num_rows)], "bool": [random.choice([True, False]) for _ in range(num_rows)], "string_null": [random.choice(ascii_lowercase) for _ in range(num_rows - 1)] + [None], - "null": pd.Series( [None for _ in range(num_rows)]) + "null": pd.Series([None for _ in range(num_rows)]), } if include_name_clash: @@ -375,9 +273,15 @@ def arrow_table_all_data_types( df = pd.DataFrame(data) # records have normalized identifiers for comparing - rows = df.rename(columns={ - "Pre Normalized Column": "pre_normalized_column", - }).drop(columns=['null']).to_dict("records") + rows = ( + df.rename( + columns={ + "Pre Normalized Column": "pre_normalized_column", + } + ) + .drop(columns=["null"]) + .to_dict("records") + ) if object_format == "pandas": return df, rows elif object_format == "table": diff --git a/tests/cli/cases/deploy_pipeline/debug_pipeline.py b/tests/cli/cases/deploy_pipeline/debug_pipeline.py index 8d87c8ac3d..c49e8b524d 100644 --- a/tests/cli/cases/deploy_pipeline/debug_pipeline.py +++ b/tests/cli/cases/deploy_pipeline/debug_pipeline.py @@ -7,14 +7,17 @@ def example_resource(api_url=dlt.config.value, api_key=dlt.secrets.value, last_i @dlt.source -def example_source(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id = 0): +def example_source(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id=0): # return all the resources to be loaded return example_resource(api_url, api_key, last_id) -if __name__ == '__main__': - p = dlt.pipeline(pipeline_name="debug_pipeline", destination="postgres", dataset_name="debug_pipeline_data", full_refresh=False) - load_info = p.run( - example_source(last_id=819273998) +if __name__ == "__main__": + p = dlt.pipeline( + pipeline_name="debug_pipeline", + destination="postgres", + dataset_name="debug_pipeline_data", + full_refresh=False, ) + load_info = p.run(example_source(last_id=819273998)) print(load_info) diff --git a/tests/cli/cases/deploy_pipeline/dummy_pipeline.py b/tests/cli/cases/deploy_pipeline/dummy_pipeline.py index 48e13c35cd..f78e1b2b81 100644 --- a/tests/cli/cases/deploy_pipeline/dummy_pipeline.py +++ b/tests/cli/cases/deploy_pipeline/dummy_pipeline.py @@ -7,14 +7,12 @@ def example_resource(api_url=dlt.config.value, api_key=dlt.secrets.value, last_i @dlt.source -def example_source(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id = 0): +def example_source(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id=0): # return all the resources to be loaded return example_resource(api_url, api_key, last_id) -if __name__ == '__main__': +if __name__ == "__main__": p = dlt.pipeline(pipeline_name="dummy_pipeline", destination="dummy") - load_info = p.run( - example_source(last_id=819273998) - ) + load_info = p.run(example_source(last_id=819273998)) print(load_info) diff --git a/tests/cli/common/test_cli_invoke.py b/tests/cli/common/test_cli_invoke.py index e3a7676ad1..d367a97261 100644 --- a/tests/cli/common/test_cli_invoke.py +++ b/tests/cli/common/test_cli_invoke.py @@ -18,40 +18,40 @@ def test_invoke_basic(script_runner: ScriptRunner) -> None: - result = script_runner.run(['dlt', '--version']) + result = script_runner.run(["dlt", "--version"]) assert result.returncode == 0 assert result.stdout.startswith("dlt ") - assert result.stderr == '' + assert result.stderr == "" - result = script_runner.run(['dlt', '--version'], shell=True) + result = script_runner.run(["dlt", "--version"], shell=True) assert 
result.returncode == 0 assert result.stdout.startswith("dlt ") - assert result.stderr == '' + assert result.stderr == "" for command in BASE_COMMANDS: - result = script_runner.run(['dlt', command, '--help']) + result = script_runner.run(["dlt", command, "--help"]) assert result.returncode == 0 assert result.stdout.startswith(f"usage: dlt {command}") - result = script_runner.run(['dlt', "N/A", '--help']) + result = script_runner.run(["dlt", "N/A", "--help"]) assert result.returncode != 0 def test_invoke_list_pipelines(script_runner: ScriptRunner) -> None: - result = script_runner.run(['dlt', 'pipeline', '--list-pipelines']) + result = script_runner.run(["dlt", "pipeline", "--list-pipelines"]) # directory does not exist (we point to TEST_STORAGE) assert result.returncode == -2 # create empty os.makedirs(get_dlt_pipelines_dir()) - result = script_runner.run(['dlt', 'pipeline', '--list-pipelines']) + result = script_runner.run(["dlt", "pipeline", "--list-pipelines"]) assert result.returncode == 0 assert "No pipelines found in" in result.stdout def test_invoke_pipeline(script_runner: ScriptRunner) -> None: # info on non existing pipeline - result = script_runner.run(['dlt', 'pipeline', 'debug_pipeline', 'info']) + result = script_runner.run(["dlt", "pipeline", "debug_pipeline", "info"]) assert result.returncode == -1 assert "the pipeline was not found in" in result.stderr @@ -66,25 +66,30 @@ def test_invoke_pipeline(script_runner: ScriptRunner) -> None: venv = Venv.restore_current() venv.run_script("dummy_pipeline.py") # we check output test_pipeline_command else - result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'info']) + result = script_runner.run(["dlt", "pipeline", "dummy_pipeline", "info"]) assert result.returncode == 0 - result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'trace']) + result = script_runner.run(["dlt", "pipeline", "dummy_pipeline", "trace"]) assert result.returncode == 0 - result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'failed-jobs']) + result = script_runner.run(["dlt", "pipeline", "dummy_pipeline", "failed-jobs"]) assert result.returncode == 0 - result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'load-package']) + result = script_runner.run(["dlt", "pipeline", "dummy_pipeline", "load-package"]) assert result.returncode == 0 - result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'load-package', "NON EXISTENT"]) + result = script_runner.run( + ["dlt", "pipeline", "dummy_pipeline", "load-package", "NON EXISTENT"] + ) assert result.returncode == -2 try: # use debug flag to raise an exception - result = script_runner.run(['dlt', '--debug', 'pipeline', 'dummy_pipeline', 'load-package', "NON EXISTENT"]) + result = script_runner.run( + ["dlt", "--debug", "pipeline", "dummy_pipeline", "load-package", "NON EXISTENT"] + ) # exception terminates command assert result.returncode == 1 assert "LoadPackageNotFound" in result.stderr finally: # reset debug flag so other tests may pass from dlt.cli import _dlt + _dlt.DEBUG_FLAG = False @@ -92,17 +97,17 @@ def test_invoke_init_chess_and_template(script_runner: ScriptRunner) -> None: with set_working_dir(TEST_STORAGE_ROOT): # store dlt data in test storage (like patch_home_dir) with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): - result = script_runner.run(['dlt', 'init', 'chess', 'dummy']) + result = script_runner.run(["dlt", "init", "chess", "dummy"]) assert "Verified source chess was added to your project!" 
in result.stdout assert result.returncode == 0 - result = script_runner.run(['dlt', 'init', 'debug_pipeline', 'dummy']) + result = script_runner.run(["dlt", "init", "debug_pipeline", "dummy"]) assert "Your new pipeline debug_pipeline is ready to be customized!" in result.stdout assert result.returncode == 0 def test_invoke_list_verified_sources(script_runner: ScriptRunner) -> None: known_sources = ["chess", "sql_database", "google_sheets", "pipedrive"] - result = script_runner.run(['dlt', 'init', '--list-verified-sources']) + result = script_runner.run(["dlt", "init", "--list-verified-sources"]) assert result.returncode == 0 for known_source in known_sources: assert known_source in result.stdout @@ -112,25 +117,31 @@ def test_invoke_deploy_project(script_runner: ScriptRunner) -> None: with set_working_dir(TEST_STORAGE_ROOT): # store dlt data in test storage (like patch_home_dir) with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): - result = script_runner.run(['dlt', 'deploy', 'debug_pipeline.py', 'github-action', '--schedule', '@daily']) + result = script_runner.run( + ["dlt", "deploy", "debug_pipeline.py", "github-action", "--schedule", "@daily"] + ) assert result.returncode == -4 assert "The pipeline script does not exist" in result.stderr - result = script_runner.run(['dlt', 'deploy', 'debug_pipeline.py', 'airflow-composer']) + result = script_runner.run(["dlt", "deploy", "debug_pipeline.py", "airflow-composer"]) assert result.returncode == -4 assert "The pipeline script does not exist" in result.stderr # now init - result = script_runner.run(['dlt', 'init', 'chess', 'dummy']) + result = script_runner.run(["dlt", "init", "chess", "dummy"]) assert result.returncode == 0 - result = script_runner.run(['dlt', 'deploy', 'chess_pipeline.py', 'github-action', '--schedule', '@daily']) + result = script_runner.run( + ["dlt", "deploy", "chess_pipeline.py", "github-action", "--schedule", "@daily"] + ) assert "NOTE: You must run the pipeline locally" in result.stdout - result = script_runner.run(['dlt', 'deploy', 'chess_pipeline.py', 'airflow-composer']) + result = script_runner.run(["dlt", "deploy", "chess_pipeline.py", "airflow-composer"]) assert "NOTE: You must run the pipeline locally" in result.stdout def test_invoke_deploy_mock(script_runner: ScriptRunner) -> None: # NOTE: you can mock only once per test with ScriptRunner !! 
with patch("dlt.cli.deploy_command.deploy_command") as _deploy_command: - script_runner.run(['dlt', 'deploy', 'debug_pipeline.py', 'github-action', '--schedule', '@daily']) + script_runner.run( + ["dlt", "deploy", "debug_pipeline.py", "github-action", "--schedule", "@daily"] + ) assert _deploy_command.called assert _deploy_command.call_args[1] == { "pipeline_script_path": "debug_pipeline.py", @@ -140,11 +151,25 @@ def test_invoke_deploy_mock(script_runner: ScriptRunner) -> None: "command": "deploy", "schedule": "@daily", "run_manually": True, - "run_on_push": False + "run_on_push": False, } _deploy_command.reset_mock() - script_runner.run(['dlt', 'deploy', 'debug_pipeline.py', 'github-action', '--schedule', '@daily', '--location', 'folder', '--branch', 'branch', '--run-on-push']) + script_runner.run( + [ + "dlt", + "deploy", + "debug_pipeline.py", + "github-action", + "--schedule", + "@daily", + "--location", + "folder", + "--branch", + "branch", + "--run-on-push", + ] + ) assert _deploy_command.called assert _deploy_command.call_args[1] == { "pipeline_script_path": "debug_pipeline.py", @@ -154,17 +179,17 @@ def test_invoke_deploy_mock(script_runner: ScriptRunner) -> None: "command": "deploy", "schedule": "@daily", "run_manually": True, - "run_on_push": True + "run_on_push": True, } # no schedule fails _deploy_command.reset_mock() - result = script_runner.run(['dlt', 'deploy', 'debug_pipeline.py', 'github-action']) + result = script_runner.run(["dlt", "deploy", "debug_pipeline.py", "github-action"]) assert not _deploy_command.called assert result.returncode != 0 assert "the following arguments are required: --schedule" in result.stderr # airflow without schedule works _deploy_command.reset_mock() - result = script_runner.run(['dlt', 'deploy', 'debug_pipeline.py', 'airflow-composer']) + result = script_runner.run(["dlt", "deploy", "debug_pipeline.py", "airflow-composer"]) assert _deploy_command.called assert result.returncode == 0 assert _deploy_command.call_args[1] == { @@ -173,11 +198,13 @@ def test_invoke_deploy_mock(script_runner: ScriptRunner) -> None: "repo_location": "https://github.com/dlt-hub/dlt-deploy-template.git", "branch": None, "command": "deploy", - 'secrets_format': 'toml' + "secrets_format": "toml", } # env secrets format _deploy_command.reset_mock() - result = script_runner.run(['dlt', 'deploy', 'debug_pipeline.py', 'airflow-composer', "--secrets-format", "env"]) + result = script_runner.run( + ["dlt", "deploy", "debug_pipeline.py", "airflow-composer", "--secrets-format", "env"] + ) assert _deploy_command.called assert result.returncode == 0 assert _deploy_command.call_args[1] == { @@ -186,5 +213,5 @@ def test_invoke_deploy_mock(script_runner: ScriptRunner) -> None: "repo_location": "https://github.com/dlt-hub/dlt-deploy-template.git", "branch": None, "command": "deploy", - 'secrets_format': 'env' + "secrets_format": "env", } diff --git a/tests/cli/common/test_telemetry_command.py b/tests/cli/common/test_telemetry_command.py index 4a3a0f4be1..076a9c3749 100644 --- a/tests/cli/common/test_telemetry_command.py +++ b/tests/cli/common/test_telemetry_command.py @@ -30,7 +30,12 @@ def _initial_providers(): glob_ctx = ConfigProvidersContext() glob_ctx.providers = _initial_providers() - with set_working_dir(test_storage.make_full_path("project")), Container().injectable_context(glob_ctx), patch("dlt.common.configuration.specs.config_providers_context.ConfigProvidersContext.initial_providers", _initial_providers): + with set_working_dir(test_storage.make_full_path("project")), 
Container().injectable_context( + glob_ctx + ), patch( + "dlt.common.configuration.specs.config_providers_context.ConfigProvidersContext.initial_providers", + _initial_providers, + ): # no config files: status is ON with io.StringIO() as buf, contextlib.redirect_stdout(buf): telemetry_status_command() @@ -75,7 +80,6 @@ def _initial_providers(): def test_command_instrumentation() -> None: - @track_command("instrument_ok", False, "in_ok_param", "in_ok_param_2") def instrument_ok(in_ok_param: str, in_ok_param_2: int) -> int: return 0 @@ -126,7 +130,15 @@ def instrument_raises_2(in_raises_2: bool) -> int: def test_instrumentation_wrappers() -> None: - from dlt.cli._dlt import init_command_wrapper, list_verified_sources_command_wrapper, DEFAULT_VERIFIED_SOURCES_REPO, pipeline_command_wrapper, deploy_command_wrapper, COMMAND_DEPLOY_REPO_LOCATION, DeploymentMethods + from dlt.cli._dlt import ( + init_command_wrapper, + list_verified_sources_command_wrapper, + DEFAULT_VERIFIED_SOURCES_REPO, + pipeline_command_wrapper, + deploy_command_wrapper, + COMMAND_DEPLOY_REPO_LOCATION, + DeploymentMethods, + ) from dlt.common.exceptions import UnknownDestinationModule with patch("dlt.common.runtime.segment.before_send", _mock_before_send): @@ -155,16 +167,22 @@ def test_instrumentation_wrappers() -> None: # assert msg["properties"]["operation"] == "list" SENT_ITEMS.clear() - deploy_command_wrapper("list.py", DeploymentMethods.github_actions.value, COMMAND_DEPLOY_REPO_LOCATION, schedule="* * * * *") + deploy_command_wrapper( + "list.py", + DeploymentMethods.github_actions.value, + COMMAND_DEPLOY_REPO_LOCATION, + schedule="* * * * *", + ) msg = SENT_ITEMS[0] assert msg["event"] == "command_deploy" assert msg["properties"]["deployment_method"] == DeploymentMethods.github_actions.value assert msg["properties"]["success"] is False - SENT_ITEMS = [] + + def _mock_before_send(event: DictStrAny, _unused_hint: Any = None) -> DictStrAny: SENT_ITEMS.append(event) # do not send this - return None \ No newline at end of file + return None diff --git a/tests/cli/conftest.py b/tests/cli/conftest.py index e3a47f6202..78efcd03c4 100644 --- a/tests/cli/conftest.py +++ b/tests/cli/conftest.py @@ -1 +1 @@ -from tests.utils import preserve_environ, autouse_test_storage, unload_modules, wipe_pipeline \ No newline at end of file +from tests.utils import preserve_environ, autouse_test_storage, unload_modules, wipe_pipeline diff --git a/tests/cli/test_config_toml_writer.py b/tests/cli/test_config_toml_writer.py index 5d08b23c05..8ccac21f99 100644 --- a/tests/cli/test_config_toml_writer.py +++ b/tests/cli/test_config_toml_writer.py @@ -15,10 +15,24 @@ def example_toml(): def test_write_value(example_toml): toml_table = example_toml - write_value(toml_table, "species", str, overwrite_existing=True, default_value="Homo sapiens", is_default_of_interest=True) + write_value( + toml_table, + "species", + str, + overwrite_existing=True, + default_value="Homo sapiens", + is_default_of_interest=True, + ) assert toml_table["species"] == "Homo sapiens" - write_value(toml_table, "species", str, overwrite_existing=False, default_value="Mus musculus", is_default_of_interest=True) + write_value( + toml_table, + "species", + str, + overwrite_existing=False, + default_value="Mus musculus", + is_default_of_interest=True, + ) assert toml_table["species"] == "Homo sapiens" # Test with is_default_of_interest=True and non-optional, non-final hint @@ -26,24 +40,42 @@ def test_write_value(example_toml): assert toml_table["species"] == "species" # Test with 
is_default_of_interest=False and non-optional, non-final hint, and no default - write_value(toml_table, "population", int, overwrite_existing=True, is_default_of_interest=False) + write_value( + toml_table, "population", int, overwrite_existing=True, is_default_of_interest=False + ) # non default get typed example value assert "population" in toml_table # Test with optional hint - write_value(toml_table, "habitat", Optional[str], overwrite_existing=True, is_default_of_interest=False) + write_value( + toml_table, "habitat", Optional[str], overwrite_existing=True, is_default_of_interest=False + ) assert "habitat" not in toml_table # test with optional hint of interest - write_value(toml_table, "habitat", Optional[str], overwrite_existing=True, is_default_of_interest=True) + write_value( + toml_table, "habitat", Optional[str], overwrite_existing=True, is_default_of_interest=True + ) assert "habitat" in toml_table # Test with final hint - write_value(toml_table, "immutable_trait", Final[str], overwrite_existing=True, is_default_of_interest=False) + write_value( + toml_table, + "immutable_trait", + Final[str], + overwrite_existing=True, + is_default_of_interest=False, + ) assert "immutable_trait" not in toml_table # Test with final hint of interest - write_value(toml_table, "immutable_trait", Final[str], overwrite_existing=True, is_default_of_interest=True) + write_value( + toml_table, + "immutable_trait", + Final[str], + overwrite_existing=True, + is_default_of_interest=True, + ) assert "immutable_trait" in toml_table @@ -61,7 +93,9 @@ def test_write_values(example_toml): new_values = [ WritableConfigValue("species", str, "Canis lupus", ("taxonomy", "genus")), - WritableConfigValue("species", str, "Canis lupus familiaris", ("taxonomy", "genus", "subgenus")), + WritableConfigValue( + "species", str, "Canis lupus familiaris", ("taxonomy", "genus", "subgenus") + ), WritableConfigValue("genome_size", float, 2.8, ("genomic_info",)), ] write_values(example_toml, new_values, overwrite_existing=False) @@ -118,7 +152,10 @@ def test_write_values_without_defaults(example_toml): assert example_toml["animal_info"]["is_animal"] is True assert example_toml["genomic_info"]["chromosome_data"]["chromosomes"] == ["a", "b", "c"] - assert example_toml["genomic_info"]["chromosome_data"]["chromosomes"].trivia.comment == EXAMPLE_COMMENT + assert ( + example_toml["genomic_info"]["chromosome_data"]["chromosomes"].trivia.comment + == EXAMPLE_COMMENT + ) assert example_toml["genomic_info"]["gene_data"]["genes"] == {"key": "value"} - assert example_toml["genomic_info"]["gene_data"]["genes"].trivia.comment == EXAMPLE_COMMENT \ No newline at end of file + assert example_toml["genomic_info"]["gene_data"]["genes"].trivia.comment == EXAMPLE_COMMENT diff --git a/tests/cli/test_deploy_command.py b/tests/cli/test_deploy_command.py index de84c5c307..685921ca6e 100644 --- a/tests/cli/test_deploy_command.py +++ b/tests/cli/test_deploy_command.py @@ -26,26 +26,40 @@ ("github-action", {"schedule": "*/30 * * * *", "run_on_push": True, "run_manually": True}), ("airflow-composer", {"secrets_format": "toml"}), ("airflow-composer", {"secrets_format": "env"}), - ] +] @pytest.mark.parametrize("deployment_method,deployment_args", DEPLOY_PARAMS) -def test_deploy_command_no_repo(test_storage: FileStorage, deployment_method: str, deployment_args: StrAny) -> None: +def test_deploy_command_no_repo( + test_storage: FileStorage, deployment_method: str, deployment_args: StrAny +) -> None: pipeline_wf = tempfile.mkdtemp() 
shutil.copytree("tests/cli/cases/deploy_pipeline", pipeline_wf, dirs_exist_ok=True) with set_working_dir(pipeline_wf): # we do not have repo with pytest.raises(InvalidGitRepositoryError): - deploy_command.deploy_command("debug_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + deploy_command.deploy_command( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) # test wrapper - rc = _dlt.deploy_command_wrapper("debug_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + rc = _dlt.deploy_command_wrapper( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) assert rc == -3 @pytest.mark.parametrize("deployment_method,deployment_args", DEPLOY_PARAMS) -def test_deploy_command(test_storage: FileStorage, deployment_method: str, deployment_args: StrAny) -> None: +def test_deploy_command( + test_storage: FileStorage, deployment_method: str, deployment_args: StrAny +) -> None: # drop pipeline p = dlt.pipeline(pipeline_name="debug_pipeline") p._wipe_working_folder() @@ -59,16 +73,36 @@ def test_deploy_command(test_storage: FileStorage, deployment_method: str, deplo with Repo.init(".") as repo: # test no origin with pytest.raises(CliCommandException) as py_ex: - deploy_command.deploy_command("debug_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + deploy_command.deploy_command( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) assert "Your current repository has no origin set" in py_ex.value.args[0] - rc = _dlt.deploy_command_wrapper("debug_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + rc = _dlt.deploy_command_wrapper( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) assert rc == -5 # we have a repo that was never run Remote.create(repo, "origin", "git@github.com:rudolfix/dlt-cmd-test-2.git") with pytest.raises(CannotRestorePipelineException): - deploy_command.deploy_command("debug_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) - rc = _dlt.deploy_command_wrapper("debug_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + deploy_command.deploy_command( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) + rc = _dlt.deploy_command_wrapper( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) assert rc == -2 # run the script with wrong credentials (it is postgres there) @@ -80,9 +114,19 @@ def test_deploy_command(test_storage: FileStorage, deployment_method: str, deplo venv.run_script("debug_pipeline.py") # print(py_ex.value.output) with pytest.raises(deploy_command.PipelineWasNotRun) as py_ex2: - deploy_command.deploy_command("debug_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + deploy_command.deploy_command( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) assert "The last pipeline run ended with error" in py_ex2.value.args[0] - rc = _dlt.deploy_command_wrapper("debug_pipeline.py", deployment_method, 
deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + rc = _dlt.deploy_command_wrapper( + "debug_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) assert rc == -2 os.environ["DESTINATION__POSTGRES__CREDENTIALS"] = pg_credentials @@ -103,8 +147,8 @@ def test_deploy_command(test_storage: FileStorage, deployment_method: str, deplo _out = buf.getvalue() print(_out) # make sure our secret and config values are all present - assert 'api_key_9x3ehash' in _out - assert 'dlt_data' in _out + assert "api_key_9x3ehash" in _out + assert "dlt_data" in _out if "schedule" in deployment_args: assert get_schedule_description(deployment_args["schedule"]) secrets_format = deployment_args.get("secrets_format", "env") @@ -115,8 +159,17 @@ def test_deploy_command(test_storage: FileStorage, deployment_method: str, deplo # non existing script name with pytest.raises(NoSuchPathError): - deploy_command.deploy_command("no_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + deploy_command.deploy_command( + "no_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) with echo.always_choose(False, always_choose_value=True): - rc = _dlt.deploy_command_wrapper("no_pipeline.py", deployment_method, deploy_command.COMMAND_DEPLOY_REPO_LOCATION, **deployment_args) + rc = _dlt.deploy_command_wrapper( + "no_pipeline.py", + deployment_method, + deploy_command.COMMAND_DEPLOY_REPO_LOCATION, + **deployment_args + ) assert rc == -4 - diff --git a/tests/cli/test_init_command.py b/tests/cli/test_init_command.py index 5dd24f4aaf..532a14957c 100644 --- a/tests/cli/test_init_command.py +++ b/tests/cli/test_init_command.py @@ -24,13 +24,25 @@ from dlt.cli import init_command, echo -from dlt.cli.init_command import SOURCES_MODULE_NAME, utils as cli_utils, files_ops, _select_source_files +from dlt.cli.init_command import ( + SOURCES_MODULE_NAME, + utils as cli_utils, + files_ops, + _select_source_files, +) from dlt.cli.exceptions import CliCommandException from dlt.cli.requirements import SourceRequirements from dlt.reflection.script_visitor import PipelineScriptVisitor from dlt.reflection import names as n -from tests.cli.utils import echo_default_choice, repo_dir, project_files, cloned_init_repo, get_repo_dir, get_project_files +from tests.cli.utils import ( + echo_default_choice, + repo_dir, + project_files, + cloned_init_repo, + get_repo_dir, + get_project_files, +) from tests.common.utils import modify_and_commit_file from tests.utils import IMPLEMENTED_DESTINATIONS, clean_test_storage @@ -83,7 +95,9 @@ def test_init_command_chess_verified_source(repo_dir: str, project_files: FileSt print(e) # now run the pipeline - os.environ.pop("DESTINATION__DUCKDB__CREDENTIALS", None) # settings from local project (secrets.toml etc.) + os.environ.pop( + "DESTINATION__DUCKDB__CREDENTIALS", None + ) # settings from local project (secrets.toml etc.) 
venv = Venv.restore_current() try: print(venv.run_script("chess_pipeline.py")) @@ -105,7 +119,9 @@ def test_init_list_verified_pipelines(repo_dir: str, project_files: FileStorage) init_command.list_verified_sources_command(repo_dir) -def test_init_list_verified_pipelines_update_warning(repo_dir: str, project_files: FileStorage) -> None: +def test_init_list_verified_pipelines_update_warning( + repo_dir: str, project_files: FileStorage +) -> None: """Sources listed include a warning if a different dlt version is required""" with mock.patch.object(SourceRequirements, "current_dlt_version", return_value="0.0.1"): with io.StringIO() as buf, contextlib.redirect_stdout(buf): @@ -121,7 +137,7 @@ def test_init_list_verified_pipelines_update_warning(repo_dir: str, project_file assert match # Try parsing the printed requiremnt string to verify it's valid parsed_requirement = Requirement(match.group(1)) - assert '0.0.1' not in parsed_requirement.specifier + assert "0.0.1" not in parsed_requirement.specifier def test_init_all_verified_sources_together(repo_dir: str, project_files: FileStorage) -> None: @@ -166,8 +182,10 @@ def test_init_all_verified_sources_isolated(cloned_init_repo: FileStorage) -> No assert_index_version_constraint(files, candidate) -@pytest.mark.parametrize('destination_name', IMPLEMENTED_DESTINATIONS) -def test_init_all_destinations(destination_name: str, project_files: FileStorage, repo_dir: str) -> None: +@pytest.mark.parametrize("destination_name", IMPLEMENTED_DESTINATIONS) +def test_init_all_destinations( + destination_name: str, project_files: FileStorage, repo_dir: str +) -> None: pipeline_name = f"generic_{destination_name}_pipeline" init_command.init_command(pipeline_name, destination_name, True, repo_dir) assert_init_files(project_files, pipeline_name, destination_name) @@ -189,7 +207,9 @@ def test_init_code_update_index_diff(repo_dir: str, project_files: FileStorage) sources_storage.delete(del_file_path) source_files = files_ops.get_verified_source_files(sources_storage, "pipedrive") - remote_index = files_ops.get_remote_source_index(sources_storage.storage_path, source_files.files, ">=0.3.5") + remote_index = files_ops.get_remote_source_index( + sources_storage.storage_path, source_files.files, ">=0.3.5" + ) assert mod_file_path in remote_index["files"] assert remote_index["is_dirty"] is True assert remote_index["files"][mod_file_path]["sha3_256"] == new_content_hash @@ -200,7 +220,7 @@ def test_init_code_update_index_diff(repo_dir: str, project_files: FileStorage) new, modified, deleted = files_ops.gen_index_diff(local_index, remote_index) # remote file entry in new assert new[new_file_path] == remote_index["files"][new_file_path] - #no git sha yet + # no git sha yet assert new[new_file_path]["git_sha"] is None # remote file entry in modified assert modified[mod_file_path] == remote_index["files"][mod_file_path] @@ -210,7 +230,9 @@ def test_init_code_update_index_diff(repo_dir: str, project_files: FileStorage) assert deleted[del_file_path] == local_index["files"][del_file_path] # get conflicts - conflict_modified, conflict_deleted = files_ops.find_conflict_files(local_index, new, modified, deleted, project_files) + conflict_modified, conflict_deleted = files_ops.find_conflict_files( + local_index, new, modified, deleted, project_files + ) assert conflict_modified == [] assert conflict_deleted == [] @@ -231,30 +253,40 @@ def test_init_code_update_index_diff(repo_dir: str, project_files: FileStorage) sources_storage.save(mod_file_path_2, local_content) local_index = 
files_ops.load_verified_sources_local_index("pipedrive") source_files = files_ops.get_verified_source_files(sources_storage, "pipedrive") - remote_index = files_ops.get_remote_source_index(sources_storage.storage_path, source_files.files, ">=0.3.5") + remote_index = files_ops.get_remote_source_index( + sources_storage.storage_path, source_files.files, ">=0.3.5" + ) new, modified, deleted = files_ops.gen_index_diff(local_index, remote_index) assert mod_file_path_2 in new - conflict_modified, conflict_deleted = files_ops.find_conflict_files(local_index, new, modified, deleted, project_files) + conflict_modified, conflict_deleted = files_ops.find_conflict_files( + local_index, new, modified, deleted, project_files + ) assert set(conflict_modified) == set([mod_file_path, new_file_path]) assert set(conflict_deleted) == set([del_file_path]) modified.update(new) # resolve conflicts in three different ways # skip option (the default) - res, sel_modified, sel_deleted = _select_source_files("pipedrive", deepcopy(modified), deepcopy(deleted), conflict_modified, conflict_deleted) + res, sel_modified, sel_deleted = _select_source_files( + "pipedrive", deepcopy(modified), deepcopy(deleted), conflict_modified, conflict_deleted + ) # noting is written, including non-conflicting file assert res == "s" assert sel_modified == {} assert sel_deleted == {} # Apply option - local changes will be lost with echo.always_choose(False, "a"): - res, sel_modified, sel_deleted = _select_source_files("pipedrive", deepcopy(modified), deepcopy(deleted), conflict_modified, conflict_deleted) + res, sel_modified, sel_deleted = _select_source_files( + "pipedrive", deepcopy(modified), deepcopy(deleted), conflict_modified, conflict_deleted + ) assert res == "a" assert sel_modified == modified assert sel_deleted == deleted # merge only non conflicting changes are applied with echo.always_choose(False, "m"): - res, sel_modified, sel_deleted = _select_source_files("pipedrive", deepcopy(modified), deepcopy(deleted), conflict_modified, conflict_deleted) + res, sel_modified, sel_deleted = _select_source_files( + "pipedrive", deepcopy(modified), deepcopy(deleted), conflict_modified, conflict_deleted + ) assert res == "m" assert len(sel_modified) == 1 and mod_file_path_2 in sel_modified assert sel_deleted == {} @@ -264,18 +296,26 @@ def test_init_code_update_index_diff(repo_dir: str, project_files: FileStorage) sources_storage.save(mod_file_path, local_content) project_files.delete(del_file_path) source_files = files_ops.get_verified_source_files(sources_storage, "pipedrive") - remote_index = files_ops.get_remote_source_index(sources_storage.storage_path, source_files.files, ">=0.3.5") + remote_index = files_ops.get_remote_source_index( + sources_storage.storage_path, source_files.files, ">=0.3.5" + ) new, modified, deleted = files_ops.gen_index_diff(local_index, remote_index) - conflict_modified, conflict_deleted = files_ops.find_conflict_files(local_index, new, modified, deleted, project_files) + conflict_modified, conflict_deleted = files_ops.find_conflict_files( + local_index, new, modified, deleted, project_files + ) assert conflict_modified == [] assert conflict_deleted == [] # generate a conflict by deleting file locally that is modified on remote project_files.delete(mod_file_path) source_files = files_ops.get_verified_source_files(sources_storage, "pipedrive") - remote_index = files_ops.get_remote_source_index(sources_storage.storage_path, source_files.files, ">=0.3.5") + remote_index = files_ops.get_remote_source_index( + 
sources_storage.storage_path, source_files.files, ">=0.3.5" + ) new, modified, deleted = files_ops.gen_index_diff(local_index, remote_index) - conflict_modified, conflict_deleted = files_ops.find_conflict_files(local_index, new, modified, deleted, project_files) + conflict_modified, conflict_deleted = files_ops.find_conflict_files( + local_index, new, modified, deleted, project_files + ) assert conflict_modified == [mod_file_path] @@ -306,8 +346,14 @@ def test_init_code_update_no_conflict(repo_dir: str, project_files: FileStorage) assert new_local_index["is_dirty"] is False assert new_local_index["last_commit_sha"] == commit.hexsha assert new_local_index["files"][mod_local_path]["commit_sha"] == commit.hexsha - assert new_local_index["files"][mod_local_path]["sha3_256"] == hashlib.sha3_256(bytes(new_content, encoding="ascii")).hexdigest() - assert new_local_index["files"][mod_local_path]["git_sha"] != local_index["files"][mod_local_path]["git_sha"] + assert ( + new_local_index["files"][mod_local_path]["sha3_256"] + == hashlib.sha3_256(bytes(new_content, encoding="ascii")).hexdigest() + ) + assert ( + new_local_index["files"][mod_local_path]["git_sha"] + != local_index["files"][mod_local_path]["git_sha"] + ) # all the other files must keep the old hashes for old_f, new_f in zip(local_index["files"].items(), new_local_index["files"].items()): # assert new_f[1]["commit_sha"] == commit.hexsha @@ -349,7 +395,9 @@ def test_init_code_update_no_conflict(repo_dir: str, project_files: FileStorage) @pytest.mark.parametrize("resolution", ["s", "a", "m"]) -def test_init_code_update_conflict(repo_dir: str, project_files: FileStorage, resolution: str) -> None: +def test_init_code_update_conflict( + repo_dir: str, project_files: FileStorage, resolution: str +) -> None: init_command.init_command("pipedrive", "duckdb", False, repo_dir) repo_storage = FileStorage(repo_dir) mod_local_path = os.path.join("pipedrive", "__init__.py") @@ -406,12 +454,16 @@ def test_init_requirements_text(repo_dir: str, project_files: FileStorage) -> No assert "pip3 install" in _out -def test_pipeline_template_sources_in_single_file(repo_dir: str, project_files: FileStorage) -> None: +def test_pipeline_template_sources_in_single_file( + repo_dir: str, project_files: FileStorage +) -> None: init_command.init_command("debug_pipeline", "bigquery", False, repo_dir) # _SOURCES now contains the sources from pipeline.py which simulates loading from two places with pytest.raises(CliCommandException) as cli_ex: init_command.init_command("generic_pipeline", "redshift", True, repo_dir) - assert "In init scripts you must declare all sources and resources in single file." in str(cli_ex.value) + assert "In init scripts you must declare all sources and resources in single file." in str( + cli_ex.value + ) def test_incompatible_dlt_version_warning(repo_dir: str, project_files: FileStorage) -> None: @@ -420,11 +472,18 @@ def test_incompatible_dlt_version_warning(repo_dir: str, project_files: FileStor init_command.init_command("facebook_ads", "bigquery", False, repo_dir) _out = buf.getvalue() - assert "WARNING: This pipeline requires a newer version of dlt than your installed version (0.1.1)." in _out + assert ( + "WARNING: This pipeline requires a newer version of dlt than your installed version" + " (0.1.1)." 
+ in _out + ) def assert_init_files( - project_files: FileStorage, pipeline_name: str, destination_name: str, dependency_destination: Optional[str] = None + project_files: FileStorage, + pipeline_name: str, + destination_name: str, + dependency_destination: Optional[str] = None, ) -> PipelineScriptVisitor: visitor, _ = assert_common_files(project_files, pipeline_name + ".py", destination_name) assert not project_files.has_folder(pipeline_name) @@ -437,7 +496,9 @@ def assert_requirements_txt(project_files: FileStorage, destination_name: str) - assert project_files.has_file(cli_utils.REQUIREMENTS_TXT) assert "dlt" in project_files.load(cli_utils.REQUIREMENTS_TXT) # dlt dependency specifies destination_name as extra - source_requirements = SourceRequirements.from_string(project_files.load(cli_utils.REQUIREMENTS_TXT)) + source_requirements = SourceRequirements.from_string( + project_files.load(cli_utils.REQUIREMENTS_TXT) + ) assert destination_name in source_requirements.dlt_requirement.extras # Check that atleast some version range is specified assert len(source_requirements.dlt_requirement.specifier) >= 1 @@ -447,11 +508,23 @@ def assert_index_version_constraint(project_files: FileStorage, source_name: str # check dlt version constraint in .sources index for given source matches the one in requirements.txt local_index = files_ops.load_verified_sources_local_index(source_name) index_constraint = local_index["dlt_version_constraint"] - assert index_constraint == SourceRequirements.from_string(project_files.load(cli_utils.REQUIREMENTS_TXT)).dlt_version_constraint() - - -def assert_source_files(project_files: FileStorage, source_name: str, destination_name: str, has_source_section: bool = True) -> Tuple[PipelineScriptVisitor, SecretsTomlProvider]: - visitor, secrets = assert_common_files(project_files, source_name + "_pipeline.py", destination_name) + assert ( + index_constraint + == SourceRequirements.from_string( + project_files.load(cli_utils.REQUIREMENTS_TXT) + ).dlt_version_constraint() + ) + + +def assert_source_files( + project_files: FileStorage, + source_name: str, + destination_name: str, + has_source_section: bool = True, +) -> Tuple[PipelineScriptVisitor, SecretsTomlProvider]: + visitor, secrets = assert_common_files( + project_files, source_name + "_pipeline.py", destination_name + ) assert project_files.has_folder(source_name) source_secrets = secrets.get_value(source_name, type, None, source_name) if has_source_section: @@ -472,7 +545,9 @@ def assert_source_files(project_files: FileStorage, source_name: str, destinatio return visitor, secrets -def assert_common_files(project_files: FileStorage, pipeline_script: str, destination_name: str) -> Tuple[PipelineScriptVisitor, SecretsTomlProvider]: +def assert_common_files( + project_files: FileStorage, pipeline_script: str, destination_name: str +) -> Tuple[PipelineScriptVisitor, SecretsTomlProvider]: # cwd must be project files - otherwise assert won't work assert os.getcwd() == project_files.storage_path assert project_files.has_file(make_dlt_settings_path(SECRETS_TOML)) @@ -480,7 +555,9 @@ def assert_common_files(project_files: FileStorage, pipeline_script: str, destin assert project_files.has_file(".gitignore") assert project_files.has_file(pipeline_script) # inspect script - visitor = cli_utils.parse_init_script("test", project_files.load(pipeline_script), pipeline_script) + visitor = cli_utils.parse_init_script( + "test", project_files.load(pipeline_script), pipeline_script + ) # check destinations for args in 
visitor.known_calls[n.PIPELINE]: assert args.arguments["destination"].value == destination_name @@ -490,7 +567,13 @@ def assert_common_files(project_files: FileStorage, pipeline_script: str, destin # destination is there assert secrets.get_value(destination_name, type, None, "destination") is not None # certain values are never there - for not_there in ["dataset_name", "destination_name", "default_schema_name", "as_staging", "staging_config"]: + for not_there in [ + "dataset_name", + "destination_name", + "default_schema_name", + "as_staging", + "staging_config", + ]: assert secrets.get_value(not_there, type, None, "destination", destination_name)[0] is None return visitor, secrets diff --git a/tests/cli/test_pipeline_command.py b/tests/cli/test_pipeline_command.py index 19bb5fa277..eba0e897c5 100644 --- a/tests/cli/test_pipeline_command.py +++ b/tests/cli/test_pipeline_command.py @@ -10,7 +10,14 @@ from dlt.cli import echo, init_command, pipeline_command -from tests.cli.utils import echo_default_choice, repo_dir, project_files, cloned_init_repo, get_repo_dir, get_project_files +from tests.cli.utils import ( + echo_default_choice, + repo_dir, + project_files, + cloned_init_repo, + get_repo_dir, + get_project_files, +) def test_pipeline_command_operations(repo_dir: str, project_files: FileStorage) -> None: @@ -24,7 +31,9 @@ def test_pipeline_command_operations(repo_dir: str, project_files: FileStorage) print(e) # now run the pipeline - os.environ.pop("DESTINATION__DUCKDB__CREDENTIALS", None) # settings from local project (secrets.toml etc.) + os.environ.pop( + "DESTINATION__DUCKDB__CREDENTIALS", None + ) # settings from local project (secrets.toml etc.) venv = Venv.restore_current() try: print(venv.run_script("chess_pipeline.py")) @@ -114,7 +123,9 @@ def test_pipeline_command_operations(repo_dir: str, project_files: FileStorage) with io.StringIO() as buf, contextlib.redirect_stdout(buf): with echo.always_choose(False, True): - pipeline_command.pipeline_command("drop", "chess_pipeline", None, 0, resources=["players_games"]) + pipeline_command.pipeline_command( + "drop", "chess_pipeline", None, 0, resources=["players_games"] + ) _out = buf.getvalue() assert "Selected resource(s): ['players_games']" in _out @@ -125,9 +136,17 @@ def test_pipeline_command_operations(repo_dir: str, project_files: FileStorage) with io.StringIO() as buf, contextlib.redirect_stdout(buf): # Test sync destination and drop when local state is missing - pipeline._pipeline_storage.delete_folder('', recursively=True) + pipeline._pipeline_storage.delete_folder("", recursively=True) with echo.always_choose(False, True): - pipeline_command.pipeline_command("drop", "chess_pipeline", None, 0, destination=pipeline.destination, dataset_name=pipeline.dataset_name, resources=["players_profiles"]) + pipeline_command.pipeline_command( + "drop", + "chess_pipeline", + None, + 0, + destination=pipeline.destination, + dataset_name=pipeline.dataset_name, + resources=["players_profiles"], + ) _out = buf.getvalue() assert "could not be restored: the pipeline was not found in " in _out @@ -192,18 +211,18 @@ def test_pipeline_command_drop_partial_loads(repo_dir: str, project_files: FileS pipeline_command.pipeline_command("info", "chess_pipeline", None, 1) _out = buf.getvalue() # one package is partially loaded - assert 'This package is partially loaded' in _out + assert "This package is partially loaded" in _out print(_out) with io.StringIO() as buf, contextlib.redirect_stdout(buf): with echo.always_choose(False, True): 
pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1) _out = buf.getvalue() - assert 'Pending packages deleted' in _out + assert "Pending packages deleted" in _out print(_out) with io.StringIO() as buf, contextlib.redirect_stdout(buf): pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1) _out = buf.getvalue() - assert 'No pending packages found' in _out - print(_out) \ No newline at end of file + assert "No pending packages found" in _out + print(_out) diff --git a/tests/cli/utils.py b/tests/cli/utils.py index eb3b4e3b84..56c614e3ae 100644 --- a/tests/cli/utils.py +++ b/tests/cli/utils.py @@ -30,7 +30,9 @@ def echo_default_choice() -> Iterator[None]: @pytest.fixture(scope="module") def cloned_init_repo() -> FileStorage: - return git.get_fresh_repo_files(INIT_REPO_LOCATION, get_dlt_repos_dir(), branch=INIT_REPO_BRANCH) + return git.get_fresh_repo_files( + INIT_REPO_LOCATION, get_dlt_repos_dir(), branch=INIT_REPO_BRANCH + ) @pytest.fixture @@ -46,7 +48,9 @@ def project_files() -> Iterator[FileStorage]: def get_repo_dir(cloned_init_repo: FileStorage) -> str: - repo_dir = os.path.abspath(os.path.join(TEST_STORAGE_ROOT, f"verified_sources_repo_{uniq_id()}")) + repo_dir = os.path.abspath( + os.path.join(TEST_STORAGE_ROOT, f"verified_sources_repo_{uniq_id()}") + ) # copy the whole repo into TEST_STORAGE_ROOT shutil.copytree(cloned_init_repo.storage_path, repo_dir) return repo_dir diff --git a/tests/common/cases/modules/uniq_mod_121.py b/tests/common/cases/modules/uniq_mod_121.py index 893d08d178..810eb35840 100644 --- a/tests/common/cases/modules/uniq_mod_121.py +++ b/tests/common/cases/modules/uniq_mod_121.py @@ -1,8 +1,10 @@ import inspect from dlt.common.utils import get_module_name + def find_my_module(): pass + if __name__ == "__main__": print(get_module_name(inspect.getmodule(find_my_module))) diff --git a/tests/common/configuration/test_accessors.py b/tests/common/configuration/test_accessors.py index e641afd22a..4fda3b27a9 100644 --- a/tests/common/configuration/test_accessors.py +++ b/tests/common/configuration/test_accessors.py @@ -6,9 +6,16 @@ from dlt.common import json from dlt.common.configuration.exceptions import ConfigFieldMissingException -from dlt.common.configuration.providers import EnvironProvider, ConfigTomlProvider, SecretsTomlProvider +from dlt.common.configuration.providers import ( + EnvironProvider, + ConfigTomlProvider, + SecretsTomlProvider, +) from dlt.common.configuration.resolve import resolve_configuration -from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults, ConnectionStringCredentials +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentialsWithoutDefaults, + ConnectionStringCredentials, +) from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.configuration.utils import get_resolved_traces, ResolvedValueTrace from dlt.common.runners.configuration import PoolRunnerConfiguration @@ -39,19 +46,29 @@ def test_getter_accessor(toml_providers: ConfigProvidersContext, environment: An environment["VALUE"] = "{SET" assert dlt.config["value"] == "{SET" - assert RESOLVED_TRACES[".value"] == ResolvedValueTrace("value", "{SET", None, AnyType, [], EnvironProvider().name, None) + assert RESOLVED_TRACES[".value"] == ResolvedValueTrace( + "value", "{SET", None, AnyType, [], EnvironProvider().name, None + ) assert dlt.secrets["value"] == "{SET" - assert RESOLVED_TRACES[".value"] == ResolvedValueTrace("value", 
"{SET", None, TSecretValue, [], EnvironProvider().name, None) + assert RESOLVED_TRACES[".value"] == ResolvedValueTrace( + "value", "{SET", None, TSecretValue, [], EnvironProvider().name, None + ) # get sectioned values assert dlt.config["typecheck.str_val"] == "test string" - assert RESOLVED_TRACES["typecheck.str_val"] == ResolvedValueTrace("str_val", "test string", None, AnyType, ["typecheck"], ConfigTomlProvider().name, None) + assert RESOLVED_TRACES["typecheck.str_val"] == ResolvedValueTrace( + "str_val", "test string", None, AnyType, ["typecheck"], ConfigTomlProvider().name, None + ) environment["DLT__THIS__VALUE"] = "embedded" assert dlt.config["dlt.this.value"] == "embedded" - assert RESOLVED_TRACES["dlt.this.value"] == ResolvedValueTrace("value", "embedded", None, AnyType, ["dlt", "this"], EnvironProvider().name, None) + assert RESOLVED_TRACES["dlt.this.value"] == ResolvedValueTrace( + "value", "embedded", None, AnyType, ["dlt", "this"], EnvironProvider().name, None + ) assert dlt.secrets["dlt.this.value"] == "embedded" - assert RESOLVED_TRACES["dlt.this.value"] == ResolvedValueTrace("value", "embedded", None, TSecretValue, ["dlt", "this"], EnvironProvider().name, None) + assert RESOLVED_TRACES["dlt.this.value"] == ResolvedValueTrace( + "value", "embedded", None, TSecretValue, ["dlt", "this"], EnvironProvider().name, None + ) def test_getter_auto_cast(toml_providers: ConfigProvidersContext, environment: Any) -> None: @@ -83,7 +100,7 @@ def test_getter_auto_cast(toml_providers: ConfigProvidersContext, environment: A assert dlt.config["value"] == {"a": 1} assert dlt.config["value"]["a"] == 1 # if not dict or list then original string must be returned, null is a JSON -> None - environment["VALUE"] = 'null' + environment["VALUE"] = "null" assert dlt.config["value"] == "null" # typed values are returned as they are @@ -91,11 +108,32 @@ def test_getter_auto_cast(toml_providers: ConfigProvidersContext, environment: A # access dict from toml services_json_dict = dlt.secrets["destination.bigquery"] - assert dlt.secrets["destination.bigquery"]["client_email"] == "loader@a7513.iam.gserviceaccount.com" - assert RESOLVED_TRACES["destination.bigquery"] == ResolvedValueTrace("bigquery", services_json_dict, None, TSecretValue, ["destination"], SecretsTomlProvider().name, None) + assert ( + dlt.secrets["destination.bigquery"]["client_email"] + == "loader@a7513.iam.gserviceaccount.com" + ) + assert RESOLVED_TRACES["destination.bigquery"] == ResolvedValueTrace( + "bigquery", + services_json_dict, + None, + TSecretValue, + ["destination"], + SecretsTomlProvider().name, + None, + ) # equivalent - assert dlt.secrets["destination.bigquery.client_email"] == "loader@a7513.iam.gserviceaccount.com" - assert RESOLVED_TRACES["destination.bigquery.client_email"] == ResolvedValueTrace("client_email", "loader@a7513.iam.gserviceaccount.com", None, TSecretValue, ["destination", "bigquery"], SecretsTomlProvider().name, None) + assert ( + dlt.secrets["destination.bigquery.client_email"] == "loader@a7513.iam.gserviceaccount.com" + ) + assert RESOLVED_TRACES["destination.bigquery.client_email"] == ResolvedValueTrace( + "client_email", + "loader@a7513.iam.gserviceaccount.com", + None, + TSecretValue, + ["destination", "bigquery"], + SecretsTomlProvider().name, + None, + ) def test_getter_accessor_typed(toml_providers: ConfigProvidersContext, environment: Any) -> None: @@ -104,7 +142,9 @@ def test_getter_accessor_typed(toml_providers: ConfigProvidersContext, environme # the typed version coerces the value into desired 
type, in this case "dict" -> "str" assert dlt.secrets.get("credentials", str) == credentials_str # note that trace keeps original value of "credentials" which was of dictionary type - assert RESOLVED_TRACES[".credentials"] == ResolvedValueTrace("credentials", json.loads(credentials_str), None, str, [], SecretsTomlProvider().name, None) + assert RESOLVED_TRACES[".credentials"] == ResolvedValueTrace( + "credentials", json.loads(credentials_str), None, str, [], SecretsTomlProvider().name, None + ) # unchanged type assert isinstance(dlt.secrets.get("credentials"), dict) # fail on type coercion @@ -148,8 +188,8 @@ def test_setter(toml_providers: ConfigProvidersContext, environment: Any) -> Non # mod the config and use it to resolve the configuration dlt.config["pool"] = {"pool_type": "process", "workers": 21} - c = resolve_configuration(PoolRunnerConfiguration(), sections=("pool", )) - assert dict(c) == {"pool_type": "process", "workers": 21, 'run_sleep': 0.1} + c = resolve_configuration(PoolRunnerConfiguration(), sections=("pool",)) + assert dict(c) == {"pool_type": "process", "workers": 21, "run_sleep": 0.1} def test_secrets_separation(toml_providers: ConfigProvidersContext) -> None: @@ -163,13 +203,19 @@ def test_secrets_separation(toml_providers: ConfigProvidersContext) -> None: def test_access_injection(toml_providers: ConfigProvidersContext) -> None: - @dlt.source - def the_source(api_type=dlt.config.value, credentials: GcpServiceAccountCredentialsWithoutDefaults=dlt.secrets.value, databricks_creds: ConnectionStringCredentials=dlt.secrets.value): + def the_source( + api_type=dlt.config.value, + credentials: GcpServiceAccountCredentialsWithoutDefaults = dlt.secrets.value, + databricks_creds: ConnectionStringCredentials = dlt.secrets.value, + ): assert api_type == "REST" assert credentials.client_email == "loader@a7513.iam.gserviceaccount.com" assert databricks_creds.drivername == "databricks+connector" - return dlt.resource([1,2,3], name="data") + return dlt.resource([1, 2, 3], name="data") # inject first argument, the rest pass explicitly - the_source(credentials=dlt.secrets["destination.credentials"], databricks_creds=dlt.secrets["databricks.credentials"]) + the_source( + credentials=dlt.secrets["destination.credentials"], + databricks_creds=dlt.secrets["databricks.credentials"], + ) diff --git a/tests/common/configuration/test_configuration.py b/tests/common/configuration/test_configuration.py index fc009d8444..d8da21503d 100644 --- a/tests/common/configuration/test_configuration.py +++ b/tests/common/configuration/test_configuration.py @@ -1,52 +1,99 @@ import pytest import datetime # noqa: I251 from unittest.mock import patch -from typing import Any, Dict, Final, List, Mapping, MutableMapping, NewType, Optional, Type, Union, TYPE_CHECKING +from typing import ( + Any, + Dict, + Final, + List, + Mapping, + MutableMapping, + NewType, + Optional, + Type, + Union, + TYPE_CHECKING, +) from dlt.common import json, pendulum, Decimal, Wei from dlt.common.configuration.providers.provider import ConfigProvider -from dlt.common.configuration.specs.gcp_credentials import GcpServiceAccountCredentialsWithoutDefaults +from dlt.common.configuration.specs.gcp_credentials import ( + GcpServiceAccountCredentialsWithoutDefaults, +) from dlt.common.utils import custom_environ from dlt.common.typing import AnyType, DictStrAny, StrAny, TSecretValue, extract_inner_type from dlt.common.configuration.exceptions import ( - ConfigFieldMissingTypeHintException, ConfigFieldTypeHintNotSupported, - 
InvalidNativeValue, LookupTrace, ValueNotSecretException, UnmatchedConfigHintResolversException + ConfigFieldMissingTypeHintException, + ConfigFieldTypeHintNotSupported, + InvalidNativeValue, + LookupTrace, + ValueNotSecretException, + UnmatchedConfigHintResolversException, +) +from dlt.common.configuration import ( + configspec, + ConfigFieldMissingException, + ConfigValueCannotBeCoercedException, + resolve, + is_valid_hint, + resolve_type, +) +from dlt.common.configuration.specs import ( + BaseConfiguration, + RunConfiguration, + ConnectionStringCredentials, ) -from dlt.common.configuration import configspec, ConfigFieldMissingException, ConfigValueCannotBeCoercedException, resolve, is_valid_hint, resolve_type -from dlt.common.configuration.specs import BaseConfiguration, RunConfiguration, ConnectionStringCredentials from dlt.common.configuration.providers import environ as environ_provider, toml -from dlt.common.configuration.utils import get_resolved_traces, ResolvedValueTrace, serialize_value, deserialize_value, add_config_dict_to_env, add_config_to_env +from dlt.common.configuration.utils import ( + get_resolved_traces, + ResolvedValueTrace, + serialize_value, + deserialize_value, + add_config_dict_to_env, + add_config_to_env, +) from tests.utils import preserve_environ from tests.common.configuration.utils import ( - MockProvider, CoercionTestConfiguration, COERCIONS, SecretCredentials, WithCredentialsConfiguration, WrongConfiguration, SecretConfiguration, - SectionedConfiguration, environment, mock_provider, env_provider, reset_resolved_traces) + MockProvider, + CoercionTestConfiguration, + COERCIONS, + SecretCredentials, + WithCredentialsConfiguration, + WrongConfiguration, + SecretConfiguration, + SectionedConfiguration, + environment, + mock_provider, + env_provider, + reset_resolved_traces, +) INVALID_COERCIONS = { # 'STR_VAL': 'test string', # string always OK - 'int_val': "a12345", - 'bool_val': "not_bool", # bool overridden by string - that is the most common problem - 'list_val': {"2": 1, "3": 3.0}, - 'dict_val': "{'a': 1, 'b', '2'}", - 'bytes_val': 'Hello World!', - 'float_val': "invalid", + "int_val": "a12345", + "bool_val": "not_bool", # bool overridden by string - that is the most common problem + "list_val": {"2": 1, "3": 3.0}, + "dict_val": "{'a': 1, 'b', '2'}", + "bytes_val": "Hello World!", + "float_val": "invalid", "tuple_val": "{1:2}", "date_val": "01 May 2022", - "dec_val": True + "dec_val": True, } EXCEPTED_COERCIONS = { # allows to use int for float - 'float_val': 10, + "float_val": 10, # allows to use float for str - 'str_val': 10.0 + "str_val": 10.0, } COERCED_EXCEPTIONS = { # allows to use int for float - 'float_val': 10.0, + "float_val": 10.0, # allows to use float for str - 'str_val': "10.0" + "str_val": "10.0", } @@ -82,8 +129,8 @@ class FieldWithNoDefaultConfiguration(RunConfiguration): no_default: str if TYPE_CHECKING: - def __init__(self, no_default: str = None, sentry_dsn: str = None) -> None: - ... + + def __init__(self, no_default: str = None, sentry_dsn: str = None) -> None: ... @configspec @@ -110,8 +157,8 @@ def on_resolved(self) -> None: raise RuntimeError("Head over heels") if TYPE_CHECKING: - def __init__(self, head: str = None, tube: List[str] = None, heels: str = None) -> None: - ... + + def __init__(self, head: str = None, tube: List[str] = None, heels: str = None) -> None: ... 
@configspec @@ -121,8 +168,13 @@ class EmbeddedConfiguration(BaseConfiguration): sectioned: SectionedConfiguration if TYPE_CHECKING: - def __init__(self, default: str = None, instrumented: InstrumentedConfiguration = None, sectioned: SectionedConfiguration = None) -> None: - ... + + def __init__( + self, + default: str = None, + instrumented: InstrumentedConfiguration = None, + sectioned: SectionedConfiguration = None, + ) -> None: ... @configspec @@ -162,26 +214,26 @@ class ConfigWithDynamicType(BaseConfiguration): discriminator: str embedded_config: BaseConfiguration - @resolve_type('embedded_config') + @resolve_type("embedded_config") def resolve_embedded_type(self) -> Type[BaseConfiguration]: - if self.discriminator == 'a': + if self.discriminator == "a": return DynamicConfigA - elif self.discriminator == 'b': + elif self.discriminator == "b": return DynamicConfigB return BaseConfiguration @configspec class ConfigWithInvalidDynamicType(BaseConfiguration): - @resolve_type('a') + @resolve_type("a") def resolve_a_type(self) -> Type[BaseConfiguration]: return DynamicConfigA - @resolve_type('b') + @resolve_type("b") def resolve_b_type(self) -> Type[BaseConfiguration]: return DynamicConfigB - @resolve_type('c') + @resolve_type("c") def resolve_c_type(self) -> Type[BaseConfiguration]: return DynamicConfigC @@ -191,13 +243,13 @@ class SubclassConfigWithDynamicType(ConfigWithDynamicType): is_number: bool dynamic_type_field: Any - @resolve_type('embedded_config') + @resolve_type("embedded_config") def resolve_embedded_type(self) -> Type[BaseConfiguration]: - if self.discriminator == 'c': + if self.discriminator == "c": return DynamicConfigC return super().resolve_embedded_type() - @resolve_type('dynamic_type_field') + @resolve_type("dynamic_type_field") def resolve_dynamic_type_field(self) -> Type[Union[int, str]]: if self.is_number: return int @@ -221,7 +273,9 @@ def test_initial_config_state() -> None: def test_set_default_config_value(environment: Any) -> None: # set from init method - c = resolve.resolve_configuration(InstrumentedConfiguration(head="h", tube=["a", "b"], heels="he")) + c = resolve.resolve_configuration( + InstrumentedConfiguration(head="h", tube=["a", "b"], heels="he") + ) assert c.to_native_representation() == "h>a>b>he" # set from native form c = resolve.resolve_configuration(InstrumentedConfiguration(), explicit_value="h>a>b>he") @@ -229,7 +283,10 @@ def test_set_default_config_value(environment: Any) -> None: assert c.tube == ["a", "b"] assert c.heels == "he" # set from dictionary - c = resolve.resolve_configuration(InstrumentedConfiguration(), explicit_value={"head": "h", "tube": ["tu", "be"], "heels": "xhe"}) + c = resolve.resolve_configuration( + InstrumentedConfiguration(), + explicit_value={"head": "h", "tube": ["tu", "be"], "heels": "xhe"}, + ) assert c.to_native_representation() == "h>tu>be>xhe" @@ -238,9 +295,14 @@ def test_explicit_values(environment: Any) -> None: environment["PIPELINE_NAME"] = "env name" environment["CREATED_VAL"] = "12837" # set explicit values and allow partial config - c = resolve.resolve_configuration(CoercionTestConfiguration(), - explicit_value={"pipeline_name": "initial name", "none_val": type(environment), "bytes_val": b"str"}, - accept_partial=True + c = resolve.resolve_configuration( + CoercionTestConfiguration(), + explicit_value={ + "pipeline_name": "initial name", + "none_val": type(environment), + "bytes_val": b"str", + }, + accept_partial=True, ) # explicit assert c.pipeline_name == "initial name" @@ -249,13 +311,17 @@ def 
test_explicit_values(environment: Any) -> None: assert c.none_val == type(environment) # unknown field in explicit value dict is ignored - c = resolve.resolve_configuration(CoercionTestConfiguration(), explicit_value={"created_val": "3343"}, accept_partial=True) + c = resolve.resolve_configuration( + CoercionTestConfiguration(), explicit_value={"created_val": "3343"}, accept_partial=True + ) assert "created_val" not in c def test_explicit_values_false_when_bool() -> None: # values like 0, [], "" all coerce to bool False - c = resolve.resolve_configuration(InstrumentedConfiguration(), explicit_value={"head": "", "tube": [], "heels": ""}) + c = resolve.resolve_configuration( + InstrumentedConfiguration(), explicit_value={"head": "", "tube": [], "heels": ""} + ) assert c.head == "" assert c.tube == [] assert c.heels == "" @@ -280,7 +346,6 @@ def test_default_values(environment: Any) -> None: def test_raises_on_final_value_change(environment: Any) -> None: - @configspec class FinalConfiguration(BaseConfiguration): pipeline_name: Final[str] = "comp" @@ -313,7 +378,10 @@ def test_explicit_native_always_skips_resolve(environment: Any) -> None: # explicit representation environment["INS"] = "h>a>b>he" - c = resolve.resolve_configuration(InstrumentedConfiguration(), explicit_value={"head": "h", "tube": ["tu", "be"], "heels": "uhe"}) + c = resolve.resolve_configuration( + InstrumentedConfiguration(), + explicit_value={"head": "h", "tube": ["tu", "be"], "heels": "uhe"}, + ) assert c.heels == "uhe" # also the native explicit value @@ -336,7 +404,10 @@ def test_skip_lookup_native_config_value_if_no_config_section(environment: Any) # the INSTRUMENTED is not looked up because InstrumentedConfiguration has no section with custom_environ({"INSTRUMENTED": "he>tu>u>be>h"}): with pytest.raises(ConfigFieldMissingException) as py_ex: - resolve.resolve_configuration(EmbeddedConfiguration(), explicit_value={"default": "set", "sectioned": {"password": "pwd"}}) + resolve.resolve_configuration( + EmbeddedConfiguration(), + explicit_value={"default": "set", "sectioned": {"password": "pwd"}}, + ) assert py_ex.value.spec_name == "InstrumentedConfiguration" assert py_ex.value.fields == ["head", "tube", "heels"] @@ -360,14 +431,28 @@ def test_on_resolved(environment: Any) -> None: def test_embedded_config(environment: Any) -> None: # resolve all embedded config, using explicit value for instrumented config and explicit dict for sectioned config - C = resolve.resolve_configuration(EmbeddedConfiguration(), explicit_value={"default": "set", "instrumented": "h>tu>be>xhe", "sectioned": {"password": "pwd"}}) + C = resolve.resolve_configuration( + EmbeddedConfiguration(), + explicit_value={ + "default": "set", + "instrumented": "h>tu>be>xhe", + "sectioned": {"password": "pwd"}, + }, + ) assert C.default == "set" assert C.instrumented.to_native_representation() == "h>tu>be>xhe" assert C.sectioned.password == "pwd" # resolve but providing values via env with custom_environ( - {"INSTRUMENTED__HEAD": "h", "INSTRUMENTED__TUBE": '["tu", "u", "be"]', "INSTRUMENTED__HEELS": "xhe", "SECTIONED__PASSWORD": "passwd", "DEFAULT": "DEF"}): + { + "INSTRUMENTED__HEAD": "h", + "INSTRUMENTED__TUBE": '["tu", "u", "be"]', + "INSTRUMENTED__HEELS": "xhe", + "SECTIONED__PASSWORD": "passwd", + "DEFAULT": "DEF", + } + ): C = resolve.resolve_configuration(EmbeddedConfiguration()) assert C.default == "DEF" assert C.instrumented.to_native_representation() == "h>tu>u>be>xhe" @@ -391,11 +476,23 @@ def test_embedded_config(environment: Any) -> None: with 
patch.object(InstrumentedConfiguration, "__section__", "instrumented"): with custom_environ({"INSTRUMENTED": "he>tu>u>be>h"}): with pytest.raises(RuntimeError): - resolve.resolve_configuration(EmbeddedConfiguration(), explicit_value={"default": "set", "sectioned": {"password": "pwd"}}) + resolve.resolve_configuration( + EmbeddedConfiguration(), + explicit_value={"default": "set", "sectioned": {"password": "pwd"}}, + ) # part via env part via explicit values - with custom_environ({"INSTRUMENTED__HEAD": "h", "INSTRUMENTED__TUBE": '["tu", "u", "be"]', "INSTRUMENTED__HEELS": "xhe"}): - C = resolve.resolve_configuration(EmbeddedConfiguration(), explicit_value={"default": "set", "sectioned": {"password": "pwd"}}) + with custom_environ( + { + "INSTRUMENTED__HEAD": "h", + "INSTRUMENTED__TUBE": '["tu", "u", "be"]', + "INSTRUMENTED__HEELS": "xhe", + } + ): + C = resolve.resolve_configuration( + EmbeddedConfiguration(), + explicit_value={"default": "set", "sectioned": {"password": "pwd"}}, + ) assert C.instrumented.to_native_representation() == "h>tu>u>be>xhe" @@ -404,7 +501,11 @@ def test_embedded_explicit_value_over_provider(environment: Any) -> None: with patch.object(InstrumentedConfiguration, "__section__", "instrumented"): with custom_environ({"INSTRUMENTED": "h>tu>u>be>he"}): # explicit value over the env - c = resolve.resolve_configuration(EmbeddedConfiguration(), explicit_value={"instrumented": "h>tu>be>xhe"}, accept_partial=True) + c = resolve.resolve_configuration( + EmbeddedConfiguration(), + explicit_value={"instrumented": "h>tu>be>xhe"}, + accept_partial=True, + ) assert c.instrumented.to_native_representation() == "h>tu>be>xhe" # parent configuration is not resolved assert not c.is_resolved() @@ -421,7 +522,9 @@ def test_provider_values_over_embedded_default(environment: Any) -> None: with custom_environ({"INSTRUMENTED": "h>tu>u>be>he"}): # read from env - over the default values InstrumentedConfiguration().parse_native_representation("h>tu>be>xhe") - c = resolve.resolve_configuration(EmbeddedConfiguration(instrumented=None), accept_partial=True) + c = resolve.resolve_configuration( + EmbeddedConfiguration(instrumented=None), accept_partial=True + ) assert c.instrumented.to_native_representation() == "h>tu>u>be>he" # parent configuration is not resolved assert not c.is_resolved() @@ -438,30 +541,30 @@ def test_run_configuration_gen_name(environment: Any) -> None: def test_configuration_is_mutable_mapping(environment: Any, env_provider: ConfigProvider) -> None: - @configspec class _SecretCredentials(RunConfiguration): pipeline_name: Optional[str] = "secret" secret_value: TSecretValue = None config_files_storage_path: str = "storage" - # configurations provide full MutableMapping support # here order of items in dict matters expected_dict = { - 'pipeline_name': 'secret', - 'sentry_dsn': None, - 'slack_incoming_hook': None, - 'dlthub_telemetry': True, - 'dlthub_telemetry_segment_write_key': 'TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB', - 'log_format': '{asctime}|[{levelname:<21}]|{process}|{name}|{filename}|{funcName}:{lineno}|{message}', - 'log_level': 'WARNING', - 'request_timeout': 60, - 'request_max_attempts': 5, - 'request_backoff_factor': 1, - 'request_max_retry_delay': 300, - 'config_files_storage_path': 'storage', - "secret_value": None + "pipeline_name": "secret", + "sentry_dsn": None, + "slack_incoming_hook": None, + "dlthub_telemetry": True, + "dlthub_telemetry_segment_write_key": "TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB", + "log_format": ( + 
"{asctime}|[{levelname:<21}]|{process}|{name}|{filename}|{funcName}:{lineno}|{message}" + ), + "log_level": "WARNING", + "request_timeout": 60, + "request_max_attempts": 5, + "request_backoff_factor": 1, + "request_max_retry_delay": 300, + "config_files_storage_path": "storage", + "secret_value": None, } assert dict(_SecretCredentials()) == expected_dict @@ -525,9 +628,10 @@ def test_init_method_gen(environment: Any) -> None: def test_multi_derivation_defaults(environment: Any) -> None: - @configspec - class MultiConfiguration(SectionedConfiguration, MockProdConfiguration, ConfigurationWithOptionalTypes): + class MultiConfiguration( + SectionedConfiguration, MockProdConfiguration, ConfigurationWithOptionalTypes + ): pass # apparently dataclasses set default in reverse mro so MockProdConfiguration overwrites @@ -564,12 +668,19 @@ def test_raises_on_many_unresolved_fields(environment: Any, env_provider: Config resolve.resolve_configuration(CoercionTestConfiguration()) assert cf_missing_exc.value.spec_name == "CoercionTestConfiguration" # get all fields that must be set - val_fields = [f for f in CoercionTestConfiguration().get_resolvable_fields() if f.lower().endswith("_val")] + val_fields = [ + f for f in CoercionTestConfiguration().get_resolvable_fields() if f.lower().endswith("_val") + ] traces = cf_missing_exc.value.traces assert len(traces) == len(val_fields) for tr_field, exp_field in zip(traces, val_fields): assert len(traces[tr_field]) == 1 - assert traces[tr_field][0] == LookupTrace("Environment Variables", [], environ_provider.EnvironProvider.get_key_name(exp_field), None) + assert traces[tr_field][0] == LookupTrace( + "Environment Variables", + [], + environ_provider.EnvironProvider.get_key_name(exp_field), + None, + ) # assert traces[tr_field][1] == LookupTrace("secrets.toml", [], toml.TomlFileProvider.get_key_name(exp_field), None) # assert traces[tr_field][2] == LookupTrace("config.toml", [], toml.TomlFileProvider.get_key_name(exp_field), None) @@ -581,7 +692,9 @@ def test_accepts_optional_missing_fields(environment: Any) -> None: # make optional config resolve.resolve_configuration(ConfigurationWithOptionalTypes()) # make config with optional values - resolve.resolve_configuration(ProdConfigurationWithOptionalTypes(), explicit_value={"int_val": None}) + resolve.resolve_configuration( + ProdConfigurationWithOptionalTypes(), explicit_value={"int_val": None} + ) # make config with optional embedded config C2 = resolve.resolve_configuration(EmbeddedOptionalConfiguration()) # embedded config was not fully resolved @@ -591,14 +704,18 @@ def test_accepts_optional_missing_fields(environment: Any) -> None: def test_find_all_keys() -> None: keys = VeryWrongConfiguration().get_resolvable_fields() # assert hints and types: LOG_COLOR had it hint overwritten in derived class - assert set({'str_val': str, 'int_val': int, 'NoneConfigVar': str, 'log_color': str}.items()).issubset(keys.items()) + assert set( + {"str_val": str, "int_val": int, "NoneConfigVar": str, "log_color": str}.items() + ).issubset(keys.items()) def test_coercion_to_hint_types(environment: Any) -> None: add_config_dict_to_env(COERCIONS) C = CoercionTestConfiguration() - resolve._resolve_config_fields(C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False) + resolve._resolve_config_fields( + C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False + ) for key in COERCIONS: assert getattr(C, key) == COERCIONS[key] @@ -659,7 +776,13 @@ def 
test_invalid_coercions(environment: Any) -> None: add_config_dict_to_env(INVALID_COERCIONS) for key, value in INVALID_COERCIONS.items(): try: - resolve._resolve_config_fields(C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False) + resolve._resolve_config_fields( + C, + explicit_values=None, + explicit_sections=(), + embedded_sections=(), + accept_partial=False, + ) except ConfigValueCannotBeCoercedException as coerc_exc: # must fail exactly on expected value if coerc_exc.field_name != key: @@ -674,7 +797,9 @@ def test_excepted_coercions(environment: Any) -> None: C = CoercionTestConfiguration() add_config_dict_to_env(COERCIONS) add_config_dict_to_env(EXCEPTED_COERCIONS, overwrite_keys=True) - resolve._resolve_config_fields(C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False) + resolve._resolve_config_fields( + C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False + ) for key in EXCEPTED_COERCIONS: assert getattr(C, key) == COERCED_EXCEPTIONS[key] @@ -686,6 +811,7 @@ def test_config_with_unsupported_types_in_hints(environment: Any) -> None: class InvalidHintConfiguration(BaseConfiguration): tuple_val: tuple = None # type: ignore set_val: set = None # type: ignore + InvalidHintConfiguration() @@ -695,6 +821,7 @@ def test_config_with_no_hints(environment: Any) -> None: @configspec class NoHintConfiguration(BaseConfiguration): tuple_val = None + NoHintConfiguration() @@ -703,8 +830,8 @@ def test_config_with_non_templated_complex_hints(environment: Any) -> None: environment["TUPLE_VAL"] = "(1,2,3)" environment["DICT_VAL"] = '{"a": 1}' c = resolve.resolve_configuration(NonTemplatedComplexTypesConfiguration()) - assert c.list_val == [1,2,3] - assert c.tuple_val == (1,2,3) + assert c.list_val == [1, 2, 3] + assert c.tuple_val == (1, 2, 3) assert c.dict_val == {"a": 1} @@ -718,7 +845,7 @@ def test_resolve_configuration(environment: Any) -> None: def test_dataclass_instantiation(environment: Any) -> None: # resolve_configuration works on instances of dataclasses and types are not modified - environment['SECRET_VALUE'] = "1" + environment["SECRET_VALUE"] = "1" C = resolve.resolve_configuration(SecretConfiguration()) # auto derived type holds the value assert C.secret_value == "1" @@ -778,14 +905,13 @@ def test_is_valid_hint() -> None: def test_configspec_auto_base_config_derivation() -> None: - @configspec class AutoBaseDerivationConfiguration: auto: str if TYPE_CHECKING: - def __init__(self, auto: str=None) -> None: - ... + + def __init__(self, auto: str = None) -> None: ... 
assert issubclass(AutoBaseDerivationConfiguration, BaseConfiguration) assert hasattr(AutoBaseDerivationConfiguration, "auto") @@ -873,30 +999,59 @@ def test_last_resolve_exception(environment: Any) -> None: def test_resolved_trace(environment: Any) -> None: with custom_environ( - {"INSTRUMENTED__HEAD": "h", "INSTRUMENTED__TUBE": '["tu", "u", "be"]', "INSTRUMENTED__HEELS": "xhe", "SECTIONED__PASSWORD": "passwd", "DEFAULT": "DEF"}): + { + "INSTRUMENTED__HEAD": "h", + "INSTRUMENTED__TUBE": '["tu", "u", "be"]', + "INSTRUMENTED__HEELS": "xhe", + "SECTIONED__PASSWORD": "passwd", + "DEFAULT": "DEF", + } + ): c = resolve.resolve_configuration(EmbeddedConfiguration(default="_DEFF")) traces = get_resolved_traces() prov_name = environ_provider.EnvironProvider().name - assert traces[".default"] == ResolvedValueTrace("default", "DEF", "_DEFF", str, [], prov_name, c) - assert traces["instrumented.head"] == ResolvedValueTrace("head", "h", None, str, ["instrumented"], prov_name, c.instrumented) + assert traces[".default"] == ResolvedValueTrace( + "default", "DEF", "_DEFF", str, [], prov_name, c + ) + assert traces["instrumented.head"] == ResolvedValueTrace( + "head", "h", None, str, ["instrumented"], prov_name, c.instrumented + ) # value is before casting - assert traces["instrumented.tube"] == ResolvedValueTrace("tube", '["tu", "u", "be"]', None, List[str], ["instrumented"], prov_name, c.instrumented) - assert deserialize_value("tube", traces["instrumented.tube"].value, resolve.extract_inner_hint(List[str])) == ["tu", "u", "be"] - assert traces["instrumented.heels"] == ResolvedValueTrace("heels", "xhe", None, str, ["instrumented"], prov_name, c.instrumented) - assert traces["sectioned.password"] == ResolvedValueTrace("password", "passwd", None, str, ["sectioned"], prov_name, c.sectioned) + assert traces["instrumented.tube"] == ResolvedValueTrace( + "tube", '["tu", "u", "be"]', None, List[str], ["instrumented"], prov_name, c.instrumented + ) + assert deserialize_value( + "tube", traces["instrumented.tube"].value, resolve.extract_inner_hint(List[str]) + ) == ["tu", "u", "be"] + assert traces["instrumented.heels"] == ResolvedValueTrace( + "heels", "xhe", None, str, ["instrumented"], prov_name, c.instrumented + ) + assert traces["sectioned.password"] == ResolvedValueTrace( + "password", "passwd", None, str, ["sectioned"], prov_name, c.sectioned + ) assert len(traces) == 5 # try to get native representation with patch.object(InstrumentedConfiguration, "__section__", "snake"): with custom_environ( - {"INSTRUMENTED": "h>t>t>t>he", "SECTIONED__PASSWORD": "pass", "DEFAULT": "UNDEF", "SNAKE": "h>t>t>t>he"}): + { + "INSTRUMENTED": "h>t>t>t>he", + "SECTIONED__PASSWORD": "pass", + "DEFAULT": "UNDEF", + "SNAKE": "h>t>t>t>he", + } + ): c = resolve.resolve_configuration(EmbeddedConfiguration()) resolve.resolve_configuration(InstrumentedConfiguration()) assert traces[".default"] == ResolvedValueTrace("default", "UNDEF", None, str, [], prov_name, c) - assert traces[".instrumented"] == ResolvedValueTrace("instrumented", "h>t>t>t>he", None, InstrumentedConfiguration, [], prov_name, c) + assert traces[".instrumented"] == ResolvedValueTrace( + "instrumented", "h>t>t>t>he", None, InstrumentedConfiguration, [], prov_name, c + ) - assert traces[".snake"] == ResolvedValueTrace("snake", "h>t>t>t>he", None, InstrumentedConfiguration, [], prov_name, None) + assert traces[".snake"] == ResolvedValueTrace( + "snake", "h>t>t>t>he", None, InstrumentedConfiguration, [], prov_name, None + ) def test_extract_inner_hint() -> None: @@ -944,49 
+1099,47 @@ def coerce_single_value(key: str, value: str, hint: Type[Any]) -> Any: def test_dynamic_type_hint(environment: Dict[str, str]) -> None: - """Test dynamic type hint using @resolve_type decorator - """ - environment['DUMMY__DISCRIMINATOR'] = 'b' - environment['DUMMY__EMBEDDED_CONFIG__FIELD_FOR_B'] = 'some_value' + """Test dynamic type hint using @resolve_type decorator""" + environment["DUMMY__DISCRIMINATOR"] = "b" + environment["DUMMY__EMBEDDED_CONFIG__FIELD_FOR_B"] = "some_value" - config = resolve.resolve_configuration(ConfigWithDynamicType(), sections=('dummy', )) + config = resolve.resolve_configuration(ConfigWithDynamicType(), sections=("dummy",)) assert isinstance(config.embedded_config, DynamicConfigB) - assert config.embedded_config.field_for_b == 'some_value' + assert config.embedded_config.field_for_b == "some_value" def test_dynamic_type_hint_subclass(environment: Dict[str, str]) -> None: - """Test overriding @resolve_type method in subclass - """ - environment['DUMMY__IS_NUMBER'] = 'true' - environment['DUMMY__DYNAMIC_TYPE_FIELD'] = '22' + """Test overriding @resolve_type method in subclass""" + environment["DUMMY__IS_NUMBER"] = "true" + environment["DUMMY__DYNAMIC_TYPE_FIELD"] = "22" # Test extended resolver method is applied - environment['DUMMY__DISCRIMINATOR'] = 'c' - environment['DUMMY__EMBEDDED_CONFIG__FIELD_FOR_C'] = 'some_value' + environment["DUMMY__DISCRIMINATOR"] = "c" + environment["DUMMY__EMBEDDED_CONFIG__FIELD_FOR_C"] = "some_value" - config = resolve.resolve_configuration(SubclassConfigWithDynamicType(), sections=('dummy', )) + config = resolve.resolve_configuration(SubclassConfigWithDynamicType(), sections=("dummy",)) assert isinstance(config.embedded_config, DynamicConfigC) - assert config.embedded_config.field_for_c == 'some_value' + assert config.embedded_config.field_for_c == "some_value" # Test super() call is applied correctly - environment['DUMMY__DISCRIMINATOR'] = 'b' - environment['DUMMY__EMBEDDED_CONFIG__FIELD_FOR_B'] = 'some_value' + environment["DUMMY__DISCRIMINATOR"] = "b" + environment["DUMMY__EMBEDDED_CONFIG__FIELD_FOR_B"] = "some_value" - config = resolve.resolve_configuration(SubclassConfigWithDynamicType(), sections=('dummy', )) + config = resolve.resolve_configuration(SubclassConfigWithDynamicType(), sections=("dummy",)) assert isinstance(config.embedded_config, DynamicConfigB) - assert config.embedded_config.field_for_b == 'some_value' + assert config.embedded_config.field_for_b == "some_value" # Test second dynamic field added in subclass - environment['DUMMY__IS_NUMBER'] = 'true' - environment['DUMMY__DYNAMIC_TYPE_FIELD'] = 'some' + environment["DUMMY__IS_NUMBER"] = "true" + environment["DUMMY__DYNAMIC_TYPE_FIELD"] = "some" with pytest.raises(ConfigValueCannotBeCoercedException) as e: - config = resolve.resolve_configuration(SubclassConfigWithDynamicType(), sections=('dummy', )) + config = resolve.resolve_configuration(SubclassConfigWithDynamicType(), sections=("dummy",)) - assert e.value.field_name == 'dynamic_type_field' + assert e.value.field_name == "dynamic_type_field" assert e.value.hint == int @@ -1005,31 +1158,49 @@ def test_add_config_to_env(environment: Dict[str, str]) -> None: EmbeddedConfiguration( instrumented="h>tu>u>be>he", # type: ignore[arg-type] sectioned=SectionedConfiguration(password="PASS"), - default="BUBA") + default="BUBA", + ) ) - add_config_to_env(c, ("dlt", )) + add_config_to_env(c, ("dlt",)) # must contain dlt prefix everywhere, INSTRUMENTED section taken from key and DLT_TEST taken from password - 
assert environment.items() >= { - 'DLT__DEFAULT': 'BUBA', - 'DLT__INSTRUMENTED__HEAD': 'h', 'DLT__INSTRUMENTED__TUBE': '["tu","u","be"]', 'DLT__INSTRUMENTED__HEELS': 'he', - 'DLT__DLT_TEST__PASSWORD': 'PASS' - }.items() + assert ( + environment.items() + >= { + "DLT__DEFAULT": "BUBA", + "DLT__INSTRUMENTED__HEAD": "h", + "DLT__INSTRUMENTED__TUBE": '["tu","u","be"]', + "DLT__INSTRUMENTED__HEELS": "he", + "DLT__DLT_TEST__PASSWORD": "PASS", + }.items() + ) # no dlt environment.clear() add_config_to_env(c) - assert environment.items() == { - 'DEFAULT': 'BUBA', - 'INSTRUMENTED__HEAD': 'h', 'INSTRUMENTED__TUBE': '["tu","u","be"]', 'INSTRUMENTED__HEELS': 'he', - 'DLT_TEST__PASSWORD': 'PASS' - }.items() + assert ( + environment.items() + == { + "DEFAULT": "BUBA", + "INSTRUMENTED__HEAD": "h", + "INSTRUMENTED__TUBE": '["tu","u","be"]', + "INSTRUMENTED__HEELS": "he", + "DLT_TEST__PASSWORD": "PASS", + }.items() + ) # starts with sectioned environment.clear() add_config_to_env(c.sectioned) - assert environment == {'DLT_TEST__PASSWORD': 'PASS'} + assert environment == {"DLT_TEST__PASSWORD": "PASS"} def test_configuration_copy() -> None: - c = resolve.resolve_configuration(EmbeddedConfiguration(), explicit_value={"default": "set", "instrumented": "h>tu>be>xhe", "sectioned": {"password": "pwd"}}) + c = resolve.resolve_configuration( + EmbeddedConfiguration(), + explicit_value={ + "default": "set", + "instrumented": "h>tu>be>xhe", + "sectioned": {"password": "pwd"}, + }, + ) assert c.is_resolved() copy_c = c.copy() assert copy_c.is_resolved() @@ -1042,7 +1213,9 @@ def test_configuration_copy() -> None: cred.parse_native_representation("postgresql://loader:loader@localhost:5432/dlt_data") copy_cred = cred.copy() assert dict(copy_cred) == dict(cred) - assert copy_cred.to_native_representation() == "postgresql://loader:loader@localhost:5432/dlt_data" + assert ( + copy_cred.to_native_representation() == "postgresql://loader:loader@localhost:5432/dlt_data" + ) # resolve the copy assert not copy_cred.is_resolved() resolved_cred_copy = c = resolve.resolve_configuration(copy_cred) # type: ignore[assignment] @@ -1050,7 +1223,6 @@ def test_configuration_copy() -> None: def test_configuration_with_configuration_as_default() -> None: - instrumented_default = InstrumentedConfiguration() instrumented_default.parse_native_representation("h>a>b>he") cred = ConnectionStringCredentials() diff --git a/tests/common/configuration/test_container.py b/tests/common/configuration/test_container.py index 928af63195..21c8de5782 100644 --- a/tests/common/configuration/test_container.py +++ b/tests/common/configuration/test_container.py @@ -6,7 +6,11 @@ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.specs import BaseConfiguration, ContainerInjectableContext from dlt.common.configuration.container import Container -from dlt.common.configuration.exceptions import ConfigFieldMissingException, ContainerInjectableContextMangled, ContextDefaultCannotBeCreated +from dlt.common.configuration.exceptions import ( + ConfigFieldMissingException, + ContainerInjectableContextMangled, + ContextDefaultCannotBeCreated, +) from tests.utils import preserve_environ from tests.common.configuration.utils import environment @@ -20,8 +24,8 @@ def parse_native_representation(self, native_value: Any) -> None: raise ValueError(native_value) if TYPE_CHECKING: - def __init__(self, current_value: str = None) -> None: - ... + + def __init__(self, current_value: str = None) -> None: ... 
@configspec @@ -31,7 +35,6 @@ class EmbeddedWithInjectableContext(BaseConfiguration): @configspec class NoDefaultInjectableContext(ContainerInjectableContext): - can_create_default: ClassVar[bool] = False diff --git a/tests/common/configuration/test_credentials.py b/tests/common/configuration/test_credentials.py index adf5ac829d..ae9b96e903 100644 --- a/tests/common/configuration/test_credentials.py +++ b/tests/common/configuration/test_credentials.py @@ -4,8 +4,21 @@ import pytest from dlt.common.configuration import resolve_configuration from dlt.common.configuration.exceptions import ConfigFieldMissingException -from dlt.common.configuration.specs import ConnectionStringCredentials, GcpServiceAccountCredentialsWithoutDefaults, GcpServiceAccountCredentials, GcpOAuthCredentialsWithoutDefaults, GcpOAuthCredentials, AwsCredentials -from dlt.common.configuration.specs.exceptions import InvalidConnectionString, InvalidGoogleNativeCredentialsType, InvalidGoogleOauth2Json, InvalidGoogleServicesJson, OAuth2ScopesRequired +from dlt.common.configuration.specs import ( + ConnectionStringCredentials, + GcpServiceAccountCredentialsWithoutDefaults, + GcpServiceAccountCredentials, + GcpOAuthCredentialsWithoutDefaults, + GcpOAuthCredentials, + AwsCredentials, +) +from dlt.common.configuration.specs.exceptions import ( + InvalidConnectionString, + InvalidGoogleNativeCredentialsType, + InvalidGoogleOauth2Json, + InvalidGoogleServicesJson, + OAuth2ScopesRequired, +) from dlt.common.configuration.specs.run_configuration import RunConfiguration from tests.utils import preserve_environ @@ -155,7 +168,10 @@ def test_gcp_service_credentials_native_representation(environment) -> None: assert GcpServiceAccountCredentials.__config_gen_annotations__ == [] gcpc = GcpServiceAccountCredentials() - gcpc.parse_native_representation(SERVICE_JSON % '"private_key": "-----BEGIN PRIVATE KEY-----\\n\\n-----END PRIVATE KEY-----\\n",') + gcpc.parse_native_representation( + SERVICE_JSON + % '"private_key": "-----BEGIN PRIVATE KEY-----\\n\\n-----END PRIVATE KEY-----\\n",' + ) assert gcpc.private_key == "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n" assert gcpc.project_id == "chat-analytics" assert gcpc.client_email == "loader@iam.gserviceaccount.com" @@ -191,7 +207,6 @@ def test_gcp_service_credentials_resolved_from_native_representation(environment def test_gcp_oauth_credentials_native_representation(environment) -> None: - with pytest.raises(InvalidGoogleNativeCredentialsType): GcpOAuthCredentials().parse_native_representation(1) @@ -205,13 +220,15 @@ def test_gcp_oauth_credentials_native_representation(environment) -> None: # but is not partial - all required fields are present assert gcoauth.is_partial() is False assert gcoauth.project_id == "level-dragon-333983" - assert gcoauth.client_id == "921382012504-3mtjaj1s7vuvf53j88mgdq4te7akkjm3.apps.googleusercontent.com" + assert ( + gcoauth.client_id + == "921382012504-3mtjaj1s7vuvf53j88mgdq4te7akkjm3.apps.googleusercontent.com" + ) assert gcoauth.client_secret == "gOCSPX-XdY5znbrvjSMEG3pkpA_GHuLPPth" assert gcoauth.refresh_token == "refresh_token" assert gcoauth.token is None assert gcoauth.scopes == ["email", "service"] - # get native representation, it will also location _repr = gcoauth.to_native_representation() assert "localhost" in _repr @@ -289,16 +306,16 @@ def test_run_configuration_slack_credentials(environment: Any) -> None: def test_aws_credentials_resolved(environment: Dict[str, str]) -> None: - environment['CREDENTIALS__AWS_ACCESS_KEY_ID'] = 
'fake_access_key' - environment['CREDENTIALS__AWS_SECRET_ACCESS_KEY'] = 'fake_secret_key' - environment['CREDENTIALS__AWS_SESSION_TOKEN'] = 'fake_session_token' - environment['CREDENTIALS__PROFILE_NAME'] = 'fake_profile' - environment['CREDENTIALS__REGION_NAME'] = 'eu-central' + environment["CREDENTIALS__AWS_ACCESS_KEY_ID"] = "fake_access_key" + environment["CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = "fake_secret_key" + environment["CREDENTIALS__AWS_SESSION_TOKEN"] = "fake_session_token" + environment["CREDENTIALS__PROFILE_NAME"] = "fake_profile" + environment["CREDENTIALS__REGION_NAME"] = "eu-central" config = resolve_configuration(AwsCredentials()) - assert config.aws_access_key_id == 'fake_access_key' - assert config.aws_secret_access_key == 'fake_secret_key' - assert config.aws_session_token == 'fake_session_token' - assert config.profile_name == 'fake_profile' + assert config.aws_access_key_id == "fake_access_key" + assert config.aws_secret_access_key == "fake_secret_key" + assert config.aws_session_token == "fake_session_token" + assert config.profile_name == "fake_profile" assert config.region_name == "eu-central" diff --git a/tests/common/configuration/test_environ_provider.py b/tests/common/configuration/test_environ_provider.py index ccac6c54eb..0608ea1d7a 100644 --- a/tests/common/configuration/test_environ_provider.py +++ b/tests/common/configuration/test_environ_provider.py @@ -2,7 +2,12 @@ from typing import Any from dlt.common.typing import TSecretValue -from dlt.common.configuration import configspec, ConfigFieldMissingException, ConfigFileNotFoundException, resolve +from dlt.common.configuration import ( + configspec, + ConfigFieldMissingException, + ConfigFileNotFoundException, + resolve, +) from dlt.common.configuration.specs import RunConfiguration, BaseConfiguration from dlt.common.configuration.providers import environ as environ_provider @@ -27,22 +32,25 @@ class MockProdRunConfigurationVar(RunConfiguration): pipeline_name: str = "comp" - def test_resolves_from_environ(environment: Any) -> None: environment["NONECONFIGVAR"] = "Some" C = WrongConfiguration() - resolve._resolve_config_fields(C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False) + resolve._resolve_config_fields( + C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False + ) assert not C.is_partial() assert C.NoneConfigVar == environment["NONECONFIGVAR"] def test_resolves_from_environ_with_coercion(environment: Any) -> None: - environment["RUNTIME__TEST_BOOL"] = 'yes' + environment["RUNTIME__TEST_BOOL"] = "yes" C = SimpleRunConfiguration() - resolve._resolve_config_fields(C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False) + resolve._resolve_config_fields( + C, explicit_values=None, explicit_sections=(), embedded_sections=(), accept_partial=False + ) assert not C.is_partial() # value will be coerced to bool @@ -52,13 +60,13 @@ def test_resolves_from_environ_with_coercion(environment: Any) -> None: def test_secret(environment: Any) -> None: with pytest.raises(ConfigFieldMissingException): resolve.resolve_configuration(SecretConfiguration()) - environment['SECRET_VALUE'] = "1" + environment["SECRET_VALUE"] = "1" C = resolve.resolve_configuration(SecretConfiguration()) assert C.secret_value == "1" # mock the path to point to secret storage # from dlt.common.configuration import config_utils path = environ_provider.SECRET_STORAGE_PATH - del environment['SECRET_VALUE'] + del environment["SECRET_VALUE"] 
try: # must read a secret file environ_provider.SECRET_STORAGE_PATH = "./tests/common/cases/%s" @@ -66,13 +74,13 @@ def test_secret(environment: Any) -> None: assert C.secret_value == "BANANA" # set some weird path, no secret file at all - del environment['SECRET_VALUE'] + del environment["SECRET_VALUE"] environ_provider.SECRET_STORAGE_PATH = "!C:\\PATH%s" with pytest.raises(ConfigFieldMissingException): resolve.resolve_configuration(SecretConfiguration()) # set env which is a fallback for secret not as file - environment['SECRET_VALUE'] = "1" + environment["SECRET_VALUE"] = "1" C = resolve.resolve_configuration(SecretConfiguration()) assert C.secret_value == "1" finally: @@ -87,7 +95,7 @@ def test_secret_kube_fallback(environment: Any) -> None: # all unix editors will add x10 at the end of file, it will be preserved assert C.secret_kube == "kube\n" # we propagate secrets back to environ and strip the whitespace - assert environment['SECRET_KUBE'] == "kube" + assert environment["SECRET_KUBE"] == "kube" finally: environ_provider.SECRET_STORAGE_PATH = path @@ -99,7 +107,10 @@ def test_configuration_files(environment: Any) -> None: assert C.config_files_storage_path == environment["RUNTIME__CONFIG_FILES_STORAGE_PATH"] assert C.has_configuration_file("hasn't") is False assert C.has_configuration_file("event.schema.json") is True - assert C.get_configuration_file_path("event.schema.json") == "./tests/common/cases/schemas/ev1/event.schema.json" + assert ( + C.get_configuration_file_path("event.schema.json") + == "./tests/common/cases/schemas/ev1/event.schema.json" + ) with C.open_configuration_file("event.schema.json", "r") as f: f.read() with pytest.raises(ConfigFileNotFoundException): diff --git a/tests/common/configuration/test_inject.py b/tests/common/configuration/test_inject.py index b52d6f64b9..8b9616ccd7 100644 --- a/tests/common/configuration/test_inject.py +++ b/tests/common/configuration/test_inject.py @@ -9,7 +9,11 @@ from dlt.common.configuration.providers import EnvironProvider from dlt.common.configuration.providers.toml import SECRETS_TOML from dlt.common.configuration.resolve import inject_section -from dlt.common.configuration.specs import BaseConfiguration, GcpServiceAccountCredentialsWithoutDefaults, ConnectionStringCredentials +from dlt.common.configuration.specs import ( + BaseConfiguration, + GcpServiceAccountCredentialsWithoutDefaults, + ConnectionStringCredentials, +) from dlt.common.configuration.specs.base_configuration import is_secret_hint from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.configuration.specs.config_section_context import ConfigSectionContext @@ -21,7 +25,6 @@ def test_arguments_are_explicit(environment: Any) -> None: - @with_config def f_var(user=dlt.config.value, path=dlt.config.value): # explicit args "survive" the injection: they have precedence over env @@ -43,7 +46,6 @@ def f_var_env(user=dlt.config.value, path=dlt.config.value): def test_default_values_are_resolved(environment: Any) -> None: - @with_config def f_var(user=dlt.config.value, path="a/b/c"): assert user == "env user" @@ -54,7 +56,6 @@ def f_var(user=dlt.config.value, path="a/b/c"): def test_arguments_dlt_literal_defaults_are_required(environment: Any) -> None: - @with_config def f_config(user=dlt.config.value): assert user is not None @@ -84,7 +85,6 @@ def f_secret(password=dlt.secrets.value): def test_inject_from_argument_section(toml_providers: ConfigProvidersContext) -> None: - # `gcp_storage` is a key in `secrets.toml` and 
the default `credentials` section of GcpServiceAccountCredentialsWithoutDefaults must be replaced with it @with_config @@ -96,11 +96,12 @@ def f_credentials(gcp_storage: GcpServiceAccountCredentialsWithoutDefaults = dlt def test_inject_secret_value_secret_type(environment: Any) -> None: - @with_config - def f_custom_secret_type(_dict: Dict[str, Any] = dlt.secrets.value, _int: int = dlt.secrets.value, **kwargs: Any): + def f_custom_secret_type( + _dict: Dict[str, Any] = dlt.secrets.value, _int: int = dlt.secrets.value, **kwargs: Any + ): # secret values were coerced into types - assert _dict == {"a":1} + assert _dict == {"a": 1} assert _int == 1234 cfg = last_config(**kwargs) spec: Type[BaseConfiguration] = cfg.__class__ @@ -158,23 +159,24 @@ def test_inject_with_sections() -> None: def test_inject_with_sections_and_sections_context() -> None: - @with_config def no_sections(value=dlt.config.value): return value - @with_config(sections=("test", )) + @with_config(sections=("test",)) def test_sections(value=dlt.config.value): return value # a section context that prefers existing context - @with_config(sections=("test", ), sections_merge_style=ConfigSectionContext.prefer_existing) + @with_config(sections=("test",), sections_merge_style=ConfigSectionContext.prefer_existing) def test_sections_pref_existing(value=dlt.config.value): return value - # a section that wants context like dlt resource - @with_config(sections=("test", "module", "name"), sections_merge_style=ConfigSectionContext.resource_merge_style) + @with_config( + sections=("test", "module", "name"), + sections_merge_style=ConfigSectionContext.resource_merge_style, + ) def test_sections_like_resource(value=dlt.config.value): return value @@ -189,7 +191,7 @@ def test_sections_like_resource(value=dlt.config.value): assert test_sections_pref_existing() == "test_section" assert test_sections_like_resource() == "test_section" - with inject_section(ConfigSectionContext(sections=("injected", ))): + with inject_section(ConfigSectionContext(sections=("injected",))): # the "injected" section is applied to "no_section" func that has no sections assert no_sections() == "injected_section" # but not to "test" - it won't be overridden by section context @@ -198,7 +200,9 @@ def test_sections_like_resource(value=dlt.config.value): # this one explicitly prefers existing context assert test_sections_pref_existing() == "injected_section" - with inject_section(ConfigSectionContext(sections=("test", "existing_module", "existing_name"))): + with inject_section( + ConfigSectionContext(sections=("test", "existing_module", "existing_name")) + ): assert test_sections_like_resource() == "resource_style_injected" @@ -256,10 +260,13 @@ def test_initial_spec_from_arg_with_spec_type() -> None: pass -def test_use_most_specific_union_type(environment: Any, toml_providers: ConfigProvidersContext) -> None: - +def test_use_most_specific_union_type( + environment: Any, toml_providers: ConfigProvidersContext +) -> None: @with_config - def postgres_union(local_credentials: Union[ConnectionStringCredentials, str, StrAny] = dlt.secrets.value): + def postgres_union( + local_credentials: Union[ConnectionStringCredentials, str, StrAny] = dlt.secrets.value + ): return local_credentials @with_config @@ -267,7 +274,13 @@ def postgres_direct(local_credentials: ConnectionStringCredentials = dlt.secrets return local_credentials conn_str = "postgres://loader:loader@localhost:5432/dlt_data" - conn_dict = {"host": "localhost", "database": "dlt_test", "username": "loader", "password": 
"loader", "drivername": "postgresql"} + conn_dict = { + "host": "localhost", + "database": "dlt_test", + "username": "loader", + "password": "loader", + "drivername": "postgresql", + } conn_cred = ConnectionStringCredentials() conn_cred.parse_native_representation(conn_str) @@ -313,7 +326,6 @@ def postgres_direct(local_credentials: ConnectionStringCredentials = dlt.secrets def test_auto_derived_spec_type_name() -> None: - class AutoNameTest: @with_config def __init__(self, pos_par=dlt.secrets.value, /, kw_par=None) -> None: @@ -334,7 +346,10 @@ def stuff_test(pos_par, /, kw_par) -> None: pass # name is composed via __qualname__ of func - assert _get_spec_name_from_f(AutoNameTest.__init__) == "TestAutoDerivedSpecTypeNameAutoNameTestInitConfiguration" + assert ( + _get_spec_name_from_f(AutoNameTest.__init__) + == "TestAutoDerivedSpecTypeNameAutoNameTestInitConfiguration" + ) # synthesized spec present in current module assert "TestAutoDerivedSpecTypeNameAutoNameTestInitConfiguration" in globals() # instantiate diff --git a/tests/common/configuration/test_providers.py b/tests/common/configuration/test_providers.py index 2408aae583..f8c7900c24 100644 --- a/tests/common/configuration/test_providers.py +++ b/tests/common/configuration/test_providers.py @@ -1,5 +1,6 @@ import pytest + @pytest.mark.skip("Not implemented") def test_providers_order() -> None: pass diff --git a/tests/common/configuration/test_sections.py b/tests/common/configuration/test_sections.py index 1298dd11f2..9e0bc7e26d 100644 --- a/tests/common/configuration/test_sections.py +++ b/tests/common/configuration/test_sections.py @@ -2,14 +2,25 @@ from typing import Any, Optional from dlt.common.configuration.container import Container -from dlt.common.configuration import configspec, ConfigFieldMissingException, resolve, inject_section +from dlt.common.configuration import ( + configspec, + ConfigFieldMissingException, + resolve, + inject_section, +) from dlt.common.configuration.providers.provider import ConfigProvider from dlt.common.configuration.specs import BaseConfiguration, ConfigSectionContext from dlt.common.configuration.exceptions import LookupTrace from dlt.common.typing import AnyType from tests.utils import preserve_environ -from tests.common.configuration.utils import MockProvider, SectionedConfiguration, environment, mock_provider, env_provider +from tests.common.configuration.utils import ( + MockProvider, + SectionedConfiguration, + environment, + mock_provider, + env_provider, +) @configspec @@ -53,7 +64,9 @@ def test_sectioned_configuration(environment: Any, env_provider: ConfigProvider) traces = exc_val.value.traces["password"] # only one provider and section was tried assert len(traces) == 1 - assert traces[0] == LookupTrace("Environment Variables", ["DLT_TEST"], "DLT_TEST__PASSWORD", None) + assert traces[0] == LookupTrace( + "Environment Variables", ["DLT_TEST"], "DLT_TEST__PASSWORD", None + ) # assert traces[1] == LookupTrace("secrets.toml", ["DLT_TEST"], "DLT_TEST.password", None) # assert traces[2] == LookupTrace("config.toml", ["DLT_TEST"], "DLT_TEST.password", None) @@ -109,7 +122,14 @@ def test_explicit_sections_with_sectioned_config(mock_provider: MockProvider) -> assert mock_provider.last_sections == [("ns1",), (), ("ns1", "DLT_TEST"), ("DLT_TEST",)] mock_provider.reset_stats() resolve.resolve_configuration(SectionedConfiguration(), sections=("ns1", "ns2")) - assert mock_provider.last_sections == [("ns1", "ns2"), ("ns1",), (), ("ns1", "ns2", "DLT_TEST"), ("ns1", "DLT_TEST"), ("DLT_TEST",)] + assert 
mock_provider.last_sections == [ + ("ns1", "ns2"), + ("ns1",), + (), + ("ns1", "ns2", "DLT_TEST"), + ("ns1", "DLT_TEST"), + ("DLT_TEST",), + ] def test_overwrite_config_section_from_embedded(mock_provider: MockProvider) -> None: @@ -135,7 +155,13 @@ def test_explicit_sections_from_embedded_config(mock_provider: MockProvider) -> # embedded section inner of explicit mock_provider.reset_stats() resolve.resolve_configuration(EmbeddedConfiguration(), sections=("ns1",)) - assert mock_provider.last_sections == [("ns1", "sv_config",), ("sv_config",)] + assert mock_provider.last_sections == [ + ( + "ns1", + "sv_config", + ), + ("sv_config",), + ] def test_ignore_embedded_section_by_field_name(mock_provider: MockProvider) -> None: @@ -156,7 +182,11 @@ def test_ignore_embedded_section_by_field_name(mock_provider: MockProvider) -> N mock_provider.reset_stats() mock_provider.return_value_on = ("DLT_TEST",) resolve.resolve_configuration(EmbeddedWithIgnoredEmbeddedConfiguration()) - assert mock_provider.last_sections == [('ignored_embedded',), ('ignored_embedded', 'DLT_TEST'), ('DLT_TEST',)] + assert mock_provider.last_sections == [ + ("ignored_embedded",), + ("ignored_embedded", "DLT_TEST"), + ("DLT_TEST",), + ] def test_injected_sections(mock_provider: MockProvider) -> None: @@ -174,7 +204,12 @@ def test_injected_sections(mock_provider: MockProvider) -> None: mock_provider.reset_stats() mock_provider.return_value_on = ("DLT_TEST",) resolve.resolve_configuration(SectionedConfiguration()) - assert mock_provider.last_sections == [("inj-ns1",), (), ("inj-ns1", "DLT_TEST"), ("DLT_TEST",)] + assert mock_provider.last_sections == [ + ("inj-ns1",), + (), + ("inj-ns1", "DLT_TEST"), + ("DLT_TEST",), + ] # injected section inner of ns coming from embedded config mock_provider.reset_stats() mock_provider.return_value_on = () @@ -196,7 +231,7 @@ def test_section_context() -> None: with pytest.raises(ValueError): ConfigSectionContext(sections=()).source_name() with pytest.raises(ValueError): - ConfigSectionContext(sections=("sources", )).source_name() + ConfigSectionContext(sections=("sources",)).source_name() with pytest.raises(ValueError): ConfigSectionContext(sections=("sources", "modules")).source_name() @@ -221,7 +256,7 @@ def test_section_with_pipeline_name(mock_provider: MockProvider) -> None: # PIPE section is exhausted then another lookup without PIPE assert mock_provider.last_sections == [("PIPE", "ns1"), ("PIPE",), ("ns1",), ()] - mock_provider.return_value_on = ("PIPE", ) + mock_provider.return_value_on = ("PIPE",) mock_provider.reset_stats() resolve.resolve_configuration(SingleValConfiguration(), sections=("ns1",)) assert mock_provider.last_sections == [("PIPE", "ns1"), ("PIPE",)] @@ -237,10 +272,12 @@ def test_section_with_pipeline_name(mock_provider: MockProvider) -> None: mock_provider.reset_stats() resolve.resolve_configuration(SectionedConfiguration()) # first the whole SectionedConfiguration is looked under key DLT_TEST (sections: ('PIPE',), ()), then fields of SectionedConfiguration - assert mock_provider.last_sections == [('PIPE',), (), ("PIPE", "DLT_TEST"), ("DLT_TEST",)] + assert mock_provider.last_sections == [("PIPE",), (), ("PIPE", "DLT_TEST"), ("DLT_TEST",)] # with pipeline and injected sections - with container.injectable_context(ConfigSectionContext(pipeline_name="PIPE", sections=("inj-ns1",))): + with container.injectable_context( + ConfigSectionContext(pipeline_name="PIPE", sections=("inj-ns1",)) + ): mock_provider.return_value_on = () mock_provider.reset_stats() 
resolve.resolve_configuration(SingleValConfiguration()) diff --git a/tests/common/configuration/test_spec_union.py b/tests/common/configuration/test_spec_union.py index f013c9c568..25c32920bc 100644 --- a/tests/common/configuration/test_spec_union.py +++ b/tests/common/configuration/test_spec_union.py @@ -145,8 +145,17 @@ def test_unresolved_union() -> None: resolve_configuration(ZenConfig()) assert cfm_ex.value.fields == ["credentials"] # all the missing fields from all the union elements are present - checked_keys = set(t.key for t in itertools.chain(*cfm_ex.value.traces.values()) if t.provider == EnvironProvider().name) - assert checked_keys == {"CREDENTIALS__EMAIL", "CREDENTIALS__PASSWORD", "CREDENTIALS__API_KEY", "CREDENTIALS__API_SECRET"} + checked_keys = set( + t.key + for t in itertools.chain(*cfm_ex.value.traces.values()) + if t.provider == EnvironProvider().name + ) + assert checked_keys == { + "CREDENTIALS__EMAIL", + "CREDENTIALS__PASSWORD", + "CREDENTIALS__API_KEY", + "CREDENTIALS__API_SECRET", + } def test_union_decorator() -> None: @@ -154,7 +163,10 @@ def test_union_decorator() -> None: # this will generate equivalent of ZenConfig @dlt.source - def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False): + def zen_source( + credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, + some_option: bool = False, + ): # depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected in credentials # both classes implement `auth` so you can always call it credentials.auth() # type: ignore[union-attr] @@ -179,6 +191,7 @@ class GoogleAnalyticsCredentialsBase(CredentialsConfiguration): """ The Base version of all the GoogleAnalyticsCredentials classes. 
""" + pass @@ -187,6 +200,7 @@ class GoogleAnalyticsCredentialsOAuth(GoogleAnalyticsCredentialsBase): """ This class is used to store credentials Google Analytics """ + client_id: str client_secret: TSecretValue project_id: TSecretValue @@ -195,23 +209,27 @@ class GoogleAnalyticsCredentialsOAuth(GoogleAnalyticsCredentialsBase): @dlt.source(max_table_nesting=2) -def google_analytics(credentials: Union[GoogleAnalyticsCredentialsOAuth, GcpServiceAccountCredentials] = dlt.secrets.value): +def google_analytics( + credentials: Union[ + GoogleAnalyticsCredentialsOAuth, GcpServiceAccountCredentials + ] = dlt.secrets.value +): yield dlt.resource([credentials], name="creds") def test_google_auth_union(environment: Any) -> None: info = { - "type" : "service_account", - "project_id" : "dlthub-analytics", - "private_key_id" : "45cbe97fbd3d756d55d4633a5a72d8530a05b993", - "private_key" : "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n", - "client_email" : "105150287833-compute@developer.gserviceaccount.com", - "client_id" : "106404499083406128146", - "auth_uri" : "https://accounts.google.com/o/oauth2/auth", - "token_uri" : "https://oauth2.googleapis.com/token", - "auth_provider_x509_cert_url" : "https://www.googleapis.com/oauth2/v1/certs", - "client_x509_cert_url" : "https://www.googleapis.com/robot/v1/metadata/x509/105150287833-compute%40developer.gserviceaccount.com" - } + "type": "service_account", + "project_id": "dlthub-analytics", + "private_key_id": "45cbe97fbd3d756d55d4633a5a72d8530a05b993", + "private_key": "-----BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY-----\n", + "client_email": "105150287833-compute@developer.gserviceaccount.com", + "client_id": "106404499083406128146", + "auth_uri": "https://accounts.google.com/o/oauth2/auth", + "token_uri": "https://oauth2.googleapis.com/token", + "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", + "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/105150287833-compute%40developer.gserviceaccount.com", + } credentials = list(google_analytics(credentials=info))[0] # type: ignore[arg-type] print(dict(credentials)) @@ -225,23 +243,23 @@ def sql_database(credentials: Union[ConnectionStringCredentials, Engine, str] = def test_union_concrete_type(environment: Any) -> None: # we can pass engine explicitly - engine = create_engine('sqlite:///:memory:', echo=True) + engine = create_engine("sqlite:///:memory:", echo=True) db = sql_database(credentials=engine) creds = list(db)[0] assert isinstance(creds, Engine) # we can pass valid connection string explicitly - db = sql_database(credentials='sqlite://user@/:memory:') + db = sql_database(credentials="sqlite://user@/:memory:") creds = list(db)[0] # but it is used as native value assert isinstance(creds, ConnectionStringCredentials) # pass instance of credentials - cn = ConnectionStringCredentials('sqlite://user@/:memory:') + cn = ConnectionStringCredentials("sqlite://user@/:memory:") db = sql_database(credentials=cn) # exactly that instance is returned assert list(db)[0] is cn # invalid cn with pytest.raises(InvalidNativeValue): - db = sql_database(credentials='?') + db = sql_database(credentials="?") with pytest.raises(InvalidNativeValue): db = sql_database(credentials=123) # type: ignore[arg-type] diff --git a/tests/common/configuration/test_toml_provider.py b/tests/common/configuration/test_toml_provider.py index 71ceb790e2..db5333f610 100644 --- a/tests/common/configuration/test_toml_provider.py +++ 
b/tests/common/configuration/test_toml_provider.py @@ -10,14 +10,34 @@ from dlt.common.configuration.container import Container from dlt.common.configuration.inject import with_config from dlt.common.configuration.exceptions import LookupTrace -from dlt.common.configuration.providers.toml import SECRETS_TOML, CONFIG_TOML, BaseTomlProvider, SecretsTomlProvider, ConfigTomlProvider, StringTomlProvider, TomlProviderReadException +from dlt.common.configuration.providers.toml import ( + SECRETS_TOML, + CONFIG_TOML, + BaseTomlProvider, + SecretsTomlProvider, + ConfigTomlProvider, + StringTomlProvider, + TomlProviderReadException, +) from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext -from dlt.common.configuration.specs import BaseConfiguration, GcpServiceAccountCredentialsWithoutDefaults, ConnectionStringCredentials +from dlt.common.configuration.specs import ( + BaseConfiguration, + GcpServiceAccountCredentialsWithoutDefaults, + ConnectionStringCredentials, +) from dlt.common.runners.configuration import PoolRunnerConfiguration from dlt.common.typing import TSecretValue from tests.utils import preserve_environ -from tests.common.configuration.utils import SecretCredentials, WithCredentialsConfiguration, CoercionTestConfiguration, COERCIONS, SecretConfiguration, environment, toml_providers +from tests.common.configuration.utils import ( + SecretCredentials, + WithCredentialsConfiguration, + CoercionTestConfiguration, + COERCIONS, + SecretConfiguration, + environment, + toml_providers, +) @configspec @@ -31,7 +51,6 @@ class EmbeddedWithGcpCredentials(BaseConfiguration): def test_secrets_from_toml_secrets(toml_providers: ConfigProvidersContext) -> None: - # remove secret_value to trigger exception del toml_providers["secrets.toml"]._toml["secret_value"] # type: ignore[attr-defined] @@ -63,10 +82,8 @@ def test_toml_types(toml_providers: ConfigProvidersContext) -> None: def test_config_provider_order(toml_providers: ConfigProvidersContext, environment: Any) -> None: - # add env provider - @with_config(sections=("api",)) def single_val(port=None): return port @@ -86,7 +103,11 @@ def test_toml_mixed_config_inject(toml_providers: ConfigProvidersContext) -> Non # get data from both providers @with_config - def mixed_val(api_type=dlt.config.value, secret_value: TSecretValue = dlt.secrets.value, typecheck: Any = dlt.config.value): + def mixed_val( + api_type=dlt.config.value, + secret_value: TSecretValue = dlt.secrets.value, + typecheck: Any = dlt.config.value, + ): return api_type, secret_value, typecheck _tup = mixed_val(None, None, None) @@ -109,13 +130,19 @@ def test_toml_sections(toml_providers: ConfigProvidersContext) -> None: def test_secrets_toml_credentials(environment: Any, toml_providers: ConfigProvidersContext) -> None: # there are credentials exactly under destination.bigquery.credentials - c = resolve.resolve_configuration(GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination", "bigquery")) + c = resolve.resolve_configuration( + GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination", "bigquery") + ) assert c.project_id.endswith("destination.bigquery.credentials") # there are no destination.gcp_storage.credentials so it will fallback to "destination"."credentials" - c = resolve.resolve_configuration(GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination", "gcp_storage")) + c = resolve.resolve_configuration( + GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination", "gcp_storage") + ) assert 
c.project_id.endswith("destination.credentials") # also explicit - c = resolve.resolve_configuration(GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination",)) + c = resolve.resolve_configuration( + GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination",) + ) assert c.project_id.endswith("destination.credentials") # there's "credentials" key but does not contain valid gcp credentials with pytest.raises(ConfigFieldMissingException): @@ -132,12 +159,18 @@ def test_secrets_toml_credentials(environment: Any, toml_providers: ConfigProvid resolve.resolve_configuration(c3, sections=("destination", "bigquery")) -def test_secrets_toml_embedded_credentials(environment: Any, toml_providers: ConfigProvidersContext) -> None: +def test_secrets_toml_embedded_credentials( + environment: Any, toml_providers: ConfigProvidersContext +) -> None: # will try destination.bigquery.credentials - c = resolve.resolve_configuration(EmbeddedWithGcpCredentials(), sections=("destination", "bigquery")) + c = resolve.resolve_configuration( + EmbeddedWithGcpCredentials(), sections=("destination", "bigquery") + ) assert c.credentials.project_id.endswith("destination.bigquery.credentials") # will try destination.gcp_storage.credentials and fallback to destination.credentials - c = resolve.resolve_configuration(EmbeddedWithGcpCredentials(), sections=("destination", "gcp_storage")) + c = resolve.resolve_configuration( + EmbeddedWithGcpCredentials(), sections=("destination", "gcp_storage") + ) assert c.credentials.project_id.endswith("destination.credentials") # will try everything until credentials in the root where incomplete credentials are present c = EmbeddedWithGcpCredentials() @@ -150,11 +183,15 @@ def test_secrets_toml_embedded_credentials(environment: Any, toml_providers: Con assert set(py_ex.value.traces.keys()) == {"client_email", "private_key"} # embed "gcp_storage" will bubble up to the very top, never reverts to "credentials" - c2 = resolve.resolve_configuration(EmbeddedWithGcpStorage(), sections=("destination", "bigquery")) + c2 = resolve.resolve_configuration( + EmbeddedWithGcpStorage(), sections=("destination", "bigquery") + ) assert c2.gcp_storage.project_id.endswith("-gcp-storage") # also explicit - c3 = resolve.resolve_configuration(GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination",)) + c3 = resolve.resolve_configuration( + GcpServiceAccountCredentialsWithoutDefaults(), sections=("destination",) + ) assert c3.project_id.endswith("destination.credentials") # there's "credentials" key but does not contain valid gcp credentials with pytest.raises(ConfigFieldMissingException): @@ -166,13 +203,22 @@ def test_dicts_are_not_enumerated() -> None: pass -def test_secrets_toml_credentials_from_native_repr(environment: Any, toml_providers: ConfigProvidersContext) -> None: +def test_secrets_toml_credentials_from_native_repr( + environment: Any, toml_providers: ConfigProvidersContext +) -> None: # cfg = toml_providers["secrets.toml"] # print(cfg._toml) # print(cfg._toml["source"]["credentials"]) # resolve gcp_credentials by parsing initial value which is str holding json doc - c = resolve.resolve_configuration(GcpServiceAccountCredentialsWithoutDefaults(), sections=("source",)) - assert c.private_key == "-----BEGIN PRIVATE KEY-----\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD+S\n...\n-----END PRIVATE KEY-----\n" + c = resolve.resolve_configuration( + GcpServiceAccountCredentialsWithoutDefaults(), sections=("source",) + ) + assert ( + c.private_key + == 
"-----BEGIN PRIVATE" + " KEY-----\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD+S\n...\n-----END" + " PRIVATE KEY-----\n" + ) # but project id got overridden from credentials.project_id assert c.project_id.endswith("-credentials") # also try sql alchemy url (native repr) @@ -252,19 +298,33 @@ def test_write_value(toml_providers: ConfigProvidersContext) -> None: # this will create path of tables provider.set_value("deep_int", 2137, "deep_pipeline", "deep", "deep", "deep", "deep") assert provider._toml["deep_pipeline"]["deep"]["deep"]["deep"]["deep"]["deep_int"] == 2137 # type: ignore[index] - assert provider.get_value("deep_int", TAny, "deep_pipeline", "deep", "deep", "deep", "deep") == (2137, "deep_pipeline.deep.deep.deep.deep.deep_int") + assert provider.get_value( + "deep_int", TAny, "deep_pipeline", "deep", "deep", "deep", "deep" + ) == (2137, "deep_pipeline.deep.deep.deep.deep.deep_int") # same without the pipeline now = pendulum.now() provider.set_value("deep_date", now, None, "deep", "deep", "deep", "deep") - assert provider.get_value("deep_date", TAny, None, "deep", "deep", "deep", "deep") == (now, "deep.deep.deep.deep.deep_date") + assert provider.get_value("deep_date", TAny, None, "deep", "deep", "deep", "deep") == ( + now, + "deep.deep.deep.deep.deep_date", + ) # in existing path provider.set_value("deep_list", [1, 2, 3], None, "deep", "deep", "deep") - assert provider.get_value("deep_list", TAny, None, "deep", "deep", "deep") == ([1, 2, 3], "deep.deep.deep.deep_list") + assert provider.get_value("deep_list", TAny, None, "deep", "deep", "deep") == ( + [1, 2, 3], + "deep.deep.deep.deep_list", + ) # still there - assert provider.get_value("deep_date", TAny, None, "deep", "deep", "deep", "deep") == (now, "deep.deep.deep.deep.deep_date") + assert provider.get_value("deep_date", TAny, None, "deep", "deep", "deep", "deep") == ( + now, + "deep.deep.deep.deep.deep_date", + ) # overwrite value provider.set_value("deep_list", [1, 2, 3, 4], None, "deep", "deep", "deep") - assert provider.get_value("deep_list", TAny, None, "deep", "deep", "deep") == ([1, 2, 3, 4], "deep.deep.deep.deep_list") + assert provider.get_value("deep_list", TAny, None, "deep", "deep", "deep") == ( + [1, 2, 3, 4], + "deep.deep.deep.deep_list", + ) # invalid type with pytest.raises(ValueError): provider.set_value("deep_decimal", Decimal("1.2"), None, "deep", "deep", "deep", "deep") @@ -272,24 +332,39 @@ def test_write_value(toml_providers: ConfigProvidersContext) -> None: # write new dict to a new key test_d1 = {"key": "top", "embed": {"inner": "bottom", "inner_2": True}} provider.set_value("deep_dict", test_d1, None, "dict_test") - assert provider.get_value("deep_dict", TAny, None, "dict_test") == (test_d1, "dict_test.deep_dict") + assert provider.get_value("deep_dict", TAny, None, "dict_test") == ( + test_d1, + "dict_test.deep_dict", + ) # write same dict over dict provider.set_value("deep_dict", test_d1, None, "dict_test") - assert provider.get_value("deep_dict", TAny, None, "dict_test") == (test_d1, "dict_test.deep_dict") + assert provider.get_value("deep_dict", TAny, None, "dict_test") == ( + test_d1, + "dict_test.deep_dict", + ) # get a fragment - assert provider.get_value("inner_2", TAny, None, "dict_test", "deep_dict", "embed") == (True, "dict_test.deep_dict.embed.inner_2") + assert provider.get_value("inner_2", TAny, None, "dict_test", "deep_dict", "embed") == ( + True, + "dict_test.deep_dict.embed.inner_2", + ) # write a dict over non dict provider.set_value("deep_list", test_d1, None, 
"deep", "deep", "deep") - assert provider.get_value("deep_list", TAny, None, "deep", "deep", "deep") == (test_d1, "deep.deep.deep.deep_list") + assert provider.get_value("deep_list", TAny, None, "deep", "deep", "deep") == ( + test_d1, + "deep.deep.deep.deep_list", + ) # merge dicts test_d2 = {"key": "_top", "key2": "new2", "embed": {"inner": "_bottom", "inner_3": 2121}} provider.set_value("deep_dict", test_d2, None, "dict_test") test_m_d1_d2 = { "key": "_top", "embed": {"inner": "_bottom", "inner_2": True, "inner_3": 2121}, - "key2": "new2" + "key2": "new2", } - assert provider.get_value("deep_dict", TAny, None, "dict_test") == (test_m_d1_d2, "dict_test.deep_dict") + assert provider.get_value("deep_dict", TAny, None, "dict_test") == ( + test_m_d1_d2, + "dict_test.deep_dict", + ) # print(provider.get_value("deep_dict", Any, None, "dict_test")) # write configuration @@ -355,7 +430,6 @@ def test_write_toml_value(toml_providers: ConfigProvidersContext) -> None: def test_toml_string_provider() -> None: - # test basic reading provider = StringTomlProvider(""" [section1.subsection] @@ -365,8 +439,8 @@ def test_toml_string_provider() -> None: key2 = "value2" """) - assert provider.get_value("key1", "", "section1", "subsection") == ("value1", "section1.subsection.key1") # type: ignore[arg-type] - assert provider.get_value("key2", "", "section2", "subsection") == ("value2", "section2.subsection.key2") # type: ignore[arg-type] + assert provider.get_value("key1", "", "section1", "subsection") == ("value1", "section1.subsection.key1") # type: ignore[arg-type] + assert provider.get_value("key2", "", "section2", "subsection") == ("value2", "section2.subsection.key2") # type: ignore[arg-type] # test basic writing provider = StringTomlProvider("") diff --git a/tests/common/configuration/utils.py b/tests/common/configuration/utils.py index f0df420c45..73643561dc 100644 --- a/tests/common/configuration/utils.py +++ b/tests/common/configuration/utils.py @@ -1,13 +1,30 @@ import pytest from os import environ import datetime # noqa: I251 -from typing import Any, Iterator, List, Optional, Tuple, Type, Dict, MutableMapping, Optional, Sequence, TYPE_CHECKING +from typing import ( + Any, + Iterator, + List, + Optional, + Tuple, + Type, + Dict, + MutableMapping, + Optional, + Sequence, + TYPE_CHECKING, +) from dlt.common import Decimal, pendulum from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.container import Container -from dlt.common.configuration.providers import ConfigProvider, EnvironProvider, ConfigTomlProvider, SecretsTomlProvider +from dlt.common.configuration.providers import ( + ConfigProvider, + EnvironProvider, + ConfigTomlProvider, + SecretsTomlProvider, +) from dlt.common.configuration.utils import get_resolved_traces from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.typing import TSecretValue, StrAny @@ -64,8 +81,8 @@ class SectionedConfiguration(BaseConfiguration): password: str = None if TYPE_CHECKING: - def __init__(self, password: str = None) -> None: - ... + + def __init__(self, password: str = None) -> None: ... @pytest.fixture(scope="function") @@ -115,7 +132,6 @@ def toml_providers() -> Iterator[ConfigProvidersContext]: class MockProvider(ConfigProvider): - def __init__(self) -> None: self.value: Any = None self.return_value_on: Tuple[str, ...] 
= () @@ -125,9 +141,11 @@ def reset_stats(self) -> None: self.last_section: Tuple[str, ...] = None self.last_sections: List[Tuple[str, ...]] = [] - def get_value(self, key: str, hint: Type[Any], pipeline_name: str, *sections: str) -> Tuple[Optional[Any], str]: + def get_value( + self, key: str, hint: Type[Any], pipeline_name: str, *sections: str + ) -> Tuple[Optional[Any], str]: if pipeline_name: - sections = (pipeline_name, ) + sections + sections = (pipeline_name,) + sections self.last_section = sections self.last_sections.append(sections) if sections == self.return_value_on: @@ -156,27 +174,21 @@ def supports_secrets(self) -> bool: COERCIONS = { - 'str_val': 'test string', - 'int_val': 12345, - 'bool_val': True, - 'list_val': [1, "2", [3]], - 'dict_val': { - 'a': 1, - "b": "2" - }, - 'bytes_val': b'Hello World!', - 'float_val': 1.18927, + "str_val": "test string", + "int_val": 12345, + "bool_val": True, + "list_val": [1, "2", [3]], + "dict_val": {"a": 1, "b": "2"}, + "bytes_val": b"Hello World!", + "float_val": 1.18927, "tuple_val": (1, 2, {"1": "complicated dicts allowed in literal eval"}), - 'any_val': "function() {}", - 'none_val': "none", - 'COMPLEX_VAL': { - "_": [1440, ["*"], []], - "change-email": [560, ["*"], []] - }, + "any_val": "function() {}", + "none_val": "none", + "COMPLEX_VAL": {"_": [1440, ["*"], []], "change-email": [560, ["*"], []]}, "date_val": pendulum.now(), "dec_val": Decimal("22.38"), "sequence_val": ["A", "B", "KAPPA"], "gen_list_val": ["C", "Z", "N"], "mapping_val": {"FL": 1, "FR": {"1": 2}}, - "mutable_mapping_val": {"str": "str"} + "mutable_mapping_val": {"str": "str"}, } diff --git a/tests/common/data_writers/test_buffered_writer.py b/tests/common/data_writers/test_buffered_writer.py index c275f22b2b..5832341fb2 100644 --- a/tests/common/data_writers/test_buffered_writer.py +++ b/tests/common/data_writers/test_buffered_writer.py @@ -21,9 +21,10 @@ def test_write_no_item() -> None: assert writer.closed_files == [] -@pytest.mark.parametrize("disable_compression", [True, False], ids=["no_compression", "compression"]) +@pytest.mark.parametrize( + "disable_compression", [True, False], ids=["no_compression", "compression"] +) def test_rotation_on_schema_change(disable_compression: bool) -> None: - c1 = new_column("col1", "bigint") c2 = new_column("col2", "bigint") c3 = new_column("col3", "text") @@ -36,7 +37,7 @@ def c1_doc(count: int) -> Iterator[DictStrAny]: return map(lambda x: {"col1": x}, range(0, count)) def c2_doc(count: int) -> Iterator[DictStrAny]: - return map(lambda x: {"col1": x, "col2": x*2+1}, range(0, count)) + return map(lambda x: {"col1": x, "col2": x * 2 + 1}, range(0, count)) def c3_doc(count: int) -> Iterator[DictStrAny]: return map(lambda x: {"col3": "col3_value"}, range(0, count)) @@ -112,7 +113,9 @@ def c3_doc(count: int) -> Iterator[DictStrAny]: assert "(col3_value" in content[-1] -@pytest.mark.parametrize("disable_compression", [True, False], ids=["no_compression", "compression"]) +@pytest.mark.parametrize( + "disable_compression", [True, False], ids=["no_compression", "compression"] +) def test_NO_rotation_on_schema_change(disable_compression: bool) -> None: c1 = new_column("col1", "bigint") c2 = new_column("col2", "bigint") @@ -124,7 +127,7 @@ def c1_doc(count: int) -> Iterator[DictStrAny]: return map(lambda x: {"col1": x}, range(0, count)) def c2_doc(count: int) -> Iterator[DictStrAny]: - return map(lambda x: {"col1": x, "col2": x*2+1}, range(0, count)) + return map(lambda x: {"col1": x, "col2": x * 2 + 1}, range(0, count)) # change 
schema before file first flush with get_writer(_format="jsonl", disable_compression=disable_compression) as writer: @@ -142,7 +145,9 @@ def c2_doc(count: int) -> Iterator[DictStrAny]: assert content[-1] == '{"col1":1,"col2":3}\n' -@pytest.mark.parametrize("disable_compression", [True, False], ids=["no_compression", "compression"]) +@pytest.mark.parametrize( + "disable_compression", [True, False], ids=["no_compression", "compression"] +) def test_writer_requiring_schema(disable_compression: bool) -> None: # assertion on flushing with pytest.raises(AssertionError): @@ -156,8 +161,10 @@ def test_writer_requiring_schema(disable_compression: bool) -> None: writer.write_data_item([{"col1": 1}], t1) -@pytest.mark.parametrize("disable_compression", [True, False], ids=["no_compression", "compression"]) +@pytest.mark.parametrize( + "disable_compression", [True, False], ids=["no_compression", "compression"] +) def test_writer_optional_schema(disable_compression: bool) -> None: with get_writer(_format="jsonl", disable_compression=disable_compression) as writer: - writer.write_data_item([{"col1": 1}], None) - writer.write_data_item([{"col1": 1}], None) + writer.write_data_item([{"col1": 1}], None) + writer.write_data_item([{"col1": 1}], None) diff --git a/tests/common/data_writers/test_data_writers.py b/tests/common/data_writers/test_data_writers.py index 9d655bc4db..973cba4b05 100644 --- a/tests/common/data_writers/test_data_writers.py +++ b/tests/common/data_writers/test_data_writers.py @@ -4,10 +4,22 @@ from dlt.common import pendulum, json from dlt.common.typing import AnyFun + # from dlt.destinations.postgres import capabilities from dlt.destinations.impl.redshift import capabilities as redshift_caps -from dlt.common.data_writers.escape import escape_redshift_identifier, escape_bigquery_identifier, escape_redshift_literal, escape_postgres_literal, escape_duckdb_literal -from dlt.common.data_writers.writers import DataWriter, InsertValuesWriter, JsonlWriter, ParquetDataWriter +from dlt.common.data_writers.escape import ( + escape_redshift_identifier, + escape_bigquery_identifier, + escape_redshift_literal, + escape_postgres_literal, + escape_duckdb_literal, +) +from dlt.common.data_writers.writers import ( + DataWriter, + InsertValuesWriter, + JsonlWriter, + ParquetDataWriter, +) from tests.common.utils import load_json_case, row_to_column_schemas @@ -21,6 +33,7 @@ class _StringIOWriter(DataWriter): class _BytesIOWriter(DataWriter): _f: io.BytesIO + @pytest.fixture def insert_writer() -> Iterator[DataWriter]: with io.StringIO() as f: @@ -48,7 +61,7 @@ def test_simple_jsonl_writer(jsonl_writer: _BytesIOWriter) -> None: jsonl_writer.write_all(None, rows) # remove b'' at the end lines = jsonl_writer._f.getvalue().split(b"\n") - assert lines[-1] == b'' + assert lines[-1] == b"" assert len(lines) == 3 @@ -93,13 +106,22 @@ def test_string_literal_escape() -> None: assert escape_redshift_literal(", NULL'); DROP TABLE --") == "', NULL''); DROP TABLE --'" assert escape_redshift_literal(", NULL');\n DROP TABLE --") == "', NULL'');\\n DROP TABLE --'" assert escape_redshift_literal(", NULL);\n DROP TABLE --") == "', NULL);\\n DROP TABLE --'" - assert escape_redshift_literal(", NULL);\\n DROP TABLE --\\") == "', NULL);\\\\n DROP TABLE --\\\\'" + assert ( + escape_redshift_literal(", NULL);\\n DROP TABLE --\\") + == "', NULL);\\\\n DROP TABLE --\\\\'" + ) # assert escape_redshift_literal(b'hello_word') == "\\x68656c6c6f5f776f7264" @pytest.mark.parametrize("escaper", ALL_LITERAL_ESCAPE) def 
test_string_complex_escape(escaper: AnyFun) -> None: - doc = {"complex":[1,2,3,"a"], "link": "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\0xA \0x0 \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085"} + doc = { + "complex": [1, 2, 3, "a"], + "link": ( + "?commen\ntU\nrn=urn%3Ali%3Acomment%3A%28acti\0xA \0x0" + " \\vity%3A69'08444473\n\n551163392%2C6n \r \x8e9085" + ), + } escaped = escaper(doc) # should be same as string escape if escaper == escape_redshift_literal: @@ -109,16 +131,28 @@ def test_string_complex_escape(escaper: AnyFun) -> None: def test_identifier_escape() -> None: - assert escape_redshift_identifier(", NULL'); DROP TABLE\" -\\-") == '", NULL\'); DROP TABLE"" -\\\\-"' + assert ( + escape_redshift_identifier(", NULL'); DROP TABLE\" -\\-") + == '", NULL\'); DROP TABLE"" -\\\\-"' + ) def test_identifier_escape_bigquery() -> None: - assert escape_bigquery_identifier(", NULL'); DROP TABLE\"` -\\-") == '`, NULL\'); DROP TABLE"\\` -\\\\-`' + assert ( + escape_bigquery_identifier(", NULL'); DROP TABLE\"` -\\-") + == "`, NULL'); DROP TABLE\"\\` -\\\\-`" + ) def test_string_literal_escape_unicode() -> None: # test on some unicode characters assert escape_redshift_literal(", NULL);\n DROP TABLE --") == "', NULL);\\n DROP TABLE --'" - assert escape_redshift_literal("イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム") == "'イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム'" - assert escape_redshift_identifier("ąćł\"") == '"ąćł"""' - assert escape_redshift_identifier("イロハニホヘト チリヌルヲ \"ワカヨタレソ ツネナラム") == '"イロハニホヘト チリヌルヲ ""ワカヨタレソ ツネナラム"' + assert ( + escape_redshift_literal("イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム") + == "'イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム'" + ) + assert escape_redshift_identifier('ąćł"') == '"ąćł"""' + assert ( + escape_redshift_identifier('イロハニホヘト チリヌルヲ "ワカヨタレソ ツネナラム') + == '"イロハニホヘト チリヌルヲ ""ワカヨタレソ ツネナラム"' + ) diff --git a/tests/common/data_writers/utils.py b/tests/common/data_writers/utils.py index e1a071903f..eddc90acf5 100644 --- a/tests/common/data_writers/utils.py +++ b/tests/common/data_writers/utils.py @@ -7,11 +7,27 @@ from tests.utils import TEST_STORAGE_ROOT -ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = {"insert_values", "jsonl", "parquet", "arrow", "puae-jsonl"} +ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = { + "insert_values", + "jsonl", + "parquet", + "arrow", + "puae-jsonl", +} -def get_writer(_format: TLoaderFileFormat = "insert_values", buffer_max_items: int = 10, disable_compression: bool = False) -> BufferedDataWriter[DataWriter]: +def get_writer( + _format: TLoaderFileFormat = "insert_values", + buffer_max_items: int = 10, + disable_compression: bool = False, +) -> BufferedDataWriter[DataWriter]: caps = DestinationCapabilitiesContext.generic_capabilities() caps.preferred_loader_file_format = _format file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") - return BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, disable_compression=disable_compression, _caps=caps) + return BufferedDataWriter( + _format, + file_template, + buffer_max_items=buffer_max_items, + disable_compression=disable_compression, + _caps=caps, + ) diff --git a/tests/common/normalizers/custom_normalizers.py b/tests/common/normalizers/custom_normalizers.py index 8e24ffab5a..3ae65c8b53 100644 --- a/tests/common/normalizers/custom_normalizers.py +++ b/tests/common/normalizers/custom_normalizers.py @@ -5,7 +5,6 @@ class NamingConvention(SnakeCaseNamingConvention): - def normalize_identifier(self, identifier: str) -> str: if identifier.startswith("column_"): return identifier @@ -13,12 +12,12 @@ def 
normalize_identifier(self, identifier: str) -> str: class DataItemNormalizer(RelationalNormalizer): - def extend_schema(self) -> None: json_config = self.schema._normalizers_config["json"]["config"] d_h = self.schema._settings.setdefault("default_hints", {}) d_h["not_null"] = json_config["not_null"] - - def normalize_data_item(self, source_event: TDataItem, load_id: str, table_name) -> TNormalizedRowIterator: + def normalize_data_item( + self, source_event: TDataItem, load_id: str, table_name + ) -> TNormalizedRowIterator: yield (table_name, None), source_event diff --git a/tests/common/normalizers/test_import_normalizers.py b/tests/common/normalizers/test_import_normalizers.py index ea5842f206..df6b973943 100644 --- a/tests/common/normalizers/test_import_normalizers.py +++ b/tests/common/normalizers/test_import_normalizers.py @@ -10,26 +10,28 @@ from dlt.common.normalizers.naming import direct from dlt.common.normalizers.naming.exceptions import InvalidNamingModule, UnknownNamingModule -from tests.common.normalizers.custom_normalizers import DataItemNormalizer as CustomRelationalNormalizer +from tests.common.normalizers.custom_normalizers import ( + DataItemNormalizer as CustomRelationalNormalizer, +) from tests.utils import preserve_environ def test_default_normalizers() -> None: config = explicit_normalizers() - assert config['names'] is None - assert config['json'] is None + assert config["names"] is None + assert config["json"] is None # pass explicit config = explicit_normalizers("direct", {"module": "custom"}) - assert config['names'] == "direct" - assert config['json'] == {"module": "custom"} + assert config["names"] == "direct" + assert config["json"] == {"module": "custom"} # use environ os.environ["SCHEMA__NAMING"] = "direct" os.environ["SCHEMA__JSON_NORMALIZER"] = '{"module": "custom"}' config = explicit_normalizers() - assert config['names'] == "direct" - assert config['json'] == {"module": "custom"} + assert config["names"] == "direct" + assert config["json"] == {"module": "custom"} def test_default_normalizers_with_caps() -> None: @@ -38,8 +40,7 @@ def test_default_normalizers_with_caps() -> None: destination_caps.naming_convention = "direct" with Container().injectable_context(destination_caps): config = explicit_normalizers() - assert config['names'] == "direct" - + assert config["names"] == "direct" def test_import_normalizers() -> None: @@ -52,7 +53,9 @@ def test_import_normalizers() -> None: assert config["json"] == {"module": "dlt.common.normalizers.json.relational"} os.environ["SCHEMA__NAMING"] = "direct" - os.environ["SCHEMA__JSON_NORMALIZER"] = '{"module": "tests.common.normalizers.custom_normalizers"}' + os.environ["SCHEMA__JSON_NORMALIZER"] = ( + '{"module": "tests.common.normalizers.custom_normalizers"}' + ) config, naming, json_normalizer = import_normalizers(explicit_normalizers()) assert config["names"] == "direct" assert config["json"] == {"module": "tests.common.normalizers.custom_normalizers"} diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 91b5a93466..502ce619dd 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -7,11 +7,18 @@ from dlt.common.schema import Schema, TTableSchema from dlt.common.schema.utils import new_table -from dlt.common.normalizers.json.relational import RelationalNormalizerConfigPropagation, DataItemNormalizer as RelationalNormalizer, DLT_ID_LENGTH_BYTES, TDataItemRow +from 
dlt.common.normalizers.json.relational import ( + RelationalNormalizerConfigPropagation, + DataItemNormalizer as RelationalNormalizer, + DLT_ID_LENGTH_BYTES, + TDataItemRow, +) + # _flatten, _get_child_row_hash, _normalize_row, normalize_data_item, from tests.utils import create_schema_with_name + @pytest.fixture def norm() -> RelationalNormalizer: return Schema("default").data_item_normalizer # type: ignore[return-value] @@ -21,15 +28,7 @@ def test_flatten_fix_field_name(norm: RelationalNormalizer) -> None: row = { "f-1": "! 30", "f 2": [], - "f!3": { - "f4": "a", - "f-5": "b", - "f*6": { - "c": 7, - "c v": 8, - "c x": [] - } - } + "f!3": {"f4": "a", "f-5": "b", "f*6": {"c": 7, "c v": 8, "c x": []}}, } flattened_row, lists = norm._flatten("mock_table", row, 0) # type: ignore[arg-type] assert "f_1" in flattened_row @@ -41,29 +40,33 @@ def test_flatten_fix_field_name(norm: RelationalNormalizer) -> None: # assert "f_3__f_6__c_x" in flattened_row assert "f_3" not in flattened_row - assert ("f_2", ) in lists - assert ("f_3", "fx6", "c_x", ) in lists + assert ("f_2",) in lists + assert ( + "f_3", + "fx6", + "c_x", + ) in lists def test_preserve_complex_value(norm: RelationalNormalizer) -> None: # add table with complex column norm.schema.update_table( - new_table("with_complex", - columns = [{ - "name": "value", - "data_type": "complex", - "nullable": "true" # type: ignore[typeddict-item] - }]) + new_table( + "with_complex", + columns=[ + { + "name": "value", + "data_type": "complex", + "nullable": "true", # type: ignore[typeddict-item] + } + ], + ) ) - row_1 = { - "value": 1 - } + row_1 = {"value": 1} flattened_row, _ = norm._flatten("with_complex", row_1, 0) # type: ignore[arg-type] assert flattened_row["value"] == 1 # type: ignore[typeddict-item] - row_2 = { - "value": {"complex": True} - } + row_2 = {"value": {"complex": True}} flattened_row, _ = norm._flatten("with_complex", row_2, 0) # type: ignore[arg-type] assert flattened_row["value"] == row_2["value"] # type: ignore[typeddict-item] # complex value is not flattened @@ -75,15 +78,11 @@ def test_preserve_complex_value_with_hint(norm: RelationalNormalizer) -> None: norm.schema._settings.setdefault("preferred_types", {})[TSimpleRegex("re:^value$")] = "complex" norm.schema._compile_settings() - row_1 = { - "value": 1 - } + row_1 = {"value": 1} flattened_row, _ = norm._flatten("any_table", row_1, 0) # type: ignore[arg-type] assert flattened_row["value"] == 1 # type: ignore[typeddict-item] - row_2 = { - "value": {"complex": True} - } + row_2 = {"value": {"complex": True}} flattened_row, _ = norm._flatten("any_table", row_2, 0) # type: ignore[arg-type] assert flattened_row["value"] == row_2["value"] # type: ignore[typeddict-item] # complex value is not flattened @@ -91,17 +90,11 @@ def test_preserve_complex_value_with_hint(norm: RelationalNormalizer) -> None: def test_child_table_linking(norm: RelationalNormalizer) -> None: - row = { - "f": [{ - "l": ["a", "b", "c"], - "v": 120, - "o": [{"a": 1}, {"a": 2}] - }] - } + row = {"f": [{"l": ["a", "b", "c"], "v": 120, "o": [{"a": 1}, {"a": 2}]}]} # request _dlt_root_id propagation add_dlt_root_id_propagation(norm) - rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] # should have 7 entries (root + level 1 + 3 * list + 2 * object) assert len(rows) == 7 # root elem will not have a root hash if not explicitly added, "extend" is added only to child @@ -144,17 +137,12 @@ def 
test_child_table_linking(norm: RelationalNormalizer) -> None: def test_child_table_linking_primary_key(norm: RelationalNormalizer) -> None: row = { "id": "level0", - "f": [{ - "id": "level1", - "l": ["a", "b", "c"], - "v": 120, - "o": [{"a": 1}, {"a": 2}] - }] + "f": [{"id": "level1", "l": ["a", "b", "c"], "v": 120, "o": [{"a": 1}, {"a": 2}]}], } norm.schema.merge_hints({"primary_key": [TSimpleRegex("id")]}) norm.schema._compile_settings() - rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] root = next(t for t in rows if t[0][0] == "table")[1] # record hash is random for primary keys, not based on their content # this is a change introduced in dlt 0.2.0a30 @@ -169,7 +157,9 @@ def test_child_table_linking_primary_key(norm: RelationalNormalizer) -> None: assert "_dlt_root_id" not in t_f list_rows = [t for t in rows if t[0][0] == "table__f__l"] - assert all(e[1]["_dlt_parent_id"] != digest128("level1", DLT_ID_LENGTH_BYTES) for e in list_rows) + assert all( + e[1]["_dlt_parent_id"] != digest128("level1", DLT_ID_LENGTH_BYTES) for e in list_rows + ) assert all(r[0][1] == "table__f" for r in list_rows) obj_rows = [t for t in rows if t[0][0] == "table__f__o"] assert all(e[1]["_dlt_parent_id"] != digest128("level1", DLT_ID_LENGTH_BYTES) for e in obj_rows) @@ -179,50 +169,56 @@ def test_child_table_linking_primary_key(norm: RelationalNormalizer) -> None: def test_yields_parents_first(norm: RelationalNormalizer) -> None: row = { "id": "level0", - "f": [{ - "id": "level1", - "l": ["a", "b", "c"], - "v": 120, - "o": [{"a": 1}, {"a": 2}] - }], - "g": [{ - "id": "level2_g", - "l": ["a"] - }] + "f": [{"id": "level1", "l": ["a", "b", "c"], "v": 120, "o": [{"a": 1}, {"a": 2}]}], + "g": [{"id": "level2_g", "l": ["a"]}], } - rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] tables = list(r[0][0] for r in rows) # child tables are always yielded before parent tables - expected_tables = ['table', 'table__f', 'table__f__l', 'table__f__l', 'table__f__l', 'table__f__o', 'table__f__o', 'table__g', 'table__g__l'] + expected_tables = [ + "table", + "table__f", + "table__f__l", + "table__f__l", + "table__f__l", + "table__f__o", + "table__f__o", + "table__g", + "table__g__l", + ] assert expected_tables == tables def test_yields_parent_relation(norm: RelationalNormalizer) -> None: row = { "id": "level0", - "f": [{ - "id": "level1", - "l": ["a"], - "o": [{"a": 1}], - "b": { - "a": [ {"id": "level5"}], + "f": [ + { + "id": "level1", + "l": ["a"], + "o": [{"a": 1}], + "b": { + "a": [{"id": "level5"}], + }, } - }], + ], "d": { - "a": [ {"id": "level4"}], + "a": [{"id": "level4"}], "b": { - "a": [ {"id": "level5"}], + "a": [{"id": "level5"}], }, - "c": "x" + "c": "x", }, - "e": [{ - "o": [{"a": 1}], - "b": { - "a": [ {"id": "level5"}], + "e": [ + { + "o": [{"a": 1}], + "b": { + "a": [{"id": "level5"}], + }, } - }] + ], } - rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] # normalizer must return parent table first and move in order of the list elements when yielding child tables # the yielding order if fully defined expected_parents = [ @@ -238,7 +234,7 @@ def test_yields_parent_relation(norm: RelationalNormalizer) -> None: # table__e is yielded it however only contains linking information 
("table__e", "table"), ("table__e__o", "table__e"), - ("table__e__b__a", "table__e") + ("table__e__b__a", "table__e"), ] parents = list(r[0] for r in rows) assert parents == expected_parents @@ -281,13 +277,9 @@ def test_yields_parent_relation(norm: RelationalNormalizer) -> None: def test_list_position(norm: RelationalNormalizer) -> None: row: StrAny = { - "f": [{ - "l": ["a", "b", "c"], - "v": 120, - "lo": [{"e": "a"}, {"e": "b"}, {"e":"c"}] - }] + "f": [{"l": ["a", "b", "c"], "v": 120, "lo": [{"e": "a"}, {"e": "b"}, {"e": "c"}]}] } - rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] # root has no pos root = [t for t in rows if t[0][0] == "table"][0][1] assert "_dlt_list_idx" not in root @@ -323,12 +315,8 @@ def test_list_position(norm: RelationalNormalizer) -> None: def test_control_descending(norm: RelationalNormalizer) -> None: row: StrAny = { - "f": [{ - "l": ["a", "b", "c"], - "v": 120, - "lo": [[{"e": "a"}, {"e": "b"}, {"e":"c"}]] - }], - "g": "val" + "f": [{"l": ["a", "b", "c"], "v": 120, "lo": [[{"e": "a"}, {"e": "b"}, {"e": "c"}]]}], + "g": "val", } # break at first row @@ -379,26 +367,23 @@ def test_list_in_list() -> None: "_dlt_id": "123456", "created_at": "2023-05-12T12:34:56Z", "ended_at": "2023-05-12T13:14:32Z", - "webpath": [[ - { - "url": "https://www.website.com/", - "timestamp": "2023-05-12T12:35:01Z" - }, - { - "url": "https://www.website.com/products", - "timestamp": "2023-05-12T12:38:45Z" - }, + "webpath": [ + [ + {"url": "https://www.website.com/", "timestamp": "2023-05-12T12:35:01Z"}, + {"url": "https://www.website.com/products", "timestamp": "2023-05-12T12:38:45Z"}, { "url": "https://www.website.com/products/item123", - "timestamp": "2023-05-12T12:42:22Z" + "timestamp": "2023-05-12T12:42:22Z", }, - [{ - "url": "https://www.website.com/products/item1234", - "timestamp": "2023-05-12T12:42:22Z" - }] + [ + { + "url": "https://www.website.com/products/item1234", + "timestamp": "2023-05-12T12:42:22Z", + } + ], ], - [1, 2, 3] - ] + [1, 2, 3], + ], } schema = create_schema_with_name("other") # root @@ -408,12 +393,12 @@ def test_list_in_list() -> None: zen__webpath = [row for row in rows if row[0][0] == "zen__webpath"] # two rows in web__zenpath for two lists assert len(zen__webpath) == 2 - assert zen__webpath[0][0] == ('zen__webpath', 'zen') + assert zen__webpath[0][0] == ("zen__webpath", "zen") # _dlt_id was hardcoded in the original row assert zen__webpath[0][1]["_dlt_parent_id"] == "123456" - assert zen__webpath[0][1]['_dlt_list_idx'] == 0 - assert zen__webpath[1][1]['_dlt_list_idx'] == 1 - assert zen__webpath[1][0] == ('zen__webpath', 'zen') + assert zen__webpath[0][1]["_dlt_list_idx"] == 0 + assert zen__webpath[1][1]["_dlt_list_idx"] == 1 + assert zen__webpath[1][0] == ("zen__webpath", "zen") # inner lists zen__webpath__list = [row for row in rows if row[0][0] == "zen__webpath__list"] # actually both list of objects and list of number will be in the same table @@ -427,7 +412,9 @@ def test_list_in_list() -> None: zen_table = new_table("zen") schema.update_table(zen_table) - path_table = new_table("zen__webpath", parent_table_name="zen", columns=[{"name": "list", "data_type": "complex"}]) + path_table = new_table( + "zen__webpath", parent_table_name="zen", columns=[{"name": "list", "data_type": "complex"}] + ) schema.update_table(path_table) rows = list(schema.normalize_data_item(chats, "1762162.1212", "zen")) # both lists are complex types now @@ -441,13 
+428,9 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None: # directly set record hash so it will be adopted in normalizer as top level hash row = { "_dlt_id": row_id, - "f": [{ - "l": ["a", "b", "c"], - "v": 120, - "lo": [{"e": "a"}, {"e": "b"}, {"e":"c"}] - }] + "f": [{"l": ["a", "b", "c"], "v": 120, "lo": [{"e": "a"}, {"e": "b"}, {"e": "c"}]}], } - rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] children = [t for t in rows if t[0][0] != "table"] # all hashes must be different distinct_hashes = set([ch[1]["_dlt_id"] for ch in children]) @@ -455,7 +438,9 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None: # compute hashes for all children for (table, _), ch in children: - expected_hash = digest128(f"{ch['_dlt_parent_id']}_{table}_{ch['_dlt_list_idx']}", DLT_ID_LENGTH_BYTES) + expected_hash = digest128( + f"{ch['_dlt_parent_id']}_{table}_{ch['_dlt_list_idx']}", DLT_ID_LENGTH_BYTES + ) assert ch["_dlt_id"] == expected_hash # direct compute one of the @@ -464,54 +449,64 @@ def test_child_row_deterministic_hash(norm: RelationalNormalizer) -> None: assert f_lo_p2["_dlt_id"] == digest128(f"{el_f['_dlt_id']}_table__f__lo_2", DLT_ID_LENGTH_BYTES) # same data with same table and row_id - rows_2 = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows_2 = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] children_2 = [t for t in rows_2 if t[0][0] != "table"] # corresponding hashes must be identical assert all(ch[0][1]["_dlt_id"] == ch[1][1]["_dlt_id"] for ch in zip(children, children_2)) # change parent table and all child hashes must be different - rows_4 = list(norm._normalize_row(row, {}, ("other_table", ))) # type: ignore[arg-type] + rows_4 = list(norm._normalize_row(row, {}, ("other_table",))) # type: ignore[arg-type] children_4 = [t for t in rows_4 if t[0][0] != "other_table"] assert all(ch[0][1]["_dlt_id"] != ch[1][1]["_dlt_id"] for ch in zip(children, children_4)) # change parent hash and all child hashes must be different row["_dlt_id"] = uniq_id() - rows_3 = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + rows_3 = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] children_3 = [t for t in rows_3 if t[0][0] != "table"] assert all(ch[0][1]["_dlt_id"] != ch[1][1]["_dlt_id"] for ch in zip(children, children_3)) def test_keeps_dlt_id(norm: RelationalNormalizer) -> None: h = uniq_id() - row = { - "a": "b", - "_dlt_id": h - } - rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + row = {"a": "b", "_dlt_id": h} + rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] root = [t for t in rows if t[0][0] == "table"][0][1] assert root["_dlt_id"] == h def test_propagate_hardcoded_context(norm: RelationalNormalizer) -> None: row = {"level": 1, "list": ["a", "b", "c"], "comp": [{"_timestamp": "a"}]} - rows = list(norm._normalize_row(row, {"_timestamp": 1238.9, "_dist_key": "SENDER_3000"}, ("table", ))) # type: ignore[arg-type] + rows = list(norm._normalize_row(row, {"_timestamp": 1238.9, "_dist_key": "SENDER_3000"}, ("table",))) # type: ignore[arg-type] # context is not added to root element root = next(t for t in rows if t[0][0] == "table")[1] assert "_timestamp" in root assert "_dist_key" in root # the original _timestamp field will be overwritten in children and added to lists - assert 
all(e[1]["_timestamp"] == 1238.9 and e[1]["_dist_key"] == "SENDER_3000" for e in rows if e[0][0] != "table") + assert all( + e[1]["_timestamp"] == 1238.9 and e[1]["_dist_key"] == "SENDER_3000" + for e in rows + if e[0][0] != "table" + ) def test_propagates_root_context(norm: RelationalNormalizer) -> None: add_dlt_root_id_propagation(norm) # add timestamp propagation - norm.schema._normalizers_config["json"]["config"]["propagation"]["root"]["timestamp"] = "_partition_ts" + norm.schema._normalizers_config["json"]["config"]["propagation"]["root"][ + "timestamp" + ] = "_partition_ts" # add propagation for non existing element - norm.schema._normalizers_config["json"]["config"]["propagation"]["root"]["__not_found"] = "__not_found" + norm.schema._normalizers_config["json"]["config"]["propagation"]["root"][ + "__not_found" + ] = "__not_found" - row = {"_dlt_id": "###", "timestamp": 12918291.1212, "dependent_list":[1, 2,3], "dependent_objects": [{"vx": "ax"}]} - normalized_rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + row = { + "_dlt_id": "###", + "timestamp": 12918291.1212, + "dependent_list": [1, 2, 3], + "dependent_objects": [{"vx": "ax"}], + } + normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] # all non-root rows must have: non_root = [r for r in normalized_rows if r[0][1] is not None] assert all(r[1]["_dlt_root_id"] == "###" for r in non_root) @@ -520,15 +515,19 @@ def test_propagates_root_context(norm: RelationalNormalizer) -> None: @pytest.mark.parametrize("add_pk,add_dlt_id", [(False, False), (True, False), (True, True)]) -def test_propagates_table_context(norm: RelationalNormalizer, add_pk: bool, add_dlt_id: bool) -> None: +def test_propagates_table_context( + norm: RelationalNormalizer, add_pk: bool, add_dlt_id: bool +) -> None: add_dlt_root_id_propagation(norm) - prop_config: RelationalNormalizerConfigPropagation = norm.schema._normalizers_config["json"]["config"]["propagation"] + prop_config: RelationalNormalizerConfigPropagation = norm.schema._normalizers_config["json"][ + "config" + ]["propagation"] prop_config["root"]["timestamp"] = "_partition_ts" # type: ignore[index] # for table "table__lvl1" request to propagate "vx" and "partition_ovr" as "_partition_ts" (should overwrite root) prop_config["tables"]["table__lvl1"] = { # type: ignore[index] "vx": "__vx", "partition_ovr": "_partition_ts", - "__not_found": "__not_found" + "__not_found": "__not_found", } if add_pk: @@ -536,21 +535,17 @@ def test_propagates_table_context(norm: RelationalNormalizer, add_pk: bool, add_ norm.schema.merge_hints({"primary_key": [TSimpleRegex("vx")]}) row = { - "_dlt_id": "###", - "timestamp": 12918291.1212, - "lvl1": [{ - "vx": "ax", - "partition_ovr": 1283.12, - "lvl2": [{ - "_partition_ts": "overwritten" - }] - }] - } + "_dlt_id": "###", + "timestamp": 12918291.1212, + "lvl1": [ + {"vx": "ax", "partition_ovr": 1283.12, "lvl2": [{"_partition_ts": "overwritten"}]} + ], + } if add_dlt_id: # to reproduce a bug where rows with _dlt_id set were not extended row["lvl1"][0]["_dlt_id"] = "row_id_lvl1" # type: ignore[index] - normalized_rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] non_root = [r for r in normalized_rows if r[0][1] is not None] # _dlt_root_id in all non root assert all(r[1]["_dlt_root_id"] == "###" for r in non_root) @@ -559,21 +554,30 @@ def test_propagates_table_context(norm: 
RelationalNormalizer, add_pk: bool, add_ # _partition_ts == timestamp only at lvl1 assert all(r[1]["_partition_ts"] == 12918291.1212 for r in non_root if r[0][0] == "table__lvl1") # _partition_ts == partition_ovr and __vx only at lvl2 - assert all(r[1]["_partition_ts"] == 1283.12 and r[1]["__vx"] == "ax" for r in non_root if r[0][0] == "table__lvl1__lvl2") - assert any(r[1]["_partition_ts"] == 1283.12 and r[1]["__vx"] == "ax" for r in non_root if r[0][0] != "table__lvl1__lvl2") is False + assert all( + r[1]["_partition_ts"] == 1283.12 and r[1]["__vx"] == "ax" + for r in non_root + if r[0][0] == "table__lvl1__lvl2" + ) + assert ( + any( + r[1]["_partition_ts"] == 1283.12 and r[1]["__vx"] == "ax" + for r in non_root + if r[0][0] != "table__lvl1__lvl2" + ) + is False + ) def test_propagates_table_context_to_lists(norm: RelationalNormalizer) -> None: add_dlt_root_id_propagation(norm) - prop_config: RelationalNormalizerConfigPropagation = norm.schema._normalizers_config["json"]["config"]["propagation"] + prop_config: RelationalNormalizerConfigPropagation = norm.schema._normalizers_config["json"][ + "config" + ]["propagation"] prop_config["root"]["timestamp"] = "_partition_ts" # type: ignore[index] - row = { - "_dlt_id": "###", - "timestamp": 12918291.1212, - "lvl1": [1, 2, 3, [4, 5, 6]] - } - normalized_rows = list(norm._normalize_row(row, {}, ("table", ))) # type: ignore[arg-type] + row = {"_dlt_id": "###", "timestamp": 12918291.1212, "lvl1": [1, 2, 3, [4, 5, 6]]} + normalized_rows = list(norm._normalize_row(row, {}, ("table",))) # type: ignore[arg-type] # _partition_ts == timestamp on all child tables non_root = [r for r in normalized_rows if r[0][1] is not None] assert all(r[1]["_partition_ts"] == 12918291.1212 for r in non_root) @@ -586,7 +590,7 @@ def test_removes_normalized_list(norm: RelationalNormalizer) -> None: # after normalizing the list that got normalized into child table must be deleted row = {"comp": [{"_timestamp": "a"}]} # get iterator - normalized_rows_i = norm._normalize_row(row, {}, ("table", )) # type: ignore[arg-type] + normalized_rows_i = norm._normalize_row(row, {}, ("table",)) # type: ignore[arg-type] # yield just one item root_row = next(normalized_rows_i) # root_row = next(r for r in normalized_rows if r[0][1] is None) @@ -597,17 +601,20 @@ def test_preserves_complex_types_list(norm: RelationalNormalizer) -> None: # the exception to test_removes_normalized_list # complex types should be left as they are # add table with complex column - norm.schema.update_table(new_table("event_slot", - columns = [{ - "name": "value", - "data_type": "complex", - "nullable": "true" # type: ignore[typeddict-item] - }]) + norm.schema.update_table( + new_table( + "event_slot", + columns=[ + { + "name": "value", + "data_type": "complex", + "nullable": "true", # type: ignore[typeddict-item] + } + ], + ) ) - row = { - "value": ["from", {"complex": True}] - } - normalized_rows = list(norm._normalize_row(row, {}, ("event_slot", ))) # type: ignore[arg-type] + row = {"value": ["from", {"complex": True}]} + normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",))) # type: ignore[arg-type] # make sure only 1 row is emitted, the list is not normalized assert len(normalized_rows) == 1 # value is kept in root row -> market as complex @@ -615,10 +622,8 @@ def test_preserves_complex_types_list(norm: RelationalNormalizer) -> None: assert root_row[1]["value"] == row["value"] # same should work for a list - row = { - "value": ["from", ["complex", True]] # type: ignore[list-item] - } - 
normalized_rows = list(norm._normalize_row(row, {}, ("event_slot", ))) # type: ignore[arg-type] + row = {"value": ["from", ["complex", True]]} # type: ignore[list-item] + normalized_rows = list(norm._normalize_row(row, {}, ("event_slot",))) # type: ignore[arg-type] # make sure only 1 row is emitted, the list is not normalized assert len(normalized_rows) == 1 # value is kept in root row -> market as complex @@ -634,7 +639,10 @@ def test_wrap_in_dict(norm: RelationalNormalizer) -> None: # wrap a list rows = list(norm.schema.normalize_data_item([1, 2, 3, 4, "A"], "load_id", "listex")) assert len(rows) == 6 - assert rows[0][0] == ("listex", None,) + assert rows[0][0] == ( + "listex", + None, + ) assert rows[1][0] == ("listex__value", "listex") assert rows[-1][1]["value"] == "A" @@ -644,15 +652,19 @@ def test_complex_types_for_recursion_level(norm: RelationalNormalizer) -> None: # if max recursion depth is set, nested elements will be kept as complex row = { "_dlt_id": "row_id", - "f": [{ - "l": ["a"], # , "b", "c" - "v": 120, - "lo": [{"e": {"v": 1}}] # , {"e": {"v": 2}}, {"e":{"v":3 }} - }] + "f": [ + { + "l": ["a"], # , "b", "c" + "v": 120, + "lo": [{"e": {"v": 1}}], # , {"e": {"v": 2}}, {"e":{"v":3 }} + } + ], } n_rows_nl = list(norm.schema.normalize_data_item(row, "load_id", "default")) # all nested elements were yielded - assert ["default", "default__f", "default__f__l", "default__f__lo"] == [r[0][0] for r in n_rows_nl] + assert ["default", "default__f", "default__f__l", "default__f__lo"] == [ + r[0][0] for r in n_rows_nl + ] # set max nesting to 0 set_max_nesting(norm, 0) @@ -697,12 +709,10 @@ def test_extract_with_table_name_meta() -> None: "flags": 0, "parent_id": None, "guild_id": "815421435900198962", - "permission_overwrites": [] + "permission_overwrites": [], } # force table name - rows = list( - create_schema_with_name("discord").normalize_data_item(row, "load_id", "channel") - ) + rows = list(create_schema_with_name("discord").normalize_data_item(row, "load_id", "channel")) # table is channel assert rows[0][0][0] == "channel" normalized_row = rows[0][1] @@ -729,13 +739,7 @@ def test_parse_with_primary_key() -> None: schema._compile_settings() add_dlt_root_id_propagation(schema.data_item_normalizer) # type: ignore[arg-type] - row = { - "id": "817949077341208606", - "w_id":[{ - "id": 9128918293891111, - "wo_id": [1, 2, 3] - }] - } + row = {"id": "817949077341208606", "w_id": [{"id": 9128918293891111, "wo_id": [1, 2, 3]}]} rows = list(schema.normalize_data_item(row, "load_id", "discord")) # get root root = next(t[1] for t in rows if t[0][0] == "discord") @@ -753,11 +757,15 @@ def test_parse_with_primary_key() -> None: assert "_dlt_root_id" in el_w_id # this must have deterministic child key - f_wo_id = next(t[1] for t in rows if t[0][0] == "discord__w_id__wo_id" and t[1]["_dlt_list_idx"] == 2) + f_wo_id = next( + t[1] for t in rows if t[0][0] == "discord__w_id__wo_id" and t[1]["_dlt_list_idx"] == 2 + ) assert f_wo_id["value"] == 3 assert f_wo_id["_dlt_root_id"] != digest128("817949077341208606", DLT_ID_LENGTH_BYTES) assert f_wo_id["_dlt_parent_id"] != digest128("9128918293891111", DLT_ID_LENGTH_BYTES) - assert f_wo_id["_dlt_id"] == RelationalNormalizer._get_child_row_hash(f_wo_id["_dlt_parent_id"], "discord__w_id__wo_id", 2) + assert f_wo_id["_dlt_id"] == RelationalNormalizer._get_child_row_hash( + f_wo_id["_dlt_parent_id"], "discord__w_id__wo_id", 2 + ) def test_keeps_none_values() -> None: @@ -777,16 +785,10 @@ def test_normalize_and_shorten_deterministically() -> None: data = { 
"short>ident:1": { - "short>ident:2": { - "short>ident:3": "a" - }, + "short>ident:2": {"short>ident:3": "a"}, }, - "LIST+ident:1": { - "LIST+ident:2": { - "LIST+ident:3": [1] - } - }, - "long+long:SO+LONG:_>16": True + "LIST+ident:1": {"LIST+ident:2": {"LIST+ident:3": [1]}}, + "long+long:SO+LONG:_>16": True, } rows = list(schema.normalize_data_item(data, "1762162.1212", "s")) # all identifiers are 16 chars or shorter @@ -800,14 +802,20 @@ def test_normalize_and_shorten_deterministically() -> None: root_data = rows[0][1] root_data_keys = list(root_data.keys()) # "short:ident:2": "a" will be flattened into root - tag = NamingConvention._compute_tag("short_ident_1__short_ident_2__short_ident_3", NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + "short_ident_1__short_ident_2__short_ident_3", NamingConvention._DEFAULT_COLLISION_PROB + ) assert tag in root_data_keys[0] # long:SO+LONG:_>16 shortened on normalized name - tag = NamingConvention._compute_tag("long+long:SO+LONG:_>16", NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + "long+long:SO+LONG:_>16", NamingConvention._DEFAULT_COLLISION_PROB + ) assert tag in root_data_keys[1] # table name in second row table_name = rows[1][0][0] - tag = NamingConvention._compute_tag("s__lis_txident_1__lis_txident_2__lis_txident_3", NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + "s__lis_txident_1__lis_txident_2__lis_txident_3", NamingConvention._DEFAULT_COLLISION_PROB + ) assert tag in table_name @@ -829,7 +837,6 @@ def test_normalize_empty_keys() -> None: # could also be in schema tests def test_propagation_update_on_table_change(norm: RelationalNormalizer): - # append does not have propagated columns table_1 = new_table("table_1", write_disposition="append") norm.schema.update_table(table_1) @@ -838,40 +845,41 @@ def test_propagation_update_on_table_change(norm: RelationalNormalizer): # change table to merge table_1["write_disposition"] = "merge" norm.schema.update_table(table_1) - assert norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"][table_1["name"]] == {'_dlt_id': '_dlt_root_id'} + assert norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + table_1["name"] + ] == {"_dlt_id": "_dlt_root_id"} # add subtable table_2 = new_table("table_2", parent_table_name="table_1") norm.schema.update_table(table_2) - assert "table_2" not in norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"] + assert ( + "table_2" not in norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"] + ) # test merging into existing propagation - norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"]["table_3"] = {'prop1': 'prop2'} + norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"]["table_3"] = { + "prop1": "prop2" + } table_3 = new_table("table_3", write_disposition="merge") norm.schema.update_table(table_3) - assert norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"]["table_3"] == { - '_dlt_id': '_dlt_root_id', - 'prop1': 'prop2' - } - + assert norm.schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "table_3" + ] == {"_dlt_id": "_dlt_root_id", "prop1": "prop2"} def set_max_nesting(norm: RelationalNormalizer, max_nesting: int) -> None: - RelationalNormalizer.update_normalizer_config(norm.schema, - { - "max_nesting": max_nesting - } - ) + RelationalNormalizer.update_normalizer_config(norm.schema, {"max_nesting": 
max_nesting}) norm._reset() def add_dlt_root_id_propagation(norm: RelationalNormalizer) -> None: - RelationalNormalizer.update_normalizer_config(norm.schema, { - "propagation": { - "root": { - "_dlt_id": "_dlt_root_id" # type: ignore[dict-item] - }, - "tables": {} + RelationalNormalizer.update_normalizer_config( + norm.schema, + { + "propagation": { + "root": {"_dlt_id": "_dlt_root_id"}, # type: ignore[dict-item] + "tables": {}, } - }) + }, + ) norm._reset() diff --git a/tests/common/normalizers/test_naming.py b/tests/common/normalizers/test_naming.py index 02ff6e3c38..3bf4762c35 100644 --- a/tests/common/normalizers/test_naming.py +++ b/tests/common/normalizers/test_naming.py @@ -15,11 +15,17 @@ IDENT_20_CHARS = "she played cello well" RAW_IDENT = ".\n'played CELLO🚧_" RAW_IDENT_W_SPACES = f" {RAW_IDENT} \t\n" -RAW_IDENT_2 = "123.\"\rhello😄!" +RAW_IDENT_2 = '123."\rhello😄!' RAW_IDENT_2_W_SPACES = f"\n {RAW_IDENT_2} \t " RAW_PATH = [RAW_IDENT, RAW_IDENT_2_W_SPACES, RAW_IDENT_2, RAW_IDENT_2_W_SPACES] EMPTY_IDENT = " \t\n " -RAW_PATH_WITH_EMPTY_IDENT = [RAW_IDENT, RAW_IDENT_2_W_SPACES, EMPTY_IDENT, RAW_IDENT_2, RAW_IDENT_2_W_SPACES] +RAW_PATH_WITH_EMPTY_IDENT = [ + RAW_IDENT, + RAW_IDENT_2_W_SPACES, + EMPTY_IDENT, + RAW_IDENT_2, + RAW_IDENT_2_W_SPACES, +] def test_tag_collisions() -> None: @@ -29,52 +35,61 @@ def test_tag_collisions() -> None: generations = 100000 collisions = 0 for _ in range(0, generations): - tag = NamingConvention._compute_tag(uniq_id(32), collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + uniq_id(32), collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) if tag in tags: collisions += 1 else: tags[tag] = tag - assert collisions/generations < 0.001 + assert collisions / generations < 0.001 def test_tag_generation() -> None: # is content hash content = 20 * LONG_PATH - content_tag = NamingConvention._compute_tag(content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + content_tag = NamingConvention._compute_tag( + content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) # no randomness for _ in range(0, 20): - tag = NamingConvention._compute_tag(content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) assert content_tag == tag fixture = [ - ('3f17271231504b8cf65690bcdc379df8a3b8aabe12efe1ea82848ec5f497cb69', 'gds0iw'), - ('58e5c351b53ffe1233e0656a532a721ae1d2ac7af71b6cfec8ceb64c63b10721', 'uyboiq'), - ('e3f34629839cedcabba95354e48a78dc80b0cd35c02ddfbbf20196ba7f968866', '51wdcg'), - ('f0f22b8e8c58389a6c21dbcc1e261ee0354704e24996a0ec541276f58d1f2f52', 'bpm7ca'), - ('0d0de95c7c12ceee919d28d22c970285d80a36dea4fe32dbdd667a888ae6d47f', 'doqcuq'), - ('4973509ea648ddfbaf6c50e1fef33c3b0a3d1c1a82dff543a8255e60b6572567', 'cl7rpq'), - ('877c89f0dcbd24b8c3f787624ddca09deb6a44e4a72f12527209d78e4d9ed247', 'xrnycg'), - ('064df58cd3a51c50dbf30e975e63961a501212ff8e8ca544ab396727f4b8a367', 'kgiizq'), - ('c8f7da1b5c44c1ca10da67c1514c4cf365e4d5912685b25a39206d5c8c1966a1', 'dj9zqq'), - ('222d42333592ea87823fd2e7868d59fb0aded20603f433319691918299513cb6', 'futp4w'), - ('757d64eb242a91b494ec9e2661a7946410d68144d33860d6f4154092d65d5009', 'wetlpg'), - ('3c7348d43478292b4c4e0689d41a536fc8ccabdbd9fb9d0dfbe757a83d34cebe', 'avxagg'), - ('6896fac1546c201d4dc91d2c51bdcd9c820fe92fd0555947e59fdc89ca6f045d', 'wbaj3w'), - ('b4def322a4487dd90fcc4abd2f1efde0cdce81d8e0a580fd1897203ab4ebcebe', 'whojmw'), - 
('07d974124b92adafc90473a3968ceb5e8329d815e0e48260473d70a781adb8ae', 'aiqcea'), - ('c67183a762e379290652cc26a786b21eff347643b1cc9012138f460901ce5d53', 'zfztpg'), - ('430976db5adef67d0009aa3cd9a2daca106829b36a7232732c5d694e7197c6d1', 'evr7rq'), - ('c1c8c0ff6933fa4e23fab5605139124b2c6cda0150a412daaea274818ee46e35', 'er0nxq'), - ('0060c538b6ce02b8d8e2c85b4e2810c58b846f4096ed7ab871fc092c45ac09d9', 'zh9xgg'), - ('4d4b99ff5d2a3d5cd076782c9cd088cd85d5c789d7de6bdc19c1d206b687d485', '2vvr5a') + ("3f17271231504b8cf65690bcdc379df8a3b8aabe12efe1ea82848ec5f497cb69", "gds0iw"), + ("58e5c351b53ffe1233e0656a532a721ae1d2ac7af71b6cfec8ceb64c63b10721", "uyboiq"), + ("e3f34629839cedcabba95354e48a78dc80b0cd35c02ddfbbf20196ba7f968866", "51wdcg"), + ("f0f22b8e8c58389a6c21dbcc1e261ee0354704e24996a0ec541276f58d1f2f52", "bpm7ca"), + ("0d0de95c7c12ceee919d28d22c970285d80a36dea4fe32dbdd667a888ae6d47f", "doqcuq"), + ("4973509ea648ddfbaf6c50e1fef33c3b0a3d1c1a82dff543a8255e60b6572567", "cl7rpq"), + ("877c89f0dcbd24b8c3f787624ddca09deb6a44e4a72f12527209d78e4d9ed247", "xrnycg"), + ("064df58cd3a51c50dbf30e975e63961a501212ff8e8ca544ab396727f4b8a367", "kgiizq"), + ("c8f7da1b5c44c1ca10da67c1514c4cf365e4d5912685b25a39206d5c8c1966a1", "dj9zqq"), + ("222d42333592ea87823fd2e7868d59fb0aded20603f433319691918299513cb6", "futp4w"), + ("757d64eb242a91b494ec9e2661a7946410d68144d33860d6f4154092d65d5009", "wetlpg"), + ("3c7348d43478292b4c4e0689d41a536fc8ccabdbd9fb9d0dfbe757a83d34cebe", "avxagg"), + ("6896fac1546c201d4dc91d2c51bdcd9c820fe92fd0555947e59fdc89ca6f045d", "wbaj3w"), + ("b4def322a4487dd90fcc4abd2f1efde0cdce81d8e0a580fd1897203ab4ebcebe", "whojmw"), + ("07d974124b92adafc90473a3968ceb5e8329d815e0e48260473d70a781adb8ae", "aiqcea"), + ("c67183a762e379290652cc26a786b21eff347643b1cc9012138f460901ce5d53", "zfztpg"), + ("430976db5adef67d0009aa3cd9a2daca106829b36a7232732c5d694e7197c6d1", "evr7rq"), + ("c1c8c0ff6933fa4e23fab5605139124b2c6cda0150a412daaea274818ee46e35", "er0nxq"), + ("0060c538b6ce02b8d8e2c85b4e2810c58b846f4096ed7ab871fc092c45ac09d9", "zh9xgg"), + ("4d4b99ff5d2a3d5cd076782c9cd088cd85d5c789d7de6bdc19c1d206b687d485", "2vvr5a"), ] for content, expected_tag in fixture: - tag = NamingConvention._compute_tag(content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) assert len(tag) == 6 assert tag == expected_tag # print(f"('{content}', '{tag}'),") + def test_tag_placement() -> None: # tags are placed in the middle of string and that must happen deterministically tag = "123456" @@ -99,20 +114,26 @@ def test_tag_placement() -> None: def test_shorten_identifier() -> None: # no limit - long_ident = 8*LONG_PATH + long_ident = 8 * LONG_PATH assert NamingConvention.shorten_identifier(long_ident, long_ident, None) == long_ident # within limit assert NamingConvention.shorten_identifier("012345678", "xxx012345678xxx", 10) == "012345678" - assert NamingConvention.shorten_identifier("0123456789", "xxx012345678xx?", 10) == "0123456789" # max_length + assert ( + NamingConvention.shorten_identifier("0123456789", "xxx012345678xx?", 10) == "0123456789" + ) # max_length # tag based on original string placed in the middle - tag = NamingConvention._compute_tag(IDENT_20_CHARS, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + IDENT_20_CHARS, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) norm_ident = NamingConvention.shorten_identifier(IDENT_20_CHARS, IDENT_20_CHARS, 20) 
assert tag in norm_ident assert len(norm_ident) == 20 assert norm_ident == "she plauanpualo well" # the tag must be based on raw string, not normalized string, one test case with spaces for raw_content in [uniq_id(), f" {uniq_id()} "]: - tag = NamingConvention._compute_tag(raw_content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + raw_content, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) norm_ident = NamingConvention.shorten_identifier(IDENT_20_CHARS, raw_content, 20) assert tag in norm_ident assert len(norm_ident) == 20 @@ -135,7 +156,9 @@ def test_normalize_with_shorten_identifier(convention: Type[NamingConvention]) - # force to shorten naming = convention(len(RAW_IDENT) // 2) # tag expected on stripped RAW_IDENT - tag = NamingConvention._compute_tag(RAW_IDENT, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + RAW_IDENT, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) # spaces are stripped assert naming.normalize_identifier(RAW_IDENT) == naming.normalize_identifier(RAW_IDENT_W_SPACES) assert tag in naming.normalize_identifier(RAW_IDENT) @@ -192,7 +215,11 @@ def test_normalize_path(convention: Type[NamingConvention]) -> None: norm_path_str = naming.normalize_path(raw_path_str) assert len(naming.break_path(norm_path_str)) == len(RAW_PATH) # double norm path does not change anything - assert naming.normalize_path(raw_path_str) == naming.normalize_path(norm_path_str) == naming.normalize_path(naming.normalize_path(norm_path_str)) + assert ( + naming.normalize_path(raw_path_str) + == naming.normalize_path(norm_path_str) + == naming.normalize_path(naming.normalize_path(norm_path_str)) + ) # empty element in path is ignored assert naming.make_path(*RAW_PATH_WITH_EMPTY_IDENT) == raw_path_str assert naming.normalize_path(raw_path_str) == norm_path_str @@ -200,12 +227,18 @@ def test_normalize_path(convention: Type[NamingConvention]) -> None: # preserve idents but shorten path naming = convention(len(RAW_IDENT) * 2) # give enough max length # tag computed from raw path - tag = NamingConvention._compute_tag(raw_path_str, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB) + tag = NamingConvention._compute_tag( + raw_path_str, collision_prob=NamingConvention._DEFAULT_COLLISION_PROB + ) tagged_raw_path_str = naming.normalize_path(raw_path_str) # contains tag assert tag in tagged_raw_path_str # idempotent - assert tagged_raw_path_str == naming.normalize_path(tagged_raw_path_str) == naming.normalize_path(naming.normalize_path(tagged_raw_path_str)) + assert ( + tagged_raw_path_str + == naming.normalize_path(tagged_raw_path_str) + == naming.normalize_path(naming.normalize_path(tagged_raw_path_str)) + ) assert tagged_raw_path_str == naming.make_path(*naming.break_path(tagged_raw_path_str)) # also cut idents diff --git a/tests/common/normalizers/test_naming_duck_case.py b/tests/common/normalizers/test_naming_duck_case.py index ed63800ca9..099134ca2f 100644 --- a/tests/common/normalizers/test_naming_duck_case.py +++ b/tests/common/normalizers/test_naming_duck_case.py @@ -17,8 +17,11 @@ def test_normalize_identifier(naming_unlimited: NamingConvention) -> None: assert naming_unlimited.normalize_identifier("🦚🦚Peacocks") == "🦚🦚Peacocks" assert naming_unlimited.normalize_identifier("🦚🦚peacocks") == "🦚🦚peacocks" # non latin alphabets - assert naming_unlimited.normalize_identifier("Ölübeµrsईउऊऋऌऍऎएc⇨usǁs⛔lÄnder") == "Ölübeµrsईउऊऋऌऍऎएc⇨usǁs⛔lÄnder" + assert ( + 
naming_unlimited.normalize_identifier("Ölübeµrsईउऊऋऌऍऎएc⇨usǁs⛔lÄnder") + == "Ölübeµrsईउऊऋऌऍऎएc⇨usǁs⛔lÄnder" + ) def test_alphabet_reduction(naming_unlimited: NamingConvention) -> None: - assert naming_unlimited.normalize_identifier("A\nB\"C\rD") == "A_B_C_D" + assert naming_unlimited.normalize_identifier('A\nB"C\rD') == "A_B_C_D" diff --git a/tests/common/normalizers/test_naming_snake_case.py b/tests/common/normalizers/test_naming_snake_case.py index b51801b6c4..6d619b5257 100644 --- a/tests/common/normalizers/test_naming_snake_case.py +++ b/tests/common/normalizers/test_naming_snake_case.py @@ -38,7 +38,10 @@ def test_normalize_identifier(naming_unlimited: NamingConvention) -> None: def test_alphabet_reduction(naming_unlimited: NamingConvention) -> None: - assert naming_unlimited.normalize_identifier(SnakeCaseNamingConvention._REDUCE_ALPHABET[0]) == SnakeCaseNamingConvention._REDUCE_ALPHABET[1] + assert ( + naming_unlimited.normalize_identifier(SnakeCaseNamingConvention._REDUCE_ALPHABET[0]) + == SnakeCaseNamingConvention._REDUCE_ALPHABET[1] + ) def test_normalize_path(naming_unlimited: NamingConvention) -> None: @@ -78,6 +81,7 @@ def test_normalize_make_path(convention: Type[NamingConvention]) -> None: def test_normalizes_underscores(naming_unlimited: NamingConvention) -> None: - assert naming_unlimited.normalize_identifier("event__value_value2____") == "event_value_value2xxxx" + assert ( + naming_unlimited.normalize_identifier("event__value_value2____") == "event_value_value2xxxx" + ) assert naming_unlimited.normalize_path("e_vent__value_value2___") == "e_vent__value_value2__x" - diff --git a/tests/common/reflection/test_reflect_spec.py b/tests/common/reflection/test_reflect_spec.py index 17ec9ade47..11c66a2763 100644 --- a/tests/common/reflection/test_reflect_spec.py +++ b/tests/common/reflection/test_reflect_spec.py @@ -5,7 +5,11 @@ from dlt.common import Decimal from dlt.common.typing import TSecretValue, is_optional_type from dlt.common.configuration.inject import get_fun_spec, with_config -from dlt.common.configuration.specs import BaseConfiguration, RunConfiguration, ConnectionStringCredentials +from dlt.common.configuration.specs import ( + BaseConfiguration, + RunConfiguration, + ConnectionStringCredentials, +) from dlt.common.reflection.spec import spec_from_signature, _get_spec_name_from_f from dlt.common.reflection.utils import get_func_def_node, get_literal_defaults @@ -13,14 +17,21 @@ _DECIMAL_DEFAULT = Decimal("0.01") _SECRET_DEFAULT = TSecretValue("PASS") _CONFIG_DEFAULT = RunConfiguration() -_CREDENTIALS_DEFAULT = ConnectionStringCredentials("postgresql://loader:loader@localhost:5432/dlt_data") +_CREDENTIALS_DEFAULT = ConnectionStringCredentials( + "postgresql://loader:loader@localhost:5432/dlt_data" +) def test_synthesize_spec_from_sig() -> None: - # spec from typed signature without defaults - def f_typed(p1: str = None, p2: Decimal = None, p3: Any = None, p4: Optional[RunConfiguration] = None, p5: TSecretValue = dlt.secrets.value) -> None: + def f_typed( + p1: str = None, + p2: Decimal = None, + p3: Any = None, + p4: Optional[RunConfiguration] = None, + p5: TSecretValue = dlt.secrets.value, + ) -> None: pass SPEC: Any = spec_from_signature(f_typed, inspect.signature(f_typed)) @@ -30,11 +41,23 @@ def f_typed(p1: str = None, p2: Decimal = None, p3: Any = None, p4: Optional[Run assert SPEC.p4 is None assert SPEC.p5 is None fields = SPEC.get_resolvable_fields() - assert fields == {"p1": Optional[str], "p2": Optional[Decimal], "p3": Optional[Any], "p4": 
Optional[RunConfiguration], "p5": TSecretValue} + assert fields == { + "p1": Optional[str], + "p2": Optional[Decimal], + "p3": Optional[Any], + "p4": Optional[RunConfiguration], + "p5": TSecretValue, + } # spec from typed signatures with defaults - def f_typed_default(t_p1: str = "str", t_p2: Decimal = _DECIMAL_DEFAULT, t_p3: Any = _SECRET_DEFAULT, t_p4: RunConfiguration = _CONFIG_DEFAULT, t_p5: str = None) -> None: + def f_typed_default( + t_p1: str = "str", + t_p2: Decimal = _DECIMAL_DEFAULT, + t_p3: Any = _SECRET_DEFAULT, + t_p4: RunConfiguration = _CONFIG_DEFAULT, + t_p5: str = None, + ) -> None: pass SPEC = spec_from_signature(f_typed_default, inspect.signature(f_typed_default)) @@ -46,11 +69,17 @@ def f_typed_default(t_p1: str = "str", t_p2: Decimal = _DECIMAL_DEFAULT, t_p3: A fields = SPEC().get_resolvable_fields() # Any will not assume TSecretValue type because at runtime it's a str # setting default as None will convert type into optional (t_p5) - assert fields == {"t_p1": str, "t_p2": Decimal, "t_p3": str, "t_p4": RunConfiguration, "t_p5": Optional[str]} + assert fields == { + "t_p1": str, + "t_p2": Decimal, + "t_p3": str, + "t_p4": RunConfiguration, + "t_p5": Optional[str], + } # spec from untyped signature - def f_untyped(untyped_p1 = None, untyped_p2 = dlt.config.value) -> None: + def f_untyped(untyped_p1=None, untyped_p2=dlt.config.value) -> None: pass SPEC = spec_from_signature(f_untyped, inspect.signature(f_untyped)) @@ -61,11 +90,14 @@ def f_untyped(untyped_p1 = None, untyped_p2 = dlt.config.value) -> None: # spec types derived from defaults - - def f_untyped_default(untyped_p1 = "str", untyped_p2 = _DECIMAL_DEFAULT, untyped_p3 = _CREDENTIALS_DEFAULT, untyped_p4 = None) -> None: + def f_untyped_default( + untyped_p1="str", + untyped_p2=_DECIMAL_DEFAULT, + untyped_p3=_CREDENTIALS_DEFAULT, + untyped_p4=None, + ) -> None: pass - SPEC = spec_from_signature(f_untyped_default, inspect.signature(f_untyped_default)) assert SPEC.untyped_p1 == "str" assert SPEC.untyped_p2 == _DECIMAL_DEFAULT @@ -73,11 +105,23 @@ def f_untyped_default(untyped_p1 = "str", untyped_p2 = _DECIMAL_DEFAULT, untyped assert SPEC.untyped_p4 is None fields = SPEC.get_resolvable_fields() # untyped_p4 converted to Optional[Any] - assert fields == {"untyped_p1": str, "untyped_p2": Decimal, "untyped_p3": ConnectionStringCredentials, "untyped_p4": Optional[Any]} + assert fields == { + "untyped_p1": str, + "untyped_p2": Decimal, + "untyped_p3": ConnectionStringCredentials, + "untyped_p4": Optional[Any], + } # spec from signatures containing positional only and keywords only args - def f_pos_kw_only(pos_only_1=dlt.config.value, pos_only_2: str = "default", /, *, kw_only_1=None, kw_only_2: int = 2) -> None: + def f_pos_kw_only( + pos_only_1=dlt.config.value, + pos_only_2: str = "default", + /, + *, + kw_only_1=None, + kw_only_2: int = 2, + ) -> None: pass SPEC = spec_from_signature(f_pos_kw_only, inspect.signature(f_pos_kw_only)) @@ -86,12 +130,19 @@ def f_pos_kw_only(pos_only_1=dlt.config.value, pos_only_2: str = "default", /, * assert SPEC.kw_only_1 is None assert SPEC.kw_only_2 == 2 fields = SPEC.get_resolvable_fields() - assert fields == {"pos_only_1": Any, "pos_only_2": str, "kw_only_1": Optional[Any], "kw_only_2": int} + assert fields == { + "pos_only_1": Any, + "pos_only_2": str, + "kw_only_1": Optional[Any], + "kw_only_2": int, + } # skip arguments with defaults # deregister spec to disable cache del globals()[SPEC.__name__] - SPEC = spec_from_signature(f_pos_kw_only, inspect.signature(f_pos_kw_only), 
include_defaults=False) + SPEC = spec_from_signature( + f_pos_kw_only, inspect.signature(f_pos_kw_only), include_defaults=False + ) assert not hasattr(SPEC, "kw_only_1") assert not hasattr(SPEC, "kw_only_2") assert not hasattr(SPEC, "pos_only_2") @@ -111,7 +162,6 @@ def f_variadic(var_1: str = "A", *args, kw_var_1: str, **kwargs) -> None: def test_spec_none_when_no_fields() -> None: - def f_default_only(arg1, arg2=None): pass @@ -119,7 +169,9 @@ def f_default_only(arg1, arg2=None): assert SPEC is not None del globals()[SPEC.__name__] - SPEC = spec_from_signature(f_default_only, inspect.signature(f_default_only), include_defaults=False) + SPEC = spec_from_signature( + f_default_only, inspect.signature(f_default_only), include_defaults=False + ) assert SPEC is None def f_no_spec(arg1): @@ -129,20 +181,39 @@ def f_no_spec(arg1): assert SPEC is None -def f_top_kw_defaults_args(arg1, arg2 = "top", arg3 = dlt.config.value, *args, kw1, kw_lit = "12131", kw_secret_val = dlt.secrets.value, **kwargs): +def f_top_kw_defaults_args( + arg1, + arg2="top", + arg3=dlt.config.value, + *args, + kw1, + kw_lit="12131", + kw_secret_val=dlt.secrets.value, + **kwargs, +): pass def test_argument_have_dlt_config_defaults() -> None: - def f_defaults( - req_val, config_val = dlt.config.value, secret_val = dlt.secrets.value, /, - pos_cf = None, pos_cf_val = dlt.config.value, pos_secret_val = dlt.secrets.value, *, - kw_val = None, kw_cf_val = dlt.config.value, kw_secret_val = dlt.secrets.value): + req_val, + config_val=dlt.config.value, + secret_val=dlt.secrets.value, + /, + pos_cf=None, + pos_cf_val=dlt.config.value, + pos_secret_val=dlt.secrets.value, + *, + kw_val=None, + kw_cf_val=dlt.config.value, + kw_secret_val=dlt.secrets.value, + ): pass @with_config - def f_kw_defaults(*, kw1 = dlt.config.value, kw_lit = "12131", kw_secret_val = dlt.secrets.value, **kwargs): + def f_kw_defaults( + *, kw1=dlt.config.value, kw_lit="12131", kw_secret_val=dlt.secrets.value, **kwargs + ): pass # do not delete those spaces @@ -151,18 +222,42 @@ def f_kw_defaults(*, kw1 = dlt.config.value, kw_lit = "12131", kw_secret_val = d @with_config # they are part of the test - def f_kw_defaults_args(arg1, arg2 = 2, arg3 = dlt.config.value, *args, kw1, kw_lit = "12131", kw_secret_val = dlt.secrets.value, **kwargs): + def f_kw_defaults_args( + arg1, + arg2=2, + arg3=dlt.config.value, + *args, + kw1, + kw_lit="12131", + kw_secret_val=dlt.secrets.value, + **kwargs, + ): pass - node = get_func_def_node(f_defaults) assert node.name == "f_defaults" literal_defaults = get_literal_defaults(node) - assert literal_defaults == {'kw_secret_val': 'dlt.secrets.value', 'kw_cf_val': 'dlt.config.value', 'kw_val': 'None', 'pos_secret_val': 'dlt.secrets.value', 'pos_cf_val': 'dlt.config.value', 'pos_cf': 'None', 'secret_val': 'dlt.secrets.value', 'config_val': 'dlt.config.value'} + assert literal_defaults == { + "kw_secret_val": "dlt.secrets.value", + "kw_cf_val": "dlt.config.value", + "kw_val": "None", + "pos_secret_val": "dlt.secrets.value", + "pos_cf_val": "dlt.config.value", + "pos_cf": "None", + "secret_val": "dlt.secrets.value", + "config_val": "dlt.config.value", + } SPEC = spec_from_signature(f_defaults, inspect.signature(f_defaults)) fields = SPEC.get_resolvable_fields() # fields market with dlt config are not optional, same for required fields - for arg in ["config_val", "secret_val", "pos_cf_val", "pos_secret_val", "kw_cf_val", "kw_secret_val"]: + for arg in [ + "config_val", + "secret_val", + "pos_cf_val", + "pos_secret_val", + "kw_cf_val", + 
"kw_secret_val", + ]: assert not is_optional_type(fields[arg]) for arg in ["pos_cf", "kw_val"]: assert is_optional_type(fields[arg]) @@ -172,7 +267,11 @@ def f_kw_defaults_args(arg1, arg2 = 2, arg3 = dlt.config.value, *args, kw1, kw_l node = get_func_def_node(f_kw_defaults) assert node.name == "f_kw_defaults" literal_defaults = get_literal_defaults(node) - assert literal_defaults == {'kw_secret_val': 'dlt.secrets.value', 'kw_lit': "'12131'", "kw1": "dlt.config.value"} + assert literal_defaults == { + "kw_secret_val": "dlt.secrets.value", + "kw_lit": "'12131'", + "kw1": "dlt.config.value", + } SPEC = spec_from_signature(f_kw_defaults, inspect.signature(f_kw_defaults)) fields = SPEC.get_resolvable_fields() assert not is_optional_type(fields["kw_lit"]) @@ -183,9 +282,19 @@ def f_kw_defaults_args(arg1, arg2 = 2, arg3 = dlt.config.value, *args, kw1, kw_l assert node.name == "f_kw_defaults_args" literal_defaults = get_literal_defaults(node) # print(literal_defaults) - assert literal_defaults == {'kw_secret_val': 'dlt.secrets.value', 'kw_lit': "'12131'", 'arg3': 'dlt.config.value', 'arg2': '2'} + assert literal_defaults == { + "kw_secret_val": "dlt.secrets.value", + "kw_lit": "'12131'", + "arg3": "dlt.config.value", + "arg2": "2", + } node = get_func_def_node(f_top_kw_defaults_args) assert node.name == "f_top_kw_defaults_args" literal_defaults = get_literal_defaults(node) - assert literal_defaults == {'kw_secret_val': 'dlt.secrets.value', 'kw_lit': "'12131'", 'arg3': 'dlt.config.value', 'arg2': "'top'"} + assert literal_defaults == { + "kw_secret_val": "dlt.secrets.value", + "kw_lit": "'12131'", + "arg3": "dlt.config.value", + "arg2": "'top'", + } diff --git a/tests/common/runners/test_pipes.py b/tests/common/runners/test_pipes.py index 706bef3860..6db7c2d0e2 100644 --- a/tests/common/runners/test_pipes.py +++ b/tests/common/runners/test_pipes.py @@ -27,6 +27,7 @@ class _TestPickler(NamedTuple): # self.s1 = s1 # self.s2 = s2 + class _TestClassUnkField: pass # def __init__(self, s1: _TestPicklex, s2: str) -> None: @@ -55,19 +56,25 @@ def test_pickle_encoder_none() -> None: def test_synth_pickler_unknown_types() -> None: # synth unknown tuple - obj = decode_obj("LfDoYo19lgUOtTn0Ib6JgASVQAAAAAAAAACMH3Rlc3RzLmNvbW1vbi5ydW5uZXJzLnRlc3RfcGlwZXOUjAxfVGVzdFBpY2tsZXiUk5SMA1hZWpRLe4aUgZQu") + obj = decode_obj( + "LfDoYo19lgUOtTn0Ib6JgASVQAAAAAAAAACMH3Rlc3RzLmNvbW1vbi5ydW5uZXJzLnRlc3RfcGlwZXOUjAxfVGVzdFBpY2tsZXiUk5SMA1hZWpRLe4aUgZQu" + ) assert type(obj).__name__.endswith("_TestPicklex") # this is completely different type assert not isinstance(obj, tuple) # synth unknown class containing other unknown types - obj = decode_obj("Koyo502yl4IKMqIxUTJFgASVbQAAAAAAAACMH3Rlc3RzLmNvbW1vbi5ydW5uZXJzLnRlc3RfcGlwZXOUjApfVGVzdENsYXNzlJOUKYGUfZQojAJzMZRoAIwMX1Rlc3RQaWNrbGV4lJOUjAFZlEsXhpSBlIwCczKUjAFVlIwDX3MzlEsDdWIu") + obj = decode_obj( + "Koyo502yl4IKMqIxUTJFgASVbQAAAAAAAACMH3Rlc3RzLmNvbW1vbi5ydW5uZXJzLnRlc3RfcGlwZXOUjApfVGVzdENsYXNzlJOUKYGUfZQojAJzMZRoAIwMX1Rlc3RQaWNrbGV4lJOUjAFZlEsXhpSBlIwCczKUjAFVlIwDX3MzlEsDdWIu" + ) assert type(obj).__name__.endswith("_TestClass") # tuple inside will be synthesized as well assert type(obj.s1).__name__.endswith("_TestPicklex") # known class containing unknown types - obj = decode_obj("PozhjHuf2oS7jPcRxKoagASVbQAAAAAAAACMH3Rlc3RzLmNvbW1vbi5ydW5uZXJzLnRlc3RfcGlwZXOUjBJfVGVzdENsYXNzVW5rRmllbGSUk5QpgZR9lCiMAnMxlGgAjAxfVGVzdFBpY2tsZXiUk5SMAVmUSxeGlIGUjAJzMpSMAVWUdWIu") + obj = decode_obj( + 
"PozhjHuf2oS7jPcRxKoagASVbQAAAAAAAACMH3Rlc3RzLmNvbW1vbi5ydW5uZXJzLnRlc3RfcGlwZXOUjBJfVGVzdENsYXNzVW5rRmllbGSUk5QpgZR9lCiMAnMxlGgAjAxfVGVzdFBpY2tsZXiUk5SMAVmUSxeGlIGUjAJzMpSMAVWUdWIu" + ) assert isinstance(obj, _TestClassUnkField) assert type(obj.s1).__name__.endswith("_TestPicklex") # type: ignore[attr-defined] @@ -88,7 +95,9 @@ def test_iter_stdout() -> None: lines = list(iter_stdout(venv, "python", "tests/common/scripts/empty.py")) assert lines == [] with pytest.raises(CalledProcessError) as cpe: - list(iter_stdout(venv, "python", "tests/common/scripts/no_stdout_no_stderr_with_fail.py")) + list( + iter_stdout(venv, "python", "tests/common/scripts/no_stdout_no_stderr_with_fail.py") + ) # empty stdout assert cpe.value.output == "" assert cpe.value.stderr == "" @@ -102,7 +111,9 @@ def test_iter_stdout_raises() -> None: with Venv.create(tempfile.mkdtemp()) as venv: expected = ["0", "1", "2"] with pytest.raises(CalledProcessError) as cpe: - for i, line in enumerate(iter_stdout(venv, "python", "tests/common/scripts/raising_counter.py")): + for i, line in enumerate( + iter_stdout(venv, "python", "tests/common/scripts/raising_counter.py") + ): assert expected[i] == line assert cpe.value.returncode == 1 # the last output line is available @@ -120,7 +131,9 @@ def test_iter_stdout_raises() -> None: # three lines with 1 MB size + newline _i = -1 with pytest.raises(CalledProcessError) as cpe: - for _i, line in enumerate(iter_stdout(venv, "python", "tests/common/scripts/long_lines_fails.py")): + for _i, line in enumerate( + iter_stdout(venv, "python", "tests/common/scripts/long_lines_fails.py") + ): assert len(line) == 1024 * 1024 assert line == "a" * 1024 * 1024 # there were 3 lines @@ -158,11 +171,15 @@ def test_iter_stdout_with_result() -> None: assert iter_until_returns(i) is None # it just excepts without encoding exception with pytest.raises(CalledProcessError): - i = iter_stdout_with_result(venv, "python", "tests/common/scripts/no_stdout_no_stderr_with_fail.py") + i = iter_stdout_with_result( + venv, "python", "tests/common/scripts/no_stdout_no_stderr_with_fail.py" + ) iter_until_returns(i) # this raises a decoded exception: UnsupportedProcessStartMethodException with pytest.raises(UnsupportedProcessStartMethodException): - i = iter_stdout_with_result(venv, "python", "tests/common/scripts/stdout_encode_exception.py") + i = iter_stdout_with_result( + venv, "python", "tests/common/scripts/stdout_encode_exception.py" + ) iter_until_returns(i) diff --git a/tests/common/runners/test_runnable.py b/tests/common/runners/test_runnable.py index eae4a46a70..e25f28e521 100644 --- a/tests/common/runners/test_runnable.py +++ b/tests/common/runners/test_runnable.py @@ -1,6 +1,7 @@ import gc import pytest import multiprocessing + # from multiprocessing.pool import Pool # from multiprocessing.dummy import Pool as ThreadPool from concurrent.futures import Executor, ProcessPoolExecutor, ThreadPoolExecutor @@ -9,10 +10,15 @@ from dlt.normalize.configuration import SchemaStorageConfiguration from dlt.common.runners import Runnable -from tests.common.runners.utils import _TestRunnableWorkerMethod, _TestRunnableWorker, ALL_METHODS, mp_method_auto +from tests.common.runners.utils import ( + _TestRunnableWorkerMethod, + _TestRunnableWorker, + ALL_METHODS, + mp_method_auto, +) -@pytest.mark.parametrize('method', ALL_METHODS) +@pytest.mark.parametrize("method", ALL_METHODS) def test_runnable_process_pool(method: str) -> None: # 4 tasks r = _TestRunnableWorker(4) @@ -45,7 +51,7 @@ def test_runnable_direct_worker_call() -> 
None: assert rv[0] == 199 -@pytest.mark.parametrize('method', ALL_METHODS) +@pytest.mark.parametrize("method", ALL_METHODS) def test_process_worker_started_early(method: str) -> None: with ProcessPoolExecutor(4, mp_context=multiprocessing.get_context(method)) as p: r = _TestRunnableWorkerMethod(4) @@ -71,7 +77,7 @@ def test_weak_pool_ref() -> None: r = wref[rid] -@pytest.mark.parametrize('method', ALL_METHODS) +@pytest.mark.parametrize("method", ALL_METHODS) def test_configuredworker(method: str) -> None: # call worker method with CONFIG values that should be restored into CONFIG type config = SchemaStorageConfiguration() diff --git a/tests/common/runners/test_runners.py b/tests/common/runners/test_runners.py index 9045d40ad9..2b81c2ea54 100644 --- a/tests/common/runners/test_runners.py +++ b/tests/common/runners/test_runners.py @@ -10,7 +10,12 @@ from dlt.common.runtime import initialize_runtime from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType -from tests.common.runners.utils import _TestRunnableWorkerMethod, _TestRunnableWorker, ALL_METHODS, mp_method_auto +from tests.common.runners.utils import ( + _TestRunnableWorkerMethod, + _TestRunnableWorker, + ALL_METHODS, + mp_method_auto, +) from tests.utils import init_test_logging @@ -43,6 +48,7 @@ def logger_autouse() -> None: _counter = 0 + @pytest.fixture(autouse=True) def default_args() -> None: signals._received_signal = 0 @@ -117,15 +123,12 @@ def test_single_non_idle_run() -> None: def test_runnable_with_runner() -> None: r = _TestRunnableWorkerMethod(4) - runs_count = runner.run_pool( - configure(ThreadPoolConfiguration), - r - ) + runs_count = runner.run_pool(configure(ThreadPoolConfiguration), r) assert runs_count == 1 assert [v[0] for v in r.rv] == list(range(4)) -@pytest.mark.parametrize('method', ALL_METHODS) +@pytest.mark.parametrize("method", ALL_METHODS) def test_pool_runner_process_methods(method) -> None: multiprocessing.set_start_method(method, force=True) r = _TestRunnableWorker(4) @@ -133,9 +136,6 @@ def test_pool_runner_process_methods(method) -> None: C = resolve_configuration(RunConfiguration()) initialize_runtime(C) - runs_count = runner.run_pool( - configure(ProcessPoolConfiguration), - r - ) + runs_count = runner.run_pool(configure(ProcessPoolConfiguration), r) assert runs_count == 1 assert [v[0] for v in r.rv] == list(range(4)) diff --git a/tests/common/runners/test_venv.py b/tests/common/runners/test_venv.py index 79e485862e..ee62df3c83 100644 --- a/tests/common/runners/test_venv.py +++ b/tests/common/runners/test_venv.py @@ -235,7 +235,9 @@ def test_start_command() -> None: # custom environ with custom_environ({"_CUSTOM_ENV_VALUE": "uniq"}): - with venv.start_command("python", "tests/common/scripts/environ.py", stdout=PIPE, text=True) as process: + with venv.start_command( + "python", "tests/common/scripts/environ.py", stdout=PIPE, text=True + ) as process: output, _ = process.communicate() assert process.poll() == 0 assert "_CUSTOM_ENV_VALUE" in output diff --git a/tests/common/runners/utils.py b/tests/common/runners/utils.py index 1791a0ed28..3d6adbf70c 100644 --- a/tests/common/runners/utils.py +++ b/tests/common/runners/utils.py @@ -10,7 +10,7 @@ from dlt.common.utils import uniq_id # remove fork-server because it hangs the tests no CI -ALL_METHODS = set(multiprocessing.get_all_start_methods()).intersection(['fork', 'spawn']) +ALL_METHODS = set(multiprocessing.get_all_start_methods()).intersection(["fork", "spawn"]) @pytest.fixture(autouse=True) @@ -38,7 +38,9 @@ def worker(self: 
"_TestRunnableWorkerMethod", v: int) -> Tuple[int, str, int]: def _run(self, pool: Executor) -> List[Tuple[int, str, int]]: rid = id(self) assert rid in _TestRunnableWorkerMethod.RUNNING - self.rv = rv = list(pool.map(_TestRunnableWorkerMethod.worker, *zip(*[(rid, i) for i in range(self.tasks)]))) + self.rv = rv = list( + pool.map(_TestRunnableWorkerMethod.worker, *zip(*[(rid, i) for i in range(self.tasks)])) + ) assert rid in _TestRunnableWorkerMethod.RUNNING return rv @@ -62,7 +64,9 @@ def worker(v: int) -> Tuple[int, int]: return (v, os.getpid()) def _run(self, pool: Executor) -> List[Tuple[int, int]]: - self.rv = rv = list(pool.map(_TestRunnableWorker.worker, *zip(*[(i, ) for i in range(self.tasks)]))) + self.rv = rv = list( + pool.map(_TestRunnableWorker.worker, *zip(*[(i,) for i in range(self.tasks)])) + ) return rv def run(self, pool: Executor) -> TRunMetrics: diff --git a/tests/common/runtime/test_collector.py b/tests/common/runtime/test_collector.py index 600c3b3d4b..dbe4b8c94d 100644 --- a/tests/common/runtime/test_collector.py +++ b/tests/common/runtime/test_collector.py @@ -45,4 +45,4 @@ def test_dict_collector_reset_counters(): assert collector.counters["counter1"] == 5 with DictCollector()("test2") as collector: - assert collector.counters == defaultdict(int) \ No newline at end of file + assert collector.counters == defaultdict(int) diff --git a/tests/common/runtime/test_logging.py b/tests/common/runtime/test_logging.py index 357cd180fb..19f67fe899 100644 --- a/tests/common/runtime/test_logging.py +++ b/tests/common/runtime/test_logging.py @@ -28,11 +28,16 @@ def test_version_extract(environment: DictStrStr) -> None: version = exec_info.dlt_version_info("logger") # assert version["dlt_version"].startswith(code_version) lib_version = pkg_version("dlt") - assert version == {'dlt_version': lib_version, 'pipeline_name': 'logger'} + assert version == {"dlt_version": lib_version, "pipeline_name": "logger"} # mock image info available in container mock_image_env(environment) version = exec_info.dlt_version_info("logger") - assert version == {'dlt_version': lib_version, 'commit_sha': '192891', 'pipeline_name': 'logger', 'image_version': 'scale/v:112'} + assert version == { + "dlt_version": lib_version, + "commit_sha": "192891", + "pipeline_name": "logger", + "image_version": "scale/v:112", + } def test_pod_info_extract(environment: DictStrStr) -> None: @@ -40,17 +45,29 @@ def test_pod_info_extract(environment: DictStrStr) -> None: assert pod_info == {} mock_pod_env(environment) pod_info = exec_info.kube_pod_info() - assert pod_info == {'kube_node_name': 'node_name', 'kube_pod_name': 'pod_name', 'kube_pod_namespace': 'namespace'} + assert pod_info == { + "kube_node_name": "node_name", + "kube_pod_name": "pod_name", + "kube_pod_namespace": "namespace", + } def test_github_info_extract(environment: DictStrStr) -> None: mock_github_env(environment) github_info = exec_info.github_info() - assert github_info == {"github_user": "rudolfix", "github_repository": "dlt-hub/beginners-workshop-2022", "github_repository_owner": "dlt-hub"} + assert github_info == { + "github_user": "rudolfix", + "github_repository": "dlt-hub/beginners-workshop-2022", + "github_repository_owner": "dlt-hub", + } mock_github_env(environment) del environment["GITHUB_USER"] github_info = exec_info.github_info() - assert github_info == {"github_user": "dlt-hub", "github_repository": "dlt-hub/beginners-workshop-2022", "github_repository_owner": "dlt-hub"} + assert github_info == { + "github_user": "dlt-hub", + 
"github_repository": "dlt-hub/beginners-workshop-2022", + "github_repository_owner": "dlt-hub", + } @pytest.mark.forked @@ -68,9 +85,9 @@ def test_text_logger_init(environment: DictStrStr) -> None: @pytest.mark.forked - def test_json_logger_init(environment: DictStrStr) -> None: from dlt.common.runtime import json_logging + mock_image_env(environment) mock_pod_env(environment) init_test_logging(JsonLoggerConfiguration()) @@ -87,7 +104,6 @@ def test_json_logger_init(environment: DictStrStr) -> None: @pytest.mark.forked def test_double_log_init(environment: DictStrStr) -> None: - mock_image_env(environment) mock_pod_env(environment) diff --git a/tests/common/runtime/test_signals.py b/tests/common/runtime/test_signals.py index 839738f904..179491de16 100644 --- a/tests/common/runtime/test_signals.py +++ b/tests/common/runtime/test_signals.py @@ -65,7 +65,6 @@ def test_delayed_signals_context_manager() -> None: def test_sleep_signal() -> None: - thread_signal = 0 def _thread() -> None: diff --git a/tests/common/runtime/test_telemetry.py b/tests/common/runtime/test_telemetry.py index 9ffc5dc628..eece36aae7 100644 --- a/tests/common/runtime/test_telemetry.py +++ b/tests/common/runtime/test_telemetry.py @@ -13,13 +13,21 @@ from tests.common.runtime.utils import mock_image_env, mock_github_env, mock_pod_env from tests.common.configuration.utils import environment -from tests.utils import preserve_environ, skipifspawn, skipifwindows, init_test_logging, start_test_telemetry +from tests.utils import ( + preserve_environ, + skipifspawn, + skipifwindows, + init_test_logging, + start_test_telemetry, +) @configspec class SentryLoggerConfiguration(RunConfiguration): pipeline_name: str = "logger" - sentry_dsn: str = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + sentry_dsn: str = ( + "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + ) dlthub_telemetry_segment_write_key: str = "TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB" @@ -28,17 +36,19 @@ class SentryLoggerCriticalConfiguration(SentryLoggerConfiguration): log_level: str = "CRITICAL" if TYPE_CHECKING: + def __init__( self, pipeline_name: str = "logger", sentry_dsn: str = "https://sentry.io", dlthub_telemetry_segment_write_key: str = "TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB", log_level: str = "CRITICAL", - ) -> None: - ... + ) -> None: ... 
+ def test_sentry_log_level() -> None: from dlt.common.runtime.sentry import _get_sentry_log_level + sll = _get_sentry_log_level(SentryLoggerCriticalConfiguration(log_level="CRITICAL")) assert sll._handler.level == logging._nameToLevel["CRITICAL"] sll = _get_sentry_log_level(SentryLoggerCriticalConfiguration(log_level="ERROR")) @@ -97,6 +107,8 @@ def test_cleanup(environment: DictStrStr) -> None: SENT_ITEMS = [] + + def _mock_before_send(event: DictStrAny, _unused_hint: Any = None) -> DictStrAny: # print(event) SENT_ITEMS.append(event) diff --git a/tests/common/schema/test_coercion.py b/tests/common/schema/test_coercion.py index 2ac11e71d8..922024a89b 100644 --- a/tests/common/schema/test_coercion.py +++ b/tests/common/schema/test_coercion.py @@ -30,9 +30,12 @@ def test_coerce_type_to_text() -> None: # double into text assert coerce_value("text", "double", -1726.1288) == "-1726.1288" # bytes to text (base64) - assert coerce_value("text", "binary", b'binary string') == "YmluYXJ5IHN0cmluZw==" + assert coerce_value("text", "binary", b"binary string") == "YmluYXJ5IHN0cmluZw==" # HexBytes to text (hex with prefix) - assert coerce_value("text", "binary", HexBytes(b'binary string')) == "0x62696e61727920737472696e67" + assert ( + coerce_value("text", "binary", HexBytes(b"binary string")) == "0x62696e61727920737472696e67" + ) + # Str enum value class StrEnum(Enum): a = "a_value" @@ -42,6 +45,7 @@ class StrEnum(Enum): # Make sure we get the bare str value, not the enum instance assert not isinstance(str_enum_result, Enum) assert str_enum_result == "b_value" + # Mixed enum value class MixedEnum(Enum): a = "a_value" @@ -68,7 +72,7 @@ def test_coerce_type_to_bool() -> None: with pytest.raises(ValueError): coerce_value("bool", "complex", {"a": True}) with pytest.raises(ValueError): - coerce_value("bool", "binary", b'True') + coerce_value("bool", "binary", b"True") with pytest.raises(ValueError): coerce_value("bool", "timestamp", pendulum.now()) @@ -79,7 +83,7 @@ def test_coerce_type_to_double() -> None: # text into double if parsable assert coerce_value("double", "text", " -1726.1288 ") == -1726.1288 # hex text into double - assert coerce_value("double", "text", "0xff") == 255.0 + assert coerce_value("double", "text", "0xff") == 255.0 # wei, decimal to double assert coerce_value("double", "wei", Wei.from_int256(2137, decimals=2)) == 21.37 assert coerce_value("double", "decimal", Decimal("-1121.11")) == -1121.11 @@ -123,10 +127,7 @@ class IntEnum(int, Enum): assert int_enum_result == 2 -@pytest.mark.parametrize("dec_cls,data_type", [ - (Decimal, "decimal"), - (Wei, "wei") -]) +@pytest.mark.parametrize("dec_cls,data_type", [(Decimal, "decimal"), (Wei, "wei")]) def test_coerce_to_numeric(dec_cls: Type[Any], data_type: TDataType) -> None: v = coerce_value(data_type, "text", " -1726.839283 ") assert type(v) is dec_cls @@ -162,20 +163,36 @@ def test_coerce_type_from_hex_text() -> None: def test_coerce_type_to_timestamp() -> None: # timestamp cases - assert coerce_value("timestamp", "text", " 1580405246 ") == pendulum.parse("2020-01-30T17:27:26+00:00") + assert coerce_value("timestamp", "text", " 1580405246 ") == pendulum.parse( + "2020-01-30T17:27:26+00:00" + ) # the tenths of microseconds will be ignored - assert coerce_value("timestamp", "double", 1633344898.7415245) == pendulum.parse("2021-10-04T10:54:58.741524+00:00") + assert coerce_value("timestamp", "double", 1633344898.7415245) == pendulum.parse( + "2021-10-04T10:54:58.741524+00:00" + ) # if text is ISO string it will be coerced - assert 
coerce_value("timestamp", "text", "2022-05-10T03:41:31.466000+00:00") == pendulum.parse("2022-05-10T03:41:31.466000+00:00") - assert coerce_value("timestamp", "text", "2022-05-10T03:41:31.466+02:00") == pendulum.parse("2022-05-10T01:41:31.466Z") - assert coerce_value("timestamp", "text", "2022-05-10T03:41:31.466+0200") == pendulum.parse("2022-05-10T01:41:31.466Z") + assert coerce_value("timestamp", "text", "2022-05-10T03:41:31.466000+00:00") == pendulum.parse( + "2022-05-10T03:41:31.466000+00:00" + ) + assert coerce_value("timestamp", "text", "2022-05-10T03:41:31.466+02:00") == pendulum.parse( + "2022-05-10T01:41:31.466Z" + ) + assert coerce_value("timestamp", "text", "2022-05-10T03:41:31.466+0200") == pendulum.parse( + "2022-05-10T01:41:31.466Z" + ) # parse almost ISO compliant string - assert coerce_value("timestamp", "text", "2022-04-26 10:36+02") == pendulum.parse("2022-04-26T10:36:00+02:00") - assert coerce_value("timestamp", "text", "2022-04-26 10:36") == pendulum.parse("2022-04-26T10:36:00+00:00") + assert coerce_value("timestamp", "text", "2022-04-26 10:36+02") == pendulum.parse( + "2022-04-26T10:36:00+02:00" + ) + assert coerce_value("timestamp", "text", "2022-04-26 10:36") == pendulum.parse( + "2022-04-26T10:36:00+00:00" + ) # parse date string assert coerce_value("timestamp", "text", "2021-04-25") == pendulum.parse("2021-04-25") # from date type - assert coerce_value("timestamp", "date", datetime.date(2023, 2, 27)) == pendulum.parse("2023-02-27") + assert coerce_value("timestamp", "date", datetime.date(2023, 2, 27)) == pendulum.parse( + "2023-02-27" + ) # fails on "now" - yes pendulum by default parses "now" as .now() with pytest.raises(ValueError): @@ -222,21 +239,35 @@ def test_coerce_type_to_timestamp() -> None: def test_coerce_type_to_date() -> None: # from datetime object - assert coerce_value("date", "timestamp", pendulum.datetime(1995, 5, 6, 00, 1, 1, tz=UTC)) == pendulum.parse("1995-05-06", exact=True) + assert coerce_value( + "date", "timestamp", pendulum.datetime(1995, 5, 6, 00, 1, 1, tz=UTC) + ) == pendulum.parse("1995-05-06", exact=True) # from unix timestamp - assert coerce_value("date", "double", 1677546399.494264) == pendulum.parse("2023-02-28", exact=True) + assert coerce_value("date", "double", 1677546399.494264) == pendulum.parse( + "2023-02-28", exact=True + ) assert coerce_value("date", "text", " 1677546399 ") == pendulum.parse("2023-02-28", exact=True) # ISO date string assert coerce_value("date", "text", "2023-02-27") == pendulum.parse("2023-02-27", exact=True) # ISO datetime string - assert coerce_value("date", "text", "2022-05-10T03:41:31.466000+00:00") == pendulum.parse("2022-05-10", exact=True) - assert coerce_value("date", "text", "2022-05-10T03:41:31.466+02:00") == pendulum.parse("2022-05-10", exact=True) - assert coerce_value("date", "text", "2022-05-10T03:41:31.466+0200") == pendulum.parse("2022-05-10", exact=True) + assert coerce_value("date", "text", "2022-05-10T03:41:31.466000+00:00") == pendulum.parse( + "2022-05-10", exact=True + ) + assert coerce_value("date", "text", "2022-05-10T03:41:31.466+02:00") == pendulum.parse( + "2022-05-10", exact=True + ) + assert coerce_value("date", "text", "2022-05-10T03:41:31.466+0200") == pendulum.parse( + "2022-05-10", exact=True + ) # almost ISO compliant string - assert coerce_value("date", "text", "2022-04-26 10:36+02") == pendulum.parse("2022-04-26", exact=True) - assert coerce_value("date", "text", "2022-04-26 10:36") == pendulum.parse("2022-04-26", exact=True) + assert coerce_value("date", "text", 
"2022-04-26 10:36+02") == pendulum.parse( + "2022-04-26", exact=True + ) + assert coerce_value("date", "text", "2022-04-26 10:36") == pendulum.parse( + "2022-04-26", exact=True + ) - # iso time string fails + # iso time string fails with pytest.raises(ValueError): coerce_value("timestamp", "text", "03:41:31.466") @@ -247,18 +278,26 @@ def test_coerce_type_to_date() -> None: def test_coerce_type_to_time() -> None: # from ISO time string - assert coerce_value("time", "text", "03:41:31.466000") == pendulum.parse("03:41:31.466000", exact=True) + assert coerce_value("time", "text", "03:41:31.466000") == pendulum.parse( + "03:41:31.466000", exact=True + ) # time object returns same value - assert coerce_value("time", "time", pendulum.time(3, 41, 31, 466000)) == pendulum.time(3, 41, 31, 466000) + assert coerce_value("time", "time", pendulum.time(3, 41, 31, 466000)) == pendulum.time( + 3, 41, 31, 466000 + ) # from datetime object fails with pytest.raises(TypeError): coerce_value("time", "timestamp", pendulum.datetime(1995, 5, 6, 00, 1, 1, tz=UTC)) # from unix timestamp fails with pytest.raises(TypeError): - assert coerce_value("time", "double", 1677546399.494264) == pendulum.parse("01:06:39.494264", exact=True) + assert coerce_value("time", "double", 1677546399.494264) == pendulum.parse( + "01:06:39.494264", exact=True + ) with pytest.raises(ValueError): - assert coerce_value("time", "text", " 1677546399 ") == pendulum.parse("01:06:39", exact=True) + assert coerce_value("time", "text", " 1677546399 ") == pendulum.parse( + "01:06:39", exact=True + ) # ISO date string fails with pytest.raises(ValueError): assert coerce_value("time", "text", "2023-02-27") == pendulum.parse("00:00:00", exact=True) @@ -269,9 +308,9 @@ def test_coerce_type_to_time() -> None: def test_coerce_type_to_binary() -> None: # from hex string - assert coerce_value("binary", "text", "0x30") == b'0' + assert coerce_value("binary", "text", "0x30") == b"0" # from base64 - assert coerce_value("binary", "text", "YmluYXJ5IHN0cmluZw==") == b'binary string' + assert coerce_value("binary", "text", "YmluYXJ5IHN0cmluZw==") == b"binary string" # int into bytes assert coerce_value("binary", "bigint", 15) == b"\x0f" # can't into double @@ -344,8 +383,16 @@ def test_coerce_type_complex() -> None: def test_coerce_type_complex_with_pua() -> None: - v_dict = {"list": [1, Wei.from_int256(10**18), f"{_DATETIME}2022-05-10T01:41:31.466Z"], "str": "complex", "pua_date": f"{_DATETIME}2022-05-10T01:41:31.466Z"} - exp_v = {"list":[1, Wei.from_int256(10**18), "2022-05-10T01:41:31.466Z"],"str":"complex","pua_date":"2022-05-10T01:41:31.466Z"} + v_dict = { + "list": [1, Wei.from_int256(10**18), f"{_DATETIME}2022-05-10T01:41:31.466Z"], + "str": "complex", + "pua_date": f"{_DATETIME}2022-05-10T01:41:31.466Z", + } + exp_v = { + "list": [1, Wei.from_int256(10**18), "2022-05-10T01:41:31.466Z"], + "str": "complex", + "pua_date": "2022-05-10T01:41:31.466Z", + } assert coerce_value("complex", "complex", copy(v_dict)) == exp_v assert coerce_value("text", "complex", copy(v_dict)) == json.dumps(exp_v) # also decode recursively diff --git a/tests/common/schema/test_detections.py b/tests/common/schema/test_detections.py index 13cb09faec..cba2767c94 100644 --- a/tests/common/schema/test_detections.py +++ b/tests/common/schema/test_detections.py @@ -2,7 +2,16 @@ from dlt.common import pendulum, Decimal, Wei from dlt.common.schema.utils import autodetect_sc_type -from dlt.common.schema.detections import is_hexbytes_to_text, is_timestamp, is_iso_timestamp, is_iso_date, 
is_large_integer, is_wei_to_double, _FLOAT_TS_RANGE, _NOW_TS +from dlt.common.schema.detections import ( + is_hexbytes_to_text, + is_timestamp, + is_iso_timestamp, + is_iso_date, + is_large_integer, + is_wei_to_double, + _FLOAT_TS_RANGE, + _NOW_TS, +) def test_timestamp_detection() -> None: @@ -69,12 +78,12 @@ def test_detection_large_integer() -> None: assert is_large_integer(int, 2**64 // 2) == "wei" assert is_large_integer(int, 578960446186580977117854925043439539267) == "text" assert is_large_integer(int, 2**64 // 2 - 1) is None - assert is_large_integer(int, -2**64 // 2 - 1) is None + assert is_large_integer(int, -(2**64) // 2 - 1) is None def test_detection_hexbytes_to_text() -> None: - assert is_hexbytes_to_text(bytes, b'hey') is None - assert is_hexbytes_to_text(HexBytes, b'hey') == "text" + assert is_hexbytes_to_text(bytes, b"hey") is None + assert is_hexbytes_to_text(HexBytes, b"hey") == "text" def test_wei_to_double() -> None: @@ -89,7 +98,10 @@ def test_detection_function() -> None: assert autodetect_sc_type(["iso_date"], str, str(pendulum.now().date())) == "date" assert autodetect_sc_type(["iso_date"], float, str(pendulum.now().date())) is None assert autodetect_sc_type(["timestamp"], str, str(pendulum.now())) is None - assert autodetect_sc_type(["timestamp", "iso_timestamp"], float, pendulum.now().timestamp()) == "timestamp" + assert ( + autodetect_sc_type(["timestamp", "iso_timestamp"], float, pendulum.now().timestamp()) + == "timestamp" + ) assert autodetect_sc_type(["timestamp", "large_integer"], int, 2**64) == "wei" - assert autodetect_sc_type(["large_integer", "hexbytes_to_text"], HexBytes, b'hey') == "text" + assert autodetect_sc_type(["large_integer", "hexbytes_to_text"], HexBytes, b"hey") == "text" assert autodetect_sc_type(["large_integer", "wei_to_double"], Wei, Wei(10**18)) == "double" diff --git a/tests/common/schema/test_filtering.py b/tests/common/schema/test_filtering.py index 9a7fe01f54..8cfac9309f 100644 --- a/tests/common/schema/test_filtering.py +++ b/tests/common/schema/test_filtering.py @@ -50,9 +50,14 @@ def test_whole_row_filter_with_exception(schema: Schema) -> None: # mind that path event_bot__custom_data__included_object was also eliminated assert filtered_case == {} # this child of the row has exception (^event_bot__custom_data__included_object__ - the __ at the end select all childern but not the parent) - filtered_case = schema.filter_row("event_bot__custom_data__included_object", deepcopy(bot_case)["custom_data"]["included_object"]) + filtered_case = schema.filter_row( + "event_bot__custom_data__included_object", + deepcopy(bot_case)["custom_data"]["included_object"], + ) assert filtered_case == bot_case["custom_data"]["included_object"] - filtered_case = schema.filter_row("event_bot__custom_data__excluded_path", deepcopy(bot_case)["custom_data"]["excluded_path"]) + filtered_case = schema.filter_row( + "event_bot__custom_data__excluded_path", deepcopy(bot_case)["custom_data"]["excluded_path"] + ) assert filtered_case == {} @@ -60,16 +65,13 @@ def test_filter_parent_table_schema_update(schema: Schema) -> None: # filter out parent table and leave just child one. 
that should break the child-parent relationship and reject schema update _add_excludes(schema) source_row = { - "metadata": [{ - "elvl1": [{ - "elvl2": [{ - "id": "level3_kept" - }], - "f": "elvl1_removed" - }], - "f": "metadata_removed" - }] - } + "metadata": [ + { + "elvl1": [{"elvl2": [{"id": "level3_kept"}], "f": "elvl1_removed"}], + "f": "metadata_removed", + } + ] + } updates = [] @@ -96,7 +98,9 @@ def test_filter_parent_table_schema_update(schema: Schema) -> None: updates.clear() schema = Schema("event") _add_excludes(schema) - schema.get_table("event_bot")["filters"]["includes"].extend([TSimpleRegex("re:^metadata___dlt_"), TSimpleRegex("re:^metadata__elvl1___dlt_")]) + schema.get_table("event_bot")["filters"]["includes"].extend( + [TSimpleRegex("re:^metadata___dlt_"), TSimpleRegex("re:^metadata__elvl1___dlt_")] + ) schema._compile_settings() for (t, p), row in schema.normalize_data_item(source_row, "load_id", "event_bot"): row = schema.filter_row(t, row) @@ -118,7 +122,9 @@ def _add_excludes(schema: Schema) -> None: bot_table = new_table("event_bot") bot_table.setdefault("filters", {})["excludes"] = ["re:^metadata", "re:^is_flagged$", "re:^data", "re:^custom_data"] # type: ignore[typeddict-item] bot_table["filters"]["includes"] = [ - TSimpleRegex("re:^data__custom$"), TSimpleRegex("re:^custom_data__included_object__"), TSimpleRegex("re:^metadata__elvl1__elvl2__") + TSimpleRegex("re:^data__custom$"), + TSimpleRegex("re:^custom_data__included_object__"), + TSimpleRegex("re:^metadata__elvl1__elvl2__"), ] schema.update_table(bot_table) schema._compile_settings() diff --git a/tests/common/schema/test_inference.py b/tests/common/schema/test_inference.py index 24c97219fc..8d567f6993 100644 --- a/tests/common/schema/test_inference.py +++ b/tests/common/schema/test_inference.py @@ -7,7 +7,12 @@ from dlt.common.json import custom_pua_decode from dlt.common.schema import Schema, utils from dlt.common.schema.typing import TSimpleRegex -from dlt.common.schema.exceptions import CannotCoerceColumnException, CannotCoerceNullException, ParentTableNotFoundException, TablePropertiesConflictException +from dlt.common.schema.exceptions import ( + CannotCoerceColumnException, + CannotCoerceNullException, + ParentTableNotFoundException, + TablePropertiesConflictException, +) from tests.common.utils import load_json_case @@ -80,7 +85,12 @@ def test_coerce_row(schema: Schema) -> None: timestamp_float = 78172.128 timestamp_str = "1970-01-01T21:42:52.128000+00:00" # add new column with preferred - row_1 = {"timestamp": timestamp_float, "confidence": "0.1", "value": "0xFF", "number": Decimal("128.67")} + row_1 = { + "timestamp": timestamp_float, + "confidence": "0.1", + "value": "0xFF", + "number": Decimal("128.67"), + } new_row_1, new_table = schema.coerce_row("event_user", None, row_1) # convert columns to list, they must correspond to the order of fields in row_1 new_columns = list(new_table["columns"].values()) @@ -94,7 +104,12 @@ def test_coerce_row(schema: Schema) -> None: assert new_columns[3]["data_type"] == "decimal" assert "variant" not in new_columns[3] # also rows values should be coerced (confidence) - assert new_row_1 == {"timestamp": pendulum.parse(timestamp_str), "confidence": 0.1, "value": 255, "number": Decimal("128.67")} + assert new_row_1 == { + "timestamp": pendulum.parse(timestamp_str), + "confidence": 0.1, + "value": 255, + "number": Decimal("128.67"), + } # update schema schema.update_table(new_table) @@ -137,7 +152,9 @@ def test_coerce_row(schema: Schema) -> None: 
schema.update_table(new_table) # variant column clashes with existing column - create new_colbool_v_binary column that would be created for binary variant, but give it a type datetime - _, new_table = schema.coerce_row("event_user", None, {"new_colbool": False, "new_colbool__v_timestamp": b"not fit"}) + _, new_table = schema.coerce_row( + "event_user", None, {"new_colbool": False, "new_colbool__v_timestamp": b"not fit"} + ) schema.update_table(new_table) with pytest.raises(CannotCoerceColumnException) as exc_val: # now pass the binary that would create binary variant - but the column is occupied by text type @@ -179,7 +196,12 @@ def test_shorten_variant_column(schema: Schema) -> None: _add_preferred_types(schema) timestamp_float = 78172.128 # add new column with preferred - row_1 = {"timestamp": timestamp_float, "confidence": "0.1", "value": "0xFF", "number": Decimal("128.67")} + row_1 = { + "timestamp": timestamp_float, + "confidence": "0.1", + "value": "0xFF", + "number": Decimal("128.67"), + } _, new_table = schema.coerce_row("event_user", None, row_1) # schema assumes that identifiers are already normalized so confidence even if it is longer than 9 chars schema.update_table(new_table) @@ -188,7 +210,9 @@ def test_shorten_variant_column(schema: Schema) -> None: # now variant is created and this will be normalized # TODO: we should move the handling of variants to normalizer new_row_2, new_table = schema.coerce_row("event_user", None, {"confidence": False}) - tag = schema.naming._compute_tag("confidence__v_bool", collision_prob=schema.naming._DEFAULT_COLLISION_PROB) + tag = schema.naming._compute_tag( + "confidence__v_bool", collision_prob=schema.naming._DEFAULT_COLLISION_PROB + ) new_row_2_keys = list(new_row_2.keys()) assert tag in new_row_2_keys[0] assert len(new_row_2_keys[0]) == 9 @@ -252,15 +276,18 @@ def test_supports_variant_pua_decode(schema: Schema) -> None: # pua encoding still present assert normalized_row[0][1]["wad"].startswith("") # decode pua - decoded_row = {k: custom_pua_decode(v) for k,v in normalized_row[0][1].items()} + decoded_row = {k: custom_pua_decode(v) for k, v in normalized_row[0][1].items()} assert isinstance(decoded_row["wad"], Wei) c_row, new_table = schema.coerce_row("eth", None, decoded_row) - assert c_row["wad__v_str"] == str(2**256-1) + assert c_row["wad__v_str"] == str(2**256 - 1) assert new_table["columns"]["wad__v_str"]["data_type"] == "text" def test_supports_variant(schema: Schema) -> None: - rows = [{"evm": Wei.from_int256(2137*10**16, decimals=18)}, {"evm": Wei.from_int256(2**256-1)}] + rows = [ + {"evm": Wei.from_int256(2137 * 10**16, decimals=18)}, + {"evm": Wei.from_int256(2**256 - 1)}, + ] normalized_rows: List[Any] = [] for row in rows: normalized_rows.extend(schema.normalize_data_item(row, "128812.2131", "event")) @@ -270,7 +297,7 @@ def test_supports_variant(schema: Schema) -> None: # row 2 contains Wei assert "evm" in normalized_rows[1][1] assert isinstance(normalized_rows[1][1]["evm"], Wei) - assert normalized_rows[1][1]["evm"] == 2**256-1 + assert normalized_rows[1][1]["evm"] == 2**256 - 1 # coerce row c_row, new_table = schema.coerce_row("eth", None, normalized_rows[0][1]) assert isinstance(c_row["evm"], Wei) @@ -281,13 +308,12 @@ def test_supports_variant(schema: Schema) -> None: # coerce row that should expand to variant c_row, new_table = schema.coerce_row("eth", None, normalized_rows[1][1]) assert isinstance(c_row["evm__v_str"], str) - assert c_row["evm__v_str"] == str(2**256-1) + assert c_row["evm__v_str"] == str(2**256 - 1) 
assert new_table["columns"]["evm__v_str"]["data_type"] == "text" assert new_table["columns"]["evm__v_str"]["variant"] is True def test_supports_recursive_variant(schema: Schema) -> None: - class RecursiveVariant(int): # provide __call__ for SupportVariant def __call__(self) -> Any: @@ -296,18 +322,16 @@ def __call__(self) -> Any: else: return ("div2", RecursiveVariant(self // 2)) - row = {"rv": RecursiveVariant(8)} c_row, new_table = schema.coerce_row("rec_variant", None, row) # this variant keeps expanding until the value is 1, we start from 8 so there are log2(8) == 3 divisions - col_name = "rv" + "__v_div2"*3 + col_name = "rv" + "__v_div2" * 3 assert c_row[col_name] == 1 assert new_table["columns"][col_name]["data_type"] == "bigint" assert new_table["columns"][col_name]["variant"] is True def test_supports_variant_autovariant_conflict(schema: Schema) -> None: - class PureVariant(int): def __init__(self, v: Any) -> None: self.v = v @@ -319,7 +343,7 @@ def __call__(self) -> Any: if isinstance(self.v, float): return ("text", self.v) - assert issubclass(PureVariant,int) + assert issubclass(PureVariant, int) rows = [{"pv": PureVariant(3377)}, {"pv": PureVariant(21.37)}] normalized_rows: List[Any] = [] for row in rows: @@ -413,9 +437,13 @@ def test_update_schema_table_prop_conflict(schema: Schema) -> None: def test_update_schema_column_conflict(schema: Schema) -> None: - tab1 = utils.new_table("tab1", write_disposition="append", columns=[ - {"name": "col1", "data_type": "text", "nullable": False}, - ]) + tab1 = utils.new_table( + "tab1", + write_disposition="append", + columns=[ + {"name": "col1", "data_type": "text", "nullable": False}, + ], + ) schema.update_table(tab1) tab1_u1 = deepcopy(tab1) # simulate column that had other datatype inferred @@ -508,15 +536,20 @@ def test_infer_on_incomplete_column(schema: Schema) -> None: schema.update_table(table) # make sure that column is still incomplete and has no default hints assert schema.get_table("table")["columns"]["I"] == { - 'name': 'I', - 'nullable': False, - 'primary_key': True, - 'x-special': 'spec' + "name": "I", + "nullable": False, + "primary_key": True, + "x-special": "spec", } timestamp_float = 78172.128 # add new column with preferred - row_1 = {"timestamp": timestamp_float, "confidence": "0.1", "I": "0xFF", "number": Decimal("128.67")} + row_1 = { + "timestamp": timestamp_float, + "confidence": "0.1", + "I": "0xFF", + "number": Decimal("128.67"), + } _, new_table = schema.coerce_row("table", None, row_1) assert "I" in new_table["columns"] i_column = new_table["columns"]["I"] diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 2eb903f041..0bb7818b31 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -2,38 +2,38 @@ from copy import copy, deepcopy from dlt.common.schema import Schema, utils -from dlt.common.schema.exceptions import CannotCoerceColumnException, CannotCoerceNullException, TablePropertiesConflictException +from dlt.common.schema.exceptions import ( + CannotCoerceColumnException, + CannotCoerceNullException, + TablePropertiesConflictException, +) from dlt.common.schema.typing import TStoredSchema, TTableSchema, TColumnSchema COL_1_HINTS: TColumnSchema = { # type: ignore[typeddict-unknown-key] - "cluster": False, - "foreign_key": True, - "data_type": "text", - "name": "test", - "x-special": True, - "x-special-int": 100, - "nullable": False, - "x-special-bool": False, - "prop": None - } + "cluster": False, + "foreign_key": True, + "data_type": 
"text", + "name": "test", + "x-special": True, + "x-special-int": 100, + "nullable": False, + "x-special-bool": False, + "prop": None, +} COL_1_HINTS_DEFAULTS: TColumnSchema = { # type: ignore[typeddict-unknown-key] - 'foreign_key': True, - 'data_type': 'text', - 'name': 'test', - 'x-special': True, - 'x-special-int': 100, - 'nullable': False, - "x-special-bool": False, - } - -COL_2_HINTS: TColumnSchema = { - "nullable": True, - "name": "test_2", - "primary_key": False + "foreign_key": True, + "data_type": "text", + "name": "test", + "x-special": True, + "x-special-int": 100, + "nullable": False, + "x-special-bool": False, } +COL_2_HINTS: TColumnSchema = {"nullable": True, "name": "test_2", "primary_key": False} + def test_check_column_defaults() -> None: assert utils.has_default_column_hint_value("data_type", "text") is False @@ -77,21 +77,17 @@ def test_remove_defaults_stored_schema() -> None: "description": "description", "resource": "🦚Table", "x-special": 128, - "columns": { - "test": COL_1_HINTS, - "test_2": COL_2_HINTS - } + "columns": {"test": COL_1_HINTS, "test_2": COL_2_HINTS}, } stored_schema: TStoredSchema = { # type: ignore[typeddict-unknown-key] "name": "schema", - "tables": { - "table": deepcopy(table), - "table_copy": deepcopy(table) - }, - "x-top-level": True + "tables": {"table": deepcopy(table), "table_copy": deepcopy(table)}, + "x-top-level": True, } # mock the case in table_copy where resource == table_name - stored_schema["tables"]["table_copy"]["resource"] = stored_schema["tables"]["table_copy"]["name"] = "table_copy" + stored_schema["tables"]["table_copy"]["resource"] = stored_schema["tables"]["table_copy"][ + "name" + ] = "table_copy" default_stored = utils.remove_defaults(stored_schema) # nullability always present @@ -141,13 +137,13 @@ def test_merge_columns() -> None: assert col_a == { "name": "test_2", "nullable": False, - 'cluster': False, - 'foreign_key': True, - 'data_type': 'text', - 'x-special': True, - 'x-special-int': 100, - 'x-special-bool': False, - 'prop': None + "cluster": False, + "foreign_key": True, + "data_type": "text", + "x-special": True, + "x-special-int": 100, + "x-special-bool": False, + "prop": None, } col_a = utils.merge_columns(copy(COL_1_HINTS), copy(COL_2_HINTS), merge_defaults=True) @@ -155,14 +151,14 @@ def test_merge_columns() -> None: assert col_a == { "name": "test_2", "nullable": True, - 'cluster': False, - 'foreign_key': True, - 'data_type': 'text', - 'x-special': True, - 'x-special-int': 100, - 'x-special-bool': False, - 'prop': None, - 'primary_key': False + "cluster": False, + "foreign_key": True, + "data_type": "text", + "x-special": True, + "x-special-int": 100, + "x-special-bool": False, + "prop": None, + "primary_key": False, } @@ -172,10 +168,7 @@ def test_diff_tables() -> None: "description": "description", "resource": "🦚Table", "x-special": 128, - "columns": { - "test": COL_1_HINTS, - "test_2": COL_2_HINTS - } + "columns": {"test": COL_1_HINTS, "test_2": COL_2_HINTS}, } empty = utils.new_table("table") del empty["resource"] @@ -193,11 +186,7 @@ def test_diff_tables() -> None: changed["name"] = "new name" partial = utils.diff_tables(deepcopy(table), changed) print(partial) - assert partial == { - "name": "new name", - "description": "new description", - "columns": {} - } + assert partial == {"name": "new name", "description": "new description", "columns": {}} # ignore identical table props existing = deepcopy(table) @@ -209,16 +198,12 @@ def test_diff_tables() -> None: "description": "new description", 
"write_disposition": "append", "schema_contract": "freeze", - "columns": {} + "columns": {}, } existing["write_disposition"] = "append" existing["schema_contract"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) - assert partial == { - "name": "new name", - "description": "new description", - "columns": {} - } + assert partial == {"name": "new name", "description": "new description", "columns": {}} # detect changed column existing = deepcopy(table) @@ -252,10 +237,7 @@ def test_diff_tables_conflicts() -> None: "parent": "parent", "description": "description", "x-special": 128, - "columns": { - "test": COL_1_HINTS, - "test_2": COL_2_HINTS - } + "columns": {"test": COL_1_HINTS, "test_2": COL_2_HINTS}, } other = utils.new_table("table_2") @@ -277,10 +259,7 @@ def test_merge_tables() -> None: "description": "description", "resource": "🦚Table", "x-special": 128, - "columns": { - "test": COL_1_HINTS, - "test_2": COL_2_HINTS - } + "columns": {"test": COL_1_HINTS, "test_2": COL_2_HINTS}, } changed = deepcopy(table) changed["x-special"] = 129 # type: ignore[typeddict-unknown-key] diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index a42018f97b..fbcce66ae1 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -13,8 +13,18 @@ from dlt.common.typing import DictStrAny, StrAny from dlt.common.utils import uniq_id from dlt.common.schema import TColumnSchema, Schema, TStoredSchema, utils, TColumnHint -from dlt.common.schema.exceptions import InvalidSchemaName, ParentTableNotFoundException, SchemaEngineNoUpgradePathException -from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME, TColumnName, TSimpleRegex, COLUMN_HINTS +from dlt.common.schema.exceptions import ( + InvalidSchemaName, + ParentTableNotFoundException, + SchemaEngineNoUpgradePathException, +) +from dlt.common.schema.typing import ( + LOADS_TABLE_NAME, + VERSION_TABLE_NAME, + TColumnName, + TSimpleRegex, + COLUMN_HINTS, +) from dlt.common.storages import SchemaStorage from tests.utils import autouse_test_storage, preserve_environ @@ -30,17 +40,15 @@ def schema_storage() -> SchemaStorage: SchemaStorageConfiguration(), explicit_value={ "import_schema_path": "tests/common/cases/schemas/rasa", - "external_schema_format": "json" - } + "external_schema_format": "json", + }, ) return SchemaStorage(C, makedirs=True) @pytest.fixture def schema_storage_no_import() -> SchemaStorage: - C = resolve_configuration( - SchemaStorageConfiguration() - ) + C = resolve_configuration(SchemaStorageConfiguration()) return SchemaStorage(C, makedirs=True) @@ -51,15 +59,16 @@ def schema() -> Schema: @pytest.fixture def cn_schema() -> Schema: - return Schema("column_default", { - "names": "tests.common.normalizers.custom_normalizers", - "json": { - "module": "tests.common.normalizers.custom_normalizers", - "config": { - "not_null": ["fake_id"] - } - } - }) + return Schema( + "column_default", + { + "names": "tests.common.normalizers.custom_normalizers", + "json": { + "module": "tests.common.normalizers.custom_normalizers", + "config": {"not_null": ["fake_id"]}, + }, + }, + ) def test_normalize_schema_name(schema: Schema) -> None: @@ -118,7 +127,9 @@ def test_simple_regex_validator() -> None: assert utils.simple_regex_validator(".", "k", "v", TSimpleRegex) is True # validate regex - assert utils.simple_regex_validator(".", "k", TSimpleRegex("re:^_record$"), TSimpleRegex) is True + assert ( + utils.simple_regex_validator(".", "k", TSimpleRegex("re:^_record$"), 
TSimpleRegex) is True + ) # invalid regex with pytest.raises(DictValidationException) as e: utils.simple_regex_validator(".", "k", "re:[[^_record$", TSimpleRegex) @@ -169,7 +180,7 @@ def test_schema_name() -> None: Schema("1_a") # too long with pytest.raises(InvalidSchemaName) as exc: - Schema("a"*65) + Schema("a" * 65) def test_create_schema_with_normalize_name() -> None: @@ -178,10 +189,15 @@ def test_create_schema_with_normalize_name() -> None: def test_schema_descriptions_and_annotations(schema_storage: SchemaStorage): - schema = SchemaStorage.load_schema_file(os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "event", extensions=("yaml", )) + schema = SchemaStorage.load_schema_file( + os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "event", extensions=("yaml",) + ) assert schema.tables["blocks"]["description"] == "Ethereum blocks" assert schema.tables["blocks"]["x-annotation"] == "this will be preserved on save" # type: ignore[typeddict-item] - assert schema.tables["blocks"]["columns"]["_dlt_load_id"]["description"] == "load id coming from the extractor" + assert ( + schema.tables["blocks"]["columns"]["_dlt_load_id"]["description"] + == "load id coming from the extractor" + ) assert schema.tables["blocks"]["columns"]["_dlt_load_id"]["x-column-annotation"] == "column annotation preserved on save" # type: ignore[typeddict-item] # mod and save @@ -194,7 +210,9 @@ def test_schema_descriptions_and_annotations(schema_storage: SchemaStorage): loaded_schema = schema_storage.load_schema("event") assert loaded_schema.tables["blocks"]["description"].endswith("Saved") assert loaded_schema.tables["blocks"]["x-annotation"].endswith("Saved") # type: ignore[typeddict-item] - assert loaded_schema.tables["blocks"]["columns"]["_dlt_load_id"]["description"].endswith("Saved") + assert loaded_schema.tables["blocks"]["columns"]["_dlt_load_id"]["description"].endswith( + "Saved" + ) assert loaded_schema.tables["blocks"]["columns"]["_dlt_load_id"]["x-column-annotation"].endswith("Saved") # type: ignore[typeddict-item] @@ -219,12 +237,21 @@ def test_replace_schema_content() -> None: assert schema.version_hash != schema.stored_version_hash -@pytest.mark.parametrize("columns,hint,value", [ - (["_dlt_id", "_dlt_root_id", "_dlt_load_id", "_dlt_parent_id", "_dlt_list_idx"], "nullable", False), - (["_dlt_id"], "unique", True), - (["_dlt_parent_id"], "foreign_key", True), -]) -def test_relational_normalizer_schema_hints(columns: Sequence[str], hint: str, value: bool, schema_storage: SchemaStorage) -> None: +@pytest.mark.parametrize( + "columns,hint,value", + [ + ( + ["_dlt_id", "_dlt_root_id", "_dlt_load_id", "_dlt_parent_id", "_dlt_list_idx"], + "nullable", + False, + ), + (["_dlt_id"], "unique", True), + (["_dlt_parent_id"], "foreign_key", True), + ], +) +def test_relational_normalizer_schema_hints( + columns: Sequence[str], hint: str, value: bool, schema_storage: SchemaStorage +) -> None: schema = schema_storage.load_schema("event") for name in columns: # infer column hints @@ -249,13 +276,17 @@ def test_save_store_schema(schema: Schema, schema_storage: SchemaStorage) -> Non assert_new_schema_values(schema_copy) -def test_save_store_schema_custom_normalizers(cn_schema: Schema, schema_storage: SchemaStorage) -> None: +def test_save_store_schema_custom_normalizers( + cn_schema: Schema, schema_storage: SchemaStorage +) -> None: schema_storage.save_schema(cn_schema) schema_copy = schema_storage.load_schema(cn_schema.name) assert_new_schema_values_custom_normalizers(schema_copy) -def 
test_save_load_incomplete_column(schema: Schema, schema_storage_no_import: SchemaStorage) -> None: +def test_save_load_incomplete_column( + schema: Schema, schema_storage_no_import: SchemaStorage +) -> None: # make sure that incomplete column is saved and restored without default hints incomplete_col = utils.new_column("I", nullable=False) incomplete_col["primary_key"] = True @@ -265,10 +296,10 @@ def test_save_load_incomplete_column(schema: Schema, schema_storage_no_import: S schema_storage_no_import.save_schema(schema) schema_copy = schema_storage_no_import.load_schema("event") assert schema_copy.get_table("table")["columns"]["I"] == { - 'name': 'I', - 'nullable': False, - 'primary_key': True, - 'x-special': 'spec' + "name": "I", + "nullable": False, + "primary_key": True, + "x-special": "spec", } @@ -306,7 +337,6 @@ def test_upgrade_engine_v1_schema() -> None: upgraded = utils.migrate_schema(schema_dict, from_engine=1, to_engine=7) assert upgraded["engine_version"] == 7 - # upgrade 1 -> 8 schema_dict = load_json_case("schemas/ev1/event.schema") assert schema_dict["engine_version"] == 1 @@ -324,7 +354,9 @@ def test_unknown_engine_upgrade() -> None: def test_preserve_column_order(schema: Schema, schema_storage: SchemaStorage) -> None: # python dicts are ordered from v3.6, add 50 column with random names - update: List[TColumnSchema] = [schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50)] + update: List[TColumnSchema] = [ + schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50) + ] schema.update_table(utils.new_table("event_test_order", columns=update)) def verify_items(table, update) -> None: @@ -339,7 +371,9 @@ def verify_items(table, update) -> None: table = loaded_schema.get_table_columns("event_test_order") verify_items(table, update) # add more columns - update2: List[TColumnSchema] = [schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50)] + update2: List[TColumnSchema] = [ + schema._infer_column(uniq_id(), pendulum.now().timestamp()) for _ in range(50) + ] loaded_schema.update_table(utils.new_table("event_test_order", columns=update2)) table = loaded_schema.get_table_columns("event_test_order") verify_items(table, update + update2) @@ -347,7 +381,7 @@ def verify_items(table, update) -> None: schema_storage.save_schema(loaded_schema) loaded_schema = schema_storage.load_schema("event") table = loaded_schema.get_table_columns("event_test_order") - verify_items(table, update + update2) + verify_items(table, update + update2) def test_get_schema_new_exist(schema_storage: SchemaStorage) -> None: @@ -355,16 +389,35 @@ def test_get_schema_new_exist(schema_storage: SchemaStorage) -> None: schema_storage.load_schema("wrongschema") -@pytest.mark.parametrize("columns,hint,value", [ - (["timestamp", "_timestamp", "_dist_key", "_dlt_id", "_dlt_root_id", "_dlt_load_id", "_dlt_parent_id", "_dlt_list_idx", "sender_id"], "nullable", False), - (["confidence", "_sender_id"], "nullable", True), - (["timestamp", "_timestamp"], "partition", True), - (["_dist_key", "sender_id"], "cluster", True), - (["_dlt_id"], "unique", True), - (["_dlt_parent_id"], "foreign_key", True), - (["timestamp", "_timestamp"], "sort", True), -]) -def test_rasa_event_hints(columns: Sequence[str], hint: str, value: bool, schema_storage: SchemaStorage) -> None: +@pytest.mark.parametrize( + "columns,hint,value", + [ + ( + [ + "timestamp", + "_timestamp", + "_dist_key", + "_dlt_id", + "_dlt_root_id", + "_dlt_load_id", + "_dlt_parent_id", + "_dlt_list_idx", + 
"sender_id", + ], + "nullable", + False, + ), + (["confidence", "_sender_id"], "nullable", True), + (["timestamp", "_timestamp"], "partition", True), + (["_dist_key", "sender_id"], "cluster", True), + (["_dlt_id"], "unique", True), + (["_dlt_parent_id"], "foreign_key", True), + (["timestamp", "_timestamp"], "sort", True), + ], +) +def test_rasa_event_hints( + columns: Sequence[str], hint: str, value: bool, schema_storage: SchemaStorage +) -> None: schema = schema_storage.load_schema("event") for name in columns: # infer column hints @@ -432,10 +485,16 @@ def test_merge_hints(schema: Schema) -> None: schema._settings["default_hints"] = {} schema._compiled_hints = {} new_hints = { - "not_null": ["_dlt_id", "_dlt_root_id", "_dlt_parent_id", "_dlt_list_idx", "re:^_dlt_load_id$"], - "foreign_key": ["re:^_dlt_parent_id$"], - "unique": ["re:^_dlt_id$"] - } + "not_null": [ + "_dlt_id", + "_dlt_root_id", + "_dlt_parent_id", + "_dlt_list_idx", + "re:^_dlt_load_id$", + ], + "foreign_key": ["re:^_dlt_parent_id$"], + "unique": ["re:^_dlt_id$"], + } schema.merge_hints(new_hints) # type: ignore[arg-type] assert schema._settings["default_hints"] == new_hints @@ -446,17 +505,21 @@ def test_merge_hints(schema: Schema) -> None: assert set(new_hints[k]) == set(schema._settings["default_hints"][k]) # type: ignore[index] # add new stuff - new_new_hints = { - "not_null": ["timestamp"], - "primary_key": ["id"] - } + new_new_hints = {"not_null": ["timestamp"], "primary_key": ["id"]} schema.merge_hints(new_new_hints) # type: ignore[arg-type] expected_hints = { - "not_null": ["_dlt_id", "_dlt_root_id", "_dlt_parent_id", "_dlt_list_idx", "re:^_dlt_load_id$", "timestamp"], - "foreign_key": ["re:^_dlt_parent_id$"], - "unique": ["re:^_dlt_id$"], - "primary_key": ["id"] - } + "not_null": [ + "_dlt_id", + "_dlt_root_id", + "_dlt_parent_id", + "_dlt_list_idx", + "re:^_dlt_load_id$", + "timestamp", + ], + "foreign_key": ["re:^_dlt_parent_id$"], + "unique": ["re:^_dlt_id$"], + "primary_key": ["id"], + } assert len(expected_hints) == len(schema._settings["default_hints"]) for k in expected_hints: assert set(expected_hints[k]) == set(schema._settings["default_hints"][k]) # type: ignore[index] @@ -467,8 +530,8 @@ def test_default_table_resource() -> None: eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") tables = Schema.from_dict(eth_v5).tables - assert tables['blocks']['resource'] == 'blocks' - assert all([t.get('resource') is None for t in tables.values() if t.get('parent')]) + assert tables["blocks"]["resource"] == "blocks" + assert all([t.get("resource") is None for t in tables.values() if t.get("parent")]) def test_data_tables(schema: Schema, schema_storage: SchemaStorage) -> None: @@ -478,8 +541,10 @@ def test_data_tables(schema: Schema, schema_storage: SchemaStorage) -> None: # with tables schema = schema_storage.load_schema("event") # some of them are incomplete - assert set(schema.tables.keys()) == set([LOADS_TABLE_NAME, VERSION_TABLE_NAME, 'event_slot', 'event_user', 'event_bot']) - assert [t["name"] for t in schema.data_tables()] == ['event_slot'] + assert set(schema.tables.keys()) == set( + [LOADS_TABLE_NAME, VERSION_TABLE_NAME, "event_slot", "event_user", "event_bot"] + ) + assert [t["name"] for t in schema.data_tables()] == ["event_slot"] def test_write_disposition(schema_storage: SchemaStorage) -> None: @@ -504,28 +569,39 @@ def test_write_disposition(schema_storage: SchemaStorage) -> None: def test_compare_columns() -> None: - table = utils.new_table("test_table", columns=[ - {"name": "col1", "data_type": 
"text", "nullable": True}, - {"name": "col2", "data_type": "text", "nullable": False}, - {"name": "col3", "data_type": "timestamp", "nullable": True}, - {"name": "col4", "data_type": "timestamp", "nullable": True} - ]) - table2 = utils.new_table("test_table", columns=[ - {"name": "col1", "data_type": "text", "nullable": False} - ]) + table = utils.new_table( + "test_table", + columns=[ + {"name": "col1", "data_type": "text", "nullable": True}, + {"name": "col2", "data_type": "text", "nullable": False}, + {"name": "col3", "data_type": "timestamp", "nullable": True}, + {"name": "col4", "data_type": "timestamp", "nullable": True}, + ], + ) + table2 = utils.new_table( + "test_table", columns=[{"name": "col1", "data_type": "text", "nullable": False}] + ) # columns identical with self for c in table["columns"].values(): assert utils.compare_complete_columns(c, c) is True - assert utils.compare_complete_columns(table["columns"]["col3"], table["columns"]["col4"]) is False + assert ( + utils.compare_complete_columns(table["columns"]["col3"], table["columns"]["col4"]) is False + ) # data type may not differ - assert utils.compare_complete_columns(table["columns"]["col1"], table["columns"]["col3"]) is False + assert ( + utils.compare_complete_columns(table["columns"]["col1"], table["columns"]["col3"]) is False + ) # nullability may differ - assert utils.compare_complete_columns(table["columns"]["col1"], table2["columns"]["col1"]) is True + assert ( + utils.compare_complete_columns(table["columns"]["col1"], table2["columns"]["col1"]) is True + ) # any of the hints may differ for hint in COLUMN_HINTS: table["columns"]["col3"][hint] = True # type: ignore[typeddict-unknown-key] # name may not differ - assert utils.compare_complete_columns(table["columns"]["col3"], table["columns"]["col4"]) is False + assert ( + utils.compare_complete_columns(table["columns"]["col3"], table["columns"]["col4"]) is False + ) def test_normalize_table_identifiers() -> None: @@ -536,24 +612,16 @@ def test_normalize_table_identifiers() -> None: issues_table = deepcopy(schema.tables["issues"]) # this schema is already normalized so normalization is idempotent assert schema.tables["issues"] == schema.normalize_table_identifiers(issues_table) - assert schema.tables["issues"] == schema.normalize_table_identifiers(schema.normalize_table_identifiers(issues_table)) + assert schema.tables["issues"] == schema.normalize_table_identifiers( + schema.normalize_table_identifiers(issues_table) + ) def test_normalize_table_identifiers_merge_columns() -> None: # create conflicting columns table_create = [ - { - "name": "case", - "data_type": "bigint", - "nullable": False, - "x-description": "desc" - }, - { - "name": "Case", - "data_type": "double", - "nullable": True, - "primary_key": True - }, + {"name": "case", "data_type": "bigint", "nullable": False, "x-description": "desc"}, + {"name": "Case", "data_type": "double", "nullable": True, "primary_key": True}, ] # schema normalizing to snake case will conflict on case and Case table = utils.new_table("blend", columns=table_create) # type: ignore[arg-type] @@ -561,18 +629,21 @@ def test_normalize_table_identifiers_merge_columns() -> None: # only one column assert len(norm_table["columns"]) == 1 assert norm_table["columns"]["case"] == { - 'nullable': False, # remove default, preserve non default - 'primary_key': True, - 'name': 'case', - 'data_type': 'double', - 'x-description': 'desc' + "nullable": False, # remove default, preserve non default + "primary_key": True, + "name": "case", + 
"data_type": "double", + "x-description": "desc", } def assert_new_schema_values_custom_normalizers(schema: Schema) -> None: # check normalizers config assert schema._normalizers_config["names"] == "tests.common.normalizers.custom_normalizers" - assert schema._normalizers_config["json"]["module"] == "tests.common.normalizers.custom_normalizers" + assert ( + schema._normalizers_config["json"]["module"] + == "tests.common.normalizers.custom_normalizers" + ) # check if schema was extended by json normalizer assert ["fake_id"] == schema.settings["default_hints"]["not_null"] # call normalizers @@ -595,13 +666,17 @@ def assert_new_schema_values(schema: Schema) -> None: assert schema._stored_previous_hashes == [] assert len(schema.settings["default_hints"]) > 0 # check settings - assert utils.standard_type_detections() == schema.settings["detections"] == schema._type_detections + assert ( + utils.standard_type_detections() == schema.settings["detections"] == schema._type_detections + ) # check normalizers config assert schema._normalizers_config["names"] == "snake_case" assert schema._normalizers_config["json"]["module"] == "dlt.common.normalizers.json.relational" assert isinstance(schema.naming, snake_case.NamingConvention) # check if schema was extended by json normalizer - assert set(["_dlt_id", "_dlt_root_id", "_dlt_parent_id", "_dlt_list_idx", "_dlt_load_id"]).issubset(schema.settings["default_hints"]["not_null"]) + assert set( + ["_dlt_id", "_dlt_root_id", "_dlt_parent_id", "_dlt_list_idx", "_dlt_load_id"] + ).issubset(schema.settings["default_hints"]["not_null"]) # call normalizers assert schema.naming.normalize_identifier("A") == "a" assert schema.naming.normalize_path("A__B") == "a__b" @@ -624,35 +699,62 @@ def test_group_tables_by_resource(schema: Schema) -> None: schema.update_table(utils.new_table("b_events", columns=[])) schema.update_table(utils.new_table("c_products", columns=[], resource="products")) schema.update_table(utils.new_table("a_events__1", columns=[], parent_table_name="a_events")) - schema.update_table(utils.new_table("a_events__1__2", columns=[], parent_table_name="a_events__1")) + schema.update_table( + utils.new_table("a_events__1__2", columns=[], parent_table_name="a_events__1") + ) schema.update_table(utils.new_table("b_events__1", columns=[], parent_table_name="b_events")) # All resources without filter expected_tables = { - "a_events": [schema.tables["a_events"], schema.tables["a_events__1"], schema.tables["a_events__1__2"]], + "a_events": [ + schema.tables["a_events"], + schema.tables["a_events__1"], + schema.tables["a_events__1__2"], + ], "b_events": [schema.tables["b_events"], schema.tables["b_events__1"]], "products": [schema.tables["c_products"]], "_dlt_version": [schema.tables["_dlt_version"]], - "_dlt_loads": [schema.tables["_dlt_loads"]] + "_dlt_loads": [schema.tables["_dlt_loads"]], } result = utils.group_tables_by_resource(schema.tables) assert result == expected_tables # With resource filter - result = utils.group_tables_by_resource(schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("re:[a-z]_events"))) + result = utils.group_tables_by_resource( + schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("re:[a-z]_events")) + ) assert result == { - "a_events": [schema.tables["a_events"], schema.tables["a_events__1"], schema.tables["a_events__1__2"]], + "a_events": [ + schema.tables["a_events"], + schema.tables["a_events__1"], + schema.tables["a_events__1__2"], + ], "b_events": [schema.tables["b_events"], schema.tables["b_events__1"]], 
} # With resources that has many top level tables schema.update_table(utils.new_table("mc_products", columns=[], resource="products")) - schema.update_table(utils.new_table("mc_products__sub", columns=[], parent_table_name="mc_products")) - result = utils.group_tables_by_resource(schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("products"))) + schema.update_table( + utils.new_table("mc_products__sub", columns=[], parent_table_name="mc_products") + ) + result = utils.group_tables_by_resource( + schema.tables, pattern=utils.compile_simple_regex(TSimpleRegex("products")) + ) # both tables with resource "products" must be here - assert result == {'products': [ - {'columns': {}, 'name': 'c_products', 'resource': 'products', 'write_disposition': 'append'}, - {'columns': {}, 'name': 'mc_products', 'resource': 'products', 'write_disposition': 'append'}, - {'columns': {}, 'name': 'mc_products__sub', 'parent': 'mc_products'} + assert result == { + "products": [ + { + "columns": {}, + "name": "c_products", + "resource": "products", + "write_disposition": "append", + }, + { + "columns": {}, + "name": "mc_products", + "resource": "products", + "write_disposition": "append", + }, + {"columns": {}, "name": "mc_products__sub", "parent": "mc_products"}, ] } diff --git a/tests/common/schema/test_schema_contract.py b/tests/common/schema/test_schema_contract.py index 2f6b4743f3..7fdeed5408 100644 --- a/tests/common/schema/test_schema_contract.py +++ b/tests/common/schema/test_schema_contract.py @@ -7,19 +7,13 @@ from dlt.common.schema.exceptions import DataValidationError from dlt.common.schema.typing import TTableSchema + def get_schema() -> Schema: s = Schema("event") - columns = { - "column_1": { - "name": "column_1", - "data_type": "text" - }, - "column_2": { - "name": "column_2", - "data_type": "bigint", - "is_variant": True - } + columns = { + "column_1": {"name": "column_1", "data_type": "text"}, + "column_2": {"name": "column_2", "data_type": "bigint", "is_variant": True}, } incomplete_columns = { @@ -28,43 +22,37 @@ def get_schema() -> Schema: }, "incomplete_column_2": { "name": "incomplete_column_2", - } + }, } - # add some tables - s.update_table(cast(TTableSchema, { - "name": "tables", - "columns": columns - })) + s.update_table(cast(TTableSchema, {"name": "tables", "columns": columns})) - s.update_table(cast(TTableSchema, { - "name": "child_table", - "parent": "tables", - "columns": columns - })) + s.update_table( + cast(TTableSchema, {"name": "child_table", "parent": "tables", "columns": columns}) + ) - s.update_table(cast(TTableSchema, { - "name": "incomplete_table", - "columns": incomplete_columns - })) + s.update_table(cast(TTableSchema, {"name": "incomplete_table", "columns": incomplete_columns})) - s.update_table(cast(TTableSchema, { - "name": "mixed_table", - "columns": {**incomplete_columns, **columns} - })) + s.update_table( + cast(TTableSchema, {"name": "mixed_table", "columns": {**incomplete_columns, **columns}}) + ) - s.update_table(cast(TTableSchema, { - "name": "evolve_once_table", - "x-normalizer": {"evolve-columns-once": True}, - "columns": {**incomplete_columns, **columns} - })) + s.update_table( + cast( + TTableSchema, + { + "name": "evolve_once_table", + "x-normalizer": {"evolve-columns-once": True}, + "columns": {**incomplete_columns, **columns}, + }, + ) + ) return s def test_resolve_contract_settings() -> None: - # defaults schema = get_schema() assert schema.resolve_contract_settings_for_table("tables") == DEFAULT_SCHEMA_CONTRACT_MODE @@ -76,12 +64,12 @@ def 
test_resolve_contract_settings() -> None: assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "freeze", - "data_type": "freeze" + "data_type": "freeze", } assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "freeze", - "data_type": "freeze" + "data_type": "freeze", } # table specific single setting @@ -93,12 +81,12 @@ def test_resolve_contract_settings() -> None: assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "discard_value", - "data_type": "evolve" + "data_type": "evolve", } assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "discard_value", - "data_type": "evolve" + "data_type": "evolve", } # schema specific full setting @@ -107,12 +95,12 @@ def test_resolve_contract_settings() -> None: assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "freeze", - "data_type": "freeze" + "data_type": "freeze", } assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "freeze", - "data_type": "freeze" + "data_type": "freeze", } # schema specific single setting @@ -124,12 +112,12 @@ def test_resolve_contract_settings() -> None: assert schema.resolve_contract_settings_for_table("tables") == { "tables": "freeze", "columns": "discard_value", - "data_type": "evolve" + "data_type": "evolve", } assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "freeze", "columns": "discard_value", - "data_type": "evolve" + "data_type": "evolve", } # mixed settings: table setting always prevails @@ -142,39 +130,26 @@ def test_resolve_contract_settings() -> None: assert schema.resolve_contract_settings_for_table("tables") == { "tables": "evolve", "columns": "discard_value", - "data_type": "evolve" + "data_type": "evolve", } assert schema.resolve_contract_settings_for_table("child_table") == { "tables": "evolve", "columns": "discard_value", - "data_type": "evolve" + "data_type": "evolve", } # ensure other settings do not interfere with the main setting we are testing -base_settings = [{ - "tables": "evolve", - "columns": "evolve", - "data_type": "evolve" - }, { - "tables": "discard_row", - "columns": "discard_row", - "data_type": "discard_row" - }, { - "tables": "discard_value", - "columns": "discard_value", - "data_type": "discard_value" - }, { - "tables": "freeze", - "columns": "freeze", - "data_type": "freeze" - } +base_settings = [ + {"tables": "evolve", "columns": "evolve", "data_type": "evolve"}, + {"tables": "discard_row", "columns": "discard_row", "data_type": "discard_row"}, + {"tables": "discard_value", "columns": "discard_value", "data_type": "discard_value"}, + {"tables": "freeze", "columns": "freeze", "data_type": "freeze"}, ] @pytest.mark.parametrize("base_settings", base_settings) def test_check_adding_table(base_settings) -> None: - schema = get_schema() new_table = copy.deepcopy(schema.tables["tables"]) new_table["name"] = "new_table" @@ -182,17 +157,31 @@ def test_check_adding_table(base_settings) -> None: # # check adding new table # - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), new_table) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), new_table + ) assert (partial, filters) == (new_table, []) - partial, filters = 
schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), new_table) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), new_table + ) assert (partial, filters) == (None, [("tables", "new_table", "discard_row")]) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), new_table) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), new_table + ) assert (partial, filters) == (None, [("tables", "new_table", "discard_value")]) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, raise_on_freeze=False) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), + new_table, + raise_on_freeze=False, + ) assert (partial, filters) == (None, [("tables", "new_table", "freeze")]) with pytest.raises(DataValidationError) as val_ex: - schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, data_item={"item": 1}) + schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), + new_table, + data_item={"item": 1}, + ) assert val_ex.value.schema_name == schema.name assert val_ex.value.table_name == "new_table" assert val_ex.value.column_name is None @@ -206,22 +195,44 @@ def test_check_adding_table(base_settings) -> None: def test_check_adding_new_columns(base_settings) -> None: schema = get_schema() - def assert_new_column(table_update: TTableSchema, column_name: str) -> None: popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop(column_name) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(table_update)) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), + copy.deepcopy(table_update), + ) assert (partial, filters) == (table_update, []) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(table_update)) - assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_row")]) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(table_update)) - assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_value")]) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), + copy.deepcopy(table_update), + ) + assert (partial, filters) == ( + popped_table_update, + [("columns", column_name, "discard_row")], + ) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), + copy.deepcopy(table_update), + ) + assert (partial, filters) == ( + popped_table_update, + [("columns", column_name, "discard_value")], + ) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, 
{**base_settings, **{"columns": "freeze"}}), + copy.deepcopy(table_update), + raise_on_freeze=False, + ) assert (partial, filters) == (popped_table_update, [("columns", column_name, "freeze")]) with pytest.raises(DataValidationError) as val_ex: - schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), {column_name: 1}) + schema.apply_schema_contract( + cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), + copy.deepcopy(table_update), + {column_name: 1}, + ) assert val_ex.value.schema_name == schema.name assert val_ex.value.table_name == table_update["name"] assert val_ex.value.column_name == column_name @@ -235,12 +246,7 @@ def assert_new_column(table_update: TTableSchema, column_name: str) -> None: # table_update: TTableSchema = { "name": "tables", - "columns": { - "new_column": { - "name": "new_column", - "data_type": "text" - } - } + "columns": {"new_column": {"name": "new_column", "data_type": "text"}}, } assert_new_column(table_update, "new_column") @@ -253,7 +259,7 @@ def assert_new_column(table_update: TTableSchema, column_name: str) -> None: "incomplete_column_1": { "name": "incomplete_column_1", } - } + }, } assert_new_column(table_update, "incomplete_column_1") @@ -263,14 +269,11 @@ def assert_new_column(table_update: TTableSchema, column_name: str) -> None: table_update = { "name": "evolve_once_table", "columns": { - "new_column": { - "name": "new_column", - "data_type": "text" - }, + "new_column": {"name": "new_column", "data_type": "text"}, "incomplete_column_1": { "name": "incomplete_column_1", - } - } + }, + }, } partial, filters = schema.apply_schema_contract(base_settings, copy.deepcopy(table_update)) assert (partial, filters) == (table_update, []) @@ -285,27 +288,47 @@ def test_check_adding_new_variant() -> None: table_update: TTableSchema = { "name": "tables", "columns": { - "column_2_variant": { - "name": "column_2_variant", - "data_type": "bigint", - "variant": True - } - } + "column_2_variant": {"name": "column_2_variant", "data_type": "bigint", "variant": True} + }, } popped_table_update = copy.deepcopy(table_update) popped_table_update["columns"].pop("column_2_variant") - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), copy.deepcopy(table_update)) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), + copy.deepcopy(table_update), + ) assert (partial, filters) == (table_update, []) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), copy.deepcopy(table_update)) - assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_row")]) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), copy.deepcopy(table_update)) - assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_value")]) - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), + copy.deepcopy(table_update), + ) + assert 
(partial, filters) == ( + popped_table_update, + [("columns", "column_2_variant", "discard_row")], + ) + partial, filters = schema.apply_schema_contract( + cast( + TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}} + ), + copy.deepcopy(table_update), + ) + assert (partial, filters) == ( + popped_table_update, + [("columns", "column_2_variant", "discard_value")], + ) + partial, filters = schema.apply_schema_contract( + cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), + copy.deepcopy(table_update), + raise_on_freeze=False, + ) assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "freeze")]) with pytest.raises(DataValidationError) as val_ex: - schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) + schema.apply_schema_contract( + cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), + copy.deepcopy(table_update), + ) assert val_ex.value.schema_name == schema.name assert val_ex.value.table_name == table_update["name"] assert val_ex.value.column_name == "column_2_variant" @@ -315,10 +338,19 @@ def test_check_adding_new_variant() -> None: assert val_ex.value.data_item is None # we do not pass it to apply_schema_contract # variants are not new columns - new data types - partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), copy.deepcopy(table_update)) + partial, filters = schema.apply_schema_contract( + cast( + TSchemaContractDict, + {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}, + ), + copy.deepcopy(table_update), + ) assert (partial, filters) == (table_update, []) # evolve once does not apply to variant evolution table_update["name"] = "evolve_once_table" with pytest.raises(DataValidationError): - schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) + schema.apply_schema_contract( + cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), + copy.deepcopy(table_update), + ) diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 401b463875..5b794f51ee 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -133,7 +133,7 @@ def test_create_ancestry() -> None: # modify save and load schema 15 times and check ancestry expected_previous_hashes = ["yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE="] - for i in range(1,15): + for i in range(1, 15): # keep expected previous_hashes expected_previous_hashes.insert(0, schema._stored_version_hash) @@ -150,4 +150,4 @@ def test_create_ancestry() -> None: # we never have more than 10 previous_hashes assert len(schema._stored_previous_hashes) == i + 1 if i + 1 <= 10 else 10 - assert len(schema._stored_previous_hashes) == 10 \ No newline at end of file + assert len(schema._stored_previous_hashes) == 10 diff --git a/tests/common/scripts/args.py b/tests/common/scripts/args.py index 627daeb76b..67c6cc651a 100644 --- a/tests/common/scripts/args.py +++ b/tests/common/scripts/args.py @@ -1,4 +1,4 @@ import sys print(len(sys.argv)) -print(sys.argv) \ No newline at end of file +print(sys.argv) diff --git a/tests/common/scripts/counter.py b/tests/common/scripts/counter.py index 99352cd1f3..a7fa34dfec 
100644 --- a/tests/common/scripts/counter.py +++ b/tests/common/scripts/counter.py @@ -6,4 +6,4 @@ print(i) sys.stdout.flush() sleep(0.3) -print("exit") \ No newline at end of file +print("exit") diff --git a/tests/common/scripts/cwd.py b/tests/common/scripts/cwd.py index 404cf43ada..ea065561f3 100644 --- a/tests/common/scripts/cwd.py +++ b/tests/common/scripts/cwd.py @@ -1,3 +1,3 @@ import os -print(os.getcwd()) \ No newline at end of file +print(os.getcwd()) diff --git a/tests/common/scripts/long_lines.py b/tests/common/scripts/long_lines.py index ca5469cd4c..0d22c692ba 100644 --- a/tests/common/scripts/long_lines.py +++ b/tests/common/scripts/long_lines.py @@ -10,4 +10,4 @@ # without new lines print(line_b, file=sys.stderr, end="") -print(line_a, end="") \ No newline at end of file +print(line_a, end="") diff --git a/tests/common/scripts/long_lines_fails.py b/tests/common/scripts/long_lines_fails.py index 0633f078e0..37e2f13e31 100644 --- a/tests/common/scripts/long_lines_fails.py +++ b/tests/common/scripts/long_lines_fails.py @@ -11,4 +11,4 @@ # without new lines print(line_b, file=sys.stderr, end="") print(line_a, end="") -exit(-1) \ No newline at end of file +exit(-1) diff --git a/tests/common/scripts/no_stdout_exception.py b/tests/common/scripts/no_stdout_exception.py index 90c71a4551..75bebd8cc7 100644 --- a/tests/common/scripts/no_stdout_exception.py +++ b/tests/common/scripts/no_stdout_exception.py @@ -1 +1 @@ -raise Exception("no stdout") \ No newline at end of file +raise Exception("no stdout") diff --git a/tests/common/scripts/no_stdout_no_stderr_with_fail.py b/tests/common/scripts/no_stdout_no_stderr_with_fail.py index 8e7ef7e83f..d0d1c88de8 100644 --- a/tests/common/scripts/no_stdout_no_stderr_with_fail.py +++ b/tests/common/scripts/no_stdout_no_stderr_with_fail.py @@ -1 +1 @@ -exit(-1) \ No newline at end of file +exit(-1) diff --git a/tests/common/scripts/raising_counter.py b/tests/common/scripts/raising_counter.py index 74c9a53b20..fcc7cbc7d8 100644 --- a/tests/common/scripts/raising_counter.py +++ b/tests/common/scripts/raising_counter.py @@ -8,4 +8,4 @@ if i == 2: raise Exception("end") sleep(0.3) -print("exit") \ No newline at end of file +print("exit") diff --git a/tests/common/scripts/stdout_encode_exception.py b/tests/common/scripts/stdout_encode_exception.py index 57658d431b..c08f812b04 100644 --- a/tests/common/scripts/stdout_encode_exception.py +++ b/tests/common/scripts/stdout_encode_exception.py @@ -5,11 +5,11 @@ from dlt.common.runners.stdout import exec_to_stdout - def worker(data1, data2): print("in func") raise UnsupportedProcessStartMethodException("this") + f = partial(worker, "this is string", TRunMetrics(True, 300)) with exec_to_stdout(f) as rv: print(rv) diff --git a/tests/common/scripts/stdout_encode_result.py b/tests/common/scripts/stdout_encode_result.py index b399734a4d..51c9b553db 100644 --- a/tests/common/scripts/stdout_encode_result.py +++ b/tests/common/scripts/stdout_encode_result.py @@ -8,6 +8,7 @@ def worker(data1, data2): print("in func") return data1, data2 + f = partial(worker, "this is string", TRunMetrics(True, 300)) with exec_to_stdout(f) as rv: print(rv) diff --git a/tests/common/storages/test_file_storage.py b/tests/common/storages/test_file_storage.py index 194fcb9afb..9f212070e8 100644 --- a/tests/common/storages/test_file_storage.py +++ b/tests/common/storages/test_file_storage.py @@ -69,7 +69,10 @@ def test_in_storage(test_storage: FileStorage) -> None: assert test_storage.in_storage(".") is True assert 
test_storage.in_storage(os.curdir) is True assert test_storage.in_storage(os.path.realpath(os.curdir)) is False - assert test_storage.in_storage(os.path.join(os.path.realpath(os.curdir), TEST_STORAGE_ROOT)) is True + assert ( + test_storage.in_storage(os.path.join(os.path.realpath(os.curdir), TEST_STORAGE_ROOT)) + is True + ) def test_from_wd_to_relative_path(test_storage: FileStorage) -> None: @@ -129,31 +132,31 @@ def test_validate_file_name_component() -> None: @pytest.mark.parametrize("action", ("rename_tree_files", "rename_tree", "atomic_rename")) def test_rename_nested_tree(test_storage: FileStorage, action: str) -> None: - source_dir = os.path.join(test_storage.storage_path, 'source') - nested_dir_1 = os.path.join(source_dir, 'nested1') - nested_dir_2 = os.path.join(nested_dir_1, 'nested2') - empty_dir = os.path.join(source_dir, 'empty') + source_dir = os.path.join(test_storage.storage_path, "source") + nested_dir_1 = os.path.join(source_dir, "nested1") + nested_dir_2 = os.path.join(nested_dir_1, "nested2") + empty_dir = os.path.join(source_dir, "empty") os.makedirs(nested_dir_2) os.makedirs(empty_dir) - with open(os.path.join(source_dir, 'test1.txt'), 'w', encoding="utf-8") as f: - f.write('test') - with open(os.path.join(nested_dir_1, 'test2.txt'), 'w', encoding="utf-8") as f: - f.write('test') - with open(os.path.join(nested_dir_2, 'test3.txt'), 'w', encoding="utf-8") as f: - f.write('test') + with open(os.path.join(source_dir, "test1.txt"), "w", encoding="utf-8") as f: + f.write("test") + with open(os.path.join(nested_dir_1, "test2.txt"), "w", encoding="utf-8") as f: + f.write("test") + with open(os.path.join(nested_dir_2, "test3.txt"), "w", encoding="utf-8") as f: + f.write("test") - dest_dir = os.path.join(test_storage.storage_path, 'dest') + dest_dir = os.path.join(test_storage.storage_path, "dest") getattr(test_storage, action)(source_dir, dest_dir) assert not os.path.exists(source_dir) assert os.path.exists(dest_dir) - assert os.path.exists(os.path.join(dest_dir, 'nested1')) - assert os.path.exists(os.path.join(dest_dir, 'nested1', 'nested2')) - assert os.path.exists(os.path.join(dest_dir, 'empty')) - assert os.path.exists(os.path.join(dest_dir, 'test1.txt')) - assert os.path.exists(os.path.join(dest_dir, 'nested1', 'test2.txt')) - assert os.path.exists(os.path.join(dest_dir, 'nested1', 'nested2', 'test3.txt')) + assert os.path.exists(os.path.join(dest_dir, "nested1")) + assert os.path.exists(os.path.join(dest_dir, "nested1", "nested2")) + assert os.path.exists(os.path.join(dest_dir, "empty")) + assert os.path.exists(os.path.join(dest_dir, "test1.txt")) + assert os.path.exists(os.path.join(dest_dir, "nested1", "test2.txt")) + assert os.path.exists(os.path.join(dest_dir, "nested1", "nested2", "test3.txt")) @skipifnotwindows diff --git a/tests/common/storages/test_loader_storage.py b/tests/common/storages/test_loader_storage.py index 1acfeb873b..ad9e1bcac2 100644 --- a/tests/common/storages/test_loader_storage.py +++ b/tests/common/storages/test_loader_storage.py @@ -5,7 +5,12 @@ from dlt.common import sleep, json, pendulum from dlt.common.schema import Schema, TSchemaTables -from dlt.common.storages.load_storage import LoadPackageInfo, LoadStorage, ParsedLoadJobFileName, TJobState +from dlt.common.storages.load_storage import ( + LoadPackageInfo, + LoadStorage, + ParsedLoadJobFileName, + TJobState, +) from dlt.common.configuration import resolve_configuration from dlt.common.storages import LoadStorageConfiguration from dlt.common.storages.exceptions import 
LoadPackageNotFound, NoMigrationPathException @@ -34,9 +39,15 @@ def test_complete_successful_package(storage: LoadStorage) -> None: assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # has package assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) - assert storage.storage.has_file(os.path.join(storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME)) + assert storage.storage.has_file( + os.path.join( + storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME + ) + ) # but completed packages are deleted - assert not storage.storage.has_folder(storage._get_job_folder_completed_path(load_id, "completed_jobs")) + assert not storage.storage.has_folder( + storage._get_job_folder_completed_path(load_id, "completed_jobs") + ) assert_package_info(storage, load_id, "loaded", "completed_jobs", jobs_count=0) # delete completed package storage.delete_completed_package(load_id) @@ -50,9 +61,15 @@ def test_complete_successful_package(storage: LoadStorage) -> None: assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # has load preserved assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) - assert storage.storage.has_file(os.path.join(storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME)) + assert storage.storage.has_file( + os.path.join( + storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME + ) + ) # has completed loads - assert storage.storage.has_folder(storage._get_job_folder_completed_path(load_id, "completed_jobs")) + assert storage.storage.has_folder( + storage._get_job_folder_completed_path(load_id, "completed_jobs") + ) storage.delete_completed_package(load_id) assert not storage.storage.has_folder(storage.get_completed_package_path(load_id)) @@ -64,7 +81,9 @@ def test_wipe_normalized_packages(storage: LoadStorage) -> None: def test_is_partially_loaded(storage: LoadStorage) -> None: - load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}], start_job=False) + load_id, file_name = start_loading_file( + storage, [{"content": "a"}, {"content": "b"}], start_job=False + ) info = storage.get_load_package_info(load_id) # all jobs are new assert LoadStorage.is_package_partially_loaded(info) is False @@ -101,7 +120,9 @@ def test_complete_package_failed_jobs(storage: LoadStorage) -> None: # present in completed loads folder assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) # has completed loads - assert storage.storage.has_folder(storage._get_job_folder_completed_path(load_id, "completed_jobs")) + assert storage.storage.has_folder( + storage._get_job_folder_completed_path(load_id, "completed_jobs") + ) assert_package_info(storage, load_id, "loaded", "failed_jobs") # get failed jobs info @@ -132,7 +153,9 @@ def test_abort_package(storage: LoadStorage) -> None: storage.fail_job(load_id, file_name, "EXCEPTION") assert_package_info(storage, load_id, "normalized", "failed_jobs") storage.complete_load_package(load_id, True) - assert storage.storage.has_folder(storage._get_job_folder_completed_path(load_id, "completed_jobs")) + assert storage.storage.has_folder( + storage._get_job_folder_completed_path(load_id, "completed_jobs") + ) assert_package_info(storage, load_id, "aborted", "failed_jobs") @@ -143,8 +166,10 @@ def test_save_load_schema(storage: LoadStorage) -> None: storage.create_temp_load_package("copy") 
saved_file_name = storage.save_temp_schema(schema, "copy") - assert saved_file_name.endswith(os.path.join(storage.storage.storage_path, "copy", LoadStorage.SCHEMA_FILE_NAME)) - assert storage.storage.has_file(os.path.join("copy",LoadStorage.SCHEMA_FILE_NAME)) + assert saved_file_name.endswith( + os.path.join(storage.storage.storage_path, "copy", LoadStorage.SCHEMA_FILE_NAME) + ) + assert storage.storage.has_file(os.path.join("copy", LoadStorage.SCHEMA_FILE_NAME)) schema_copy = storage.load_temp_schema("copy") assert schema.stored_version == schema_copy.stored_version @@ -218,7 +243,9 @@ def test_process_schema_update(storage: LoadStorage) -> None: storage.commit_schema_update(load_id, applied_update) assert storage.begin_schema_update(load_id) is None # processed file exists - applied_update_path = os.path.join(storage.get_normalized_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) + applied_update_path = os.path.join( + storage.get_normalized_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME + ) assert storage.storage.has_file(applied_update_path) is True assert json.loads(storage.storage.load(applied_update_path)) == applied_update # verify info package @@ -260,7 +287,9 @@ def test_unknown_migration_path() -> None: LoadStorage(False, "jsonl", LoadStorage.ALL_SUPPORTED_FILE_FORMATS) -def start_loading_file(s: LoadStorage, content: Sequence[StrAny], start_job: bool = True) -> Tuple[str, str]: +def start_loading_file( + s: LoadStorage, content: Sequence[StrAny], start_job: bool = True +) -> Tuple[str, str]: load_id = uniq_id() s.create_temp_load_package(load_id) # write test file @@ -276,7 +305,13 @@ def start_loading_file(s: LoadStorage, content: Sequence[StrAny], start_job: boo return load_id, file_name -def assert_package_info(storage: LoadStorage, load_id: str, package_state: str, job_state: TJobState, jobs_count: int = 1) -> LoadPackageInfo: +def assert_package_info( + storage: LoadStorage, + load_id: str, + package_state: str, + job_state: TJobState, + jobs_count: int = 1, +) -> LoadPackageInfo: package_info = storage.get_load_package_info(load_id) # make sure it is serializable json.dumps(package_info) diff --git a/tests/common/storages/test_local_filesystem.py b/tests/common/storages/test_local_filesystem.py index e9550a3173..3827535fbd 100644 --- a/tests/common/storages/test_local_filesystem.py +++ b/tests/common/storages/test_local_filesystem.py @@ -11,7 +11,9 @@ TEST_SAMPLE_FILES = "tests/common/storages/samples" -@pytest.mark.parametrize("bucket_url,load_content", itertools.product(["file:///", "/", ""], [True, False])) +@pytest.mark.parametrize( + "bucket_url,load_content", itertools.product(["file:///", "/", ""], [True, False]) +) def test_filesystem_dict_local(bucket_url: str, load_content: bool) -> None: if bucket_url in [""]: # relative paths @@ -20,7 +22,7 @@ def test_filesystem_dict_local(bucket_url: str, load_content: bool) -> None: if bucket_url == "/": bucket_url = os.path.abspath(TEST_SAMPLE_FILES) else: - bucket_url = pathlib.Path(TEST_SAMPLE_FILES).absolute().as_uri() + bucket_url = pathlib.Path(TEST_SAMPLE_FILES).absolute().as_uri() config = FilesystemConfiguration(bucket_url=bucket_url) filesystem, _ = fsspec_from_config(config) diff --git a/tests/common/storages/test_normalize_storage.py b/tests/common/storages/test_normalize_storage.py index 7199405c12..2749a0ce1d 100644 --- a/tests/common/storages/test_normalize_storage.py +++ b/tests/common/storages/test_normalize_storage.py @@ -16,13 +16,20 @@ def 
test_load_events_and_group_by_sender() -> None: def test_build_extracted_file_name() -> None: load_id = uniq_id() - name = NormalizeStorage.build_extracted_file_stem("event", "table_with_parts__many", load_id) + ".jsonl" + name = ( + NormalizeStorage.build_extracted_file_stem("event", "table_with_parts__many", load_id) + + ".jsonl" + ) assert NormalizeStorage.get_schema_name(name) == "event" - assert NormalizeStorage.parse_normalize_file_name(name) == TParsedNormalizeFileName("event", "table_with_parts__many", load_id, "jsonl") + assert NormalizeStorage.parse_normalize_file_name(name) == TParsedNormalizeFileName( + "event", "table_with_parts__many", load_id, "jsonl" + ) # empty schema should be supported name = NormalizeStorage.build_extracted_file_stem("", "table", load_id) + ".jsonl" - assert NormalizeStorage.parse_normalize_file_name(name) == TParsedNormalizeFileName("", "table", load_id, "jsonl") + assert NormalizeStorage.parse_normalize_file_name(name) == TParsedNormalizeFileName( + "", "table", load_id, "jsonl" + ) def test_full_migration_path() -> None: diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index 401c22f0bc..c72fa75927 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -7,11 +7,25 @@ from dlt.common.schema.schema import Schema from dlt.common.schema.typing import TStoredSchema from dlt.common.schema.utils import explicit_normalizers -from dlt.common.storages.exceptions import InStorageSchemaModified, SchemaNotFoundError, UnexpectedSchemaName -from dlt.common.storages import SchemaStorageConfiguration, SchemaStorage, LiveSchemaStorage, FileStorage +from dlt.common.storages.exceptions import ( + InStorageSchemaModified, + SchemaNotFoundError, + UnexpectedSchemaName, +) +from dlt.common.storages import ( + SchemaStorageConfiguration, + SchemaStorage, + LiveSchemaStorage, + FileStorage, +) from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT -from tests.common.utils import load_yml_case, yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V8 +from tests.common.utils import ( + load_yml_case, + yml_case_path, + COMMON_TEST_CASES_PATH, + IMPORTED_VERSION_HASH_ETH_V8, +) @pytest.fixture @@ -22,13 +36,23 @@ def storage() -> SchemaStorage: @pytest.fixture def synced_storage() -> SchemaStorage: # will be created in /schemas - return init_storage(SchemaStorageConfiguration(import_schema_path=TEST_STORAGE_ROOT + "/import", export_schema_path=TEST_STORAGE_ROOT + "/import")) + return init_storage( + SchemaStorageConfiguration( + import_schema_path=TEST_STORAGE_ROOT + "/import", + export_schema_path=TEST_STORAGE_ROOT + "/import", + ) + ) @pytest.fixture def ie_storage() -> SchemaStorage: # will be created in /schemas - return init_storage(SchemaStorageConfiguration(import_schema_path=TEST_STORAGE_ROOT + "/import", export_schema_path=TEST_STORAGE_ROOT + "/export")) + return init_storage( + SchemaStorageConfiguration( + import_schema_path=TEST_STORAGE_ROOT + "/import", + export_schema_path=TEST_STORAGE_ROOT + "/export", + ) + ) def init_storage(C: SchemaStorageConfiguration) -> SchemaStorage: @@ -49,7 +73,9 @@ def test_load_non_existing(storage: SchemaStorage) -> None: def test_load_schema_with_upgrade() -> None: # point the storage root to v4 schema google_spreadsheet_v3.schema - storage = LiveSchemaStorage(SchemaStorageConfiguration(COMMON_TEST_CASES_PATH + "schemas/sheets")) + storage = LiveSchemaStorage( + 
SchemaStorageConfiguration(COMMON_TEST_CASES_PATH + "schemas/sheets") + ) # the hash when computed on the schema does not match the version_hash in the file so it should raise InStorageSchemaModified # but because the version upgrade is required, the check is skipped and the load succeeds storage.load_schema("google_spreadsheet_v4") @@ -64,7 +90,9 @@ def test_import_initial(synced_storage: SchemaStorage, storage: SchemaStorage) - assert_schema_imported(synced_storage, storage) -def test_import_overwrites_existing_if_modified(synced_storage: SchemaStorage, storage: SchemaStorage) -> None: +def test_import_overwrites_existing_if_modified( + synced_storage: SchemaStorage, storage: SchemaStorage +) -> None: schema = Schema("ethereum") storage.save_schema(schema) # now import schema that wil overwrite schema in storage as it is not linked to external schema @@ -242,28 +270,43 @@ def test_save_store_schema(storage: SchemaStorage) -> None: d_n["names"] = "tests.common.normalizers.custom_normalizers" schema = Schema("column_event", normalizers=d_n) storage.save_schema(schema) - assert storage.storage.has_file(SchemaStorage.NAMED_SCHEMA_FILE_PATTERN % ("column_event", "json")) + assert storage.storage.has_file( + SchemaStorage.NAMED_SCHEMA_FILE_PATTERN % ("column_event", "json") + ) loaded_schema = storage.load_schema("column_event") # also tables gets normalized inside so custom_ is added - assert loaded_schema.to_dict()["tables"]["column__dlt_loads"] == schema.to_dict()["tables"]["column__dlt_loads"] + assert ( + loaded_schema.to_dict()["tables"]["column__dlt_loads"] + == schema.to_dict()["tables"]["column__dlt_loads"] + ) assert loaded_schema.to_dict() == schema.to_dict() def test_schema_from_file() -> None: # json has precedence - schema = SchemaStorage.load_schema_file(os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "event") + schema = SchemaStorage.load_schema_file( + os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "event" + ) assert schema.name == "event" - schema = SchemaStorage.load_schema_file(os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "event", extensions=("yaml",)) + schema = SchemaStorage.load_schema_file( + os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "event", extensions=("yaml",) + ) assert schema.name == "event" assert "blocks" in schema.tables with pytest.raises(SchemaNotFoundError): - SchemaStorage.load_schema_file(os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "eth", extensions=("yaml",)) + SchemaStorage.load_schema_file( + os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "eth", extensions=("yaml",) + ) # file name and schema content mismatch with pytest.raises(UnexpectedSchemaName): - SchemaStorage.load_schema_file(os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), "name_mismatch", extensions=("yaml",)) + SchemaStorage.load_schema_file( + os.path.join(COMMON_TEST_CASES_PATH, "schemas/local"), + "name_mismatch", + extensions=("yaml",), + ) # def test_save_empty_schema_name(storage: SchemaStorage) -> None: @@ -276,7 +319,10 @@ def test_schema_from_file() -> None: def prepare_import_folder(storage: SchemaStorage) -> None: - shutil.copy(yml_case_path("schemas/eth/ethereum_schema_v8"), os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml")) + shutil.copy( + yml_case_path("schemas/eth/ethereum_schema_v8"), + os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml"), + ) def assert_schema_imported(synced_storage: SchemaStorage, storage: SchemaStorage) -> Schema: diff --git 
a/tests/common/storages/test_transactional_file.py b/tests/common/storages/test_transactional_file.py index 119b5ee3dd..7afdf10c38 100644 --- a/tests/common/storages/test_transactional_file.py +++ b/tests/common/storages/test_transactional_file.py @@ -109,7 +109,9 @@ def test_file_transaction_multiple_writers(fs: fsspec.AbstractFileSystem, file_n assert writer_2.read() == b"test 4" -def test_file_transaction_multiple_writers_with_races(fs: fsspec.AbstractFileSystem, file_name: str): +def test_file_transaction_multiple_writers_with_races( + fs: fsspec.AbstractFileSystem, file_name: str +): writer_1 = TransactionalFile(file_name, fs) time.sleep(0.5) writer_2 = TransactionalFile(file_name, fs) @@ -129,8 +131,10 @@ def test_file_transaction_simultaneous(fs: fsspec.AbstractFileSystem): pool = ThreadPoolExecutor(max_workers=40) results = pool.map( - lambda _: TransactionalFile( - "/bucket/test_123", fs).acquire_lock(blocking=False, jitter_mean=0.3), range(200) + lambda _: TransactionalFile("/bucket/test_123", fs).acquire_lock( + blocking=False, jitter_mean=0.3 + ), + range(200), ) assert sum(results) == 1 diff --git a/tests/common/storages/test_versioned_storage.py b/tests/common/storages/test_versioned_storage.py index ff23480a48..2859c7662c 100644 --- a/tests/common/storages/test_versioned_storage.py +++ b/tests/common/storages/test_versioned_storage.py @@ -9,7 +9,9 @@ class MigratedStorage(VersionedStorage): - def migrate_storage(self, from_version: semver.VersionInfo, to_version: semver.VersionInfo) -> None: + def migrate_storage( + self, from_version: semver.VersionInfo, to_version: semver.VersionInfo + ) -> None: # migration example: if from_version == "1.0.0" and from_version < to_version: from_version = semver.VersionInfo.parse("1.1.0") @@ -56,4 +58,4 @@ def test_downgrade_not_possible(test_storage: FileStorage) -> None: write_version(test_storage, "1.2.0") with pytest.raises(NoMigrationPathException) as wmpe: MigratedStorage("1.1.0", True, test_storage) - assert wmpe.value.migrated_version == "1.2.0" \ No newline at end of file + assert wmpe.value.migrated_version == "1.2.0" diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py index a4296279bf..00ea29a859 100644 --- a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -6,7 +6,12 @@ from dlt.common.storages.fsspec_filesystem import FileItem, FileItemDict -def assert_sample_files(all_file_items: List[FileItem], filesystem: AbstractFileSystem, config: FilesystemConfiguration, load_content: bool) -> None: +def assert_sample_files( + all_file_items: List[FileItem], + filesystem: AbstractFileSystem, + config: FilesystemConfiguration, + load_content: bool, +) -> None: for item in all_file_items: assert isinstance(item["file_name"], str) assert item["file_url"].endswith(item["file_name"]) @@ -30,6 +35,7 @@ def assert_sample_files(all_file_items: List[FileItem], filesystem: AbstractFile # parse csv with file_dict.open(mode="rt") as f: from csv import DictReader + elements = list(DictReader(f)) assert len(elements) > 0 if item["mime_type"] == "application/parquet": @@ -45,14 +51,14 @@ def assert_sample_files(all_file_items: List[FileItem], filesystem: AbstractFile assert len(all_file_items) == 10 assert set([item["file_name"] for item in all_file_items]) == { - 'csv/freshman_kgs.csv', - 'csv/freshman_lbs.csv', - 'csv/mlb_players.csv', - 'csv/mlb_teams_2012.csv', - 'jsonl/mlb_players.jsonl', - 'met_csv/A801/A881_20230920.csv', - 'met_csv/A803/A803_20230919.csv', - 'met_csv/A803/A803_20230920.csv', - 
'parquet/mlb_players.parquet', - 'sample.txt' - } \ No newline at end of file + "csv/freshman_kgs.csv", + "csv/freshman_lbs.csv", + "csv/mlb_players.csv", + "csv/mlb_teams_2012.csv", + "jsonl/mlb_players.jsonl", + "met_csv/A801/A881_20230920.csv", + "met_csv/A803/A803_20230919.csv", + "met_csv/A803/A803_20230920.csv", + "parquet/mlb_players.parquet", + "sample.txt", + } diff --git a/tests/common/test_arithmetics.py b/tests/common/test_arithmetics.py index 4912d976eb..87c0a94751 100644 --- a/tests/common/test_arithmetics.py +++ b/tests/common/test_arithmetics.py @@ -18,7 +18,6 @@ def test_default_numeric_quantize() -> None: scale_18 = Decimal("0.5327010784") assert str(numeric_default_quantize(scale_18)) == "0.532701078" - # less than 0 digits scale_5 = Decimal("0.4") assert str(numeric_default_quantize(scale_5)) == "0.400000000" @@ -27,7 +26,7 @@ def test_default_numeric_quantize() -> None: def test_numeric_context() -> None: # we reach (38,9) numeric with numeric_default_context(): - v = Decimal(10**29-1) + Decimal("0.532701079") + v = Decimal(10**29 - 1) + Decimal("0.532701079") assert str(v) == "99999999999999999999999999999.532701079" assert numeric_default_quantize(v) == v diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index 5483a95f45..d416297581 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -35,34 +35,83 @@ def test_import_all_destinations() -> None: def test_normalize_dataset_name() -> None: # with schema name appended - assert DestinationClientDwhConfiguration(dataset_name="ban_ana_dataset", default_schema_name="default").normalize_dataset_name(Schema("banana")) == "ban_ana_dataset_banana" + assert ( + DestinationClientDwhConfiguration( + dataset_name="ban_ana_dataset", default_schema_name="default" + ).normalize_dataset_name(Schema("banana")) + == "ban_ana_dataset_banana" + ) # without schema name appended - assert DestinationClientDwhConfiguration(dataset_name="ban_ana_dataset", default_schema_name="default").normalize_dataset_name(Schema("default")) == "ban_ana_dataset" + assert ( + DestinationClientDwhConfiguration( + dataset_name="ban_ana_dataset", default_schema_name="default" + ).normalize_dataset_name(Schema("default")) + == "ban_ana_dataset" + ) # dataset name will be normalized (now it is up to destination to normalize this) - assert DestinationClientDwhConfiguration(dataset_name="BaNaNa", default_schema_name="default").normalize_dataset_name(Schema("banana")) == "ba_na_na_banana" + assert ( + DestinationClientDwhConfiguration( + dataset_name="BaNaNa", default_schema_name="default" + ).normalize_dataset_name(Schema("banana")) + == "ba_na_na_banana" + ) # empty schemas are invalid with pytest.raises(ValueError): - DestinationClientDwhConfiguration(dataset_name="banana_dataset", default_schema_name=None).normalize_dataset_name(Schema(None)) + DestinationClientDwhConfiguration( + dataset_name="banana_dataset", default_schema_name=None + ).normalize_dataset_name(Schema(None)) with pytest.raises(ValueError): - DestinationClientDwhConfiguration(dataset_name="banana_dataset", default_schema_name="").normalize_dataset_name(Schema("")) + DestinationClientDwhConfiguration( + dataset_name="banana_dataset", default_schema_name="" + ).normalize_dataset_name(Schema("")) # empty dataset name is valid! 
- assert DestinationClientDwhConfiguration(dataset_name="", default_schema_name="ban_schema").normalize_dataset_name(Schema("schema_ana")) == "_schema_ana" + assert ( + DestinationClientDwhConfiguration( + dataset_name="", default_schema_name="ban_schema" + ).normalize_dataset_name(Schema("schema_ana")) + == "_schema_ana" + ) # empty dataset name is valid! - assert DestinationClientDwhConfiguration(dataset_name="", default_schema_name="schema_ana").normalize_dataset_name(Schema("schema_ana")) == "" + assert ( + DestinationClientDwhConfiguration( + dataset_name="", default_schema_name="schema_ana" + ).normalize_dataset_name(Schema("schema_ana")) + == "" + ) # None dataset name is valid! - assert DestinationClientDwhConfiguration(dataset_name=None, default_schema_name="ban_schema").normalize_dataset_name(Schema("schema_ana")) == "_schema_ana" + assert ( + DestinationClientDwhConfiguration( + dataset_name=None, default_schema_name="ban_schema" + ).normalize_dataset_name(Schema("schema_ana")) + == "_schema_ana" + ) # None dataset name is valid! - assert DestinationClientDwhConfiguration(dataset_name=None, default_schema_name="schema_ana").normalize_dataset_name(Schema("schema_ana")) is None + assert ( + DestinationClientDwhConfiguration( + dataset_name=None, default_schema_name="schema_ana" + ).normalize_dataset_name(Schema("schema_ana")) + is None + ) # now mock the schema name to make sure that it is normalized schema = Schema("barbapapa") schema._schema_name = "BarbaPapa" - assert DestinationClientDwhConfiguration(dataset_name="set", default_schema_name="default").normalize_dataset_name(schema) == "set_barba_papa" + assert ( + DestinationClientDwhConfiguration( + dataset_name="set", default_schema_name="default" + ).normalize_dataset_name(schema) + == "set_barba_papa" + ) def test_normalize_dataset_name_none_default_schema() -> None: # if default schema is None, suffix is not added - assert DestinationClientDwhConfiguration(dataset_name="ban_ana_dataset", default_schema_name=None).normalize_dataset_name(Schema("default")) == "ban_ana_dataset" + assert ( + DestinationClientDwhConfiguration( + dataset_name="ban_ana_dataset", default_schema_name=None + ).normalize_dataset_name(Schema("default")) + == "ban_ana_dataset" + ) diff --git a/tests/common/test_git.py b/tests/common/test_git.py index 96a5f33d94..10bc05970e 100644 --- a/tests/common/test_git.py +++ b/tests/common/test_git.py @@ -3,7 +3,15 @@ import pytest from dlt.common.storages import FileStorage -from dlt.common.git import clone_repo, ensure_remote_head, git_custom_key_command, get_fresh_repo_files, get_repo, is_dirty, is_clean_and_synced +from dlt.common.git import ( + clone_repo, + ensure_remote_head, + git_custom_key_command, + get_fresh_repo_files, + get_repo, + is_dirty, + is_clean_and_synced, +) from tests.utils import test_storage, skipifwindows from tests.common.utils import load_secret, modify_and_commit_file, restore_secret_storage_path @@ -42,7 +50,12 @@ def test_clone(test_storage: FileStorage) -> None: def test_clone_with_commit_id(test_storage: FileStorage) -> None: repo_path = test_storage.make_full_path("awesome_repo") # clone a small public repo - clone_repo(AWESOME_REPO, repo_path, with_git_command=None, branch="7f88000be2d4f265c83465fec4b0b3613af347dd").close() + clone_repo( + AWESOME_REPO, + repo_path, + with_git_command=None, + branch="7f88000be2d4f265c83465fec4b0b3613af347dd", + ).close() assert test_storage.has_folder("awesome_repo") # cannot pull detached head with pytest.raises(GitError): diff --git 
a/tests/common/test_json.py b/tests/common/test_json.py index f6e9b06425..8136ed3ad2 100644 --- a/tests/common/test_json.py +++ b/tests/common/test_json.py @@ -6,10 +6,24 @@ from dlt.common import json, Decimal, pendulum from dlt.common.arithmetics import numeric_default_context -from dlt.common.json import _DECIMAL, _WEI, custom_pua_decode, may_have_pua, _orjson, _simplejson, SupportsJson, _DATETIME +from dlt.common.json import ( + _DECIMAL, + _WEI, + custom_pua_decode, + may_have_pua, + _orjson, + _simplejson, + SupportsJson, + _DATETIME, +) from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT -from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED, JSON_TYPED_DICT_NESTED, JSON_TYPED_DICT_NESTED_DECODED +from tests.cases import ( + JSON_TYPED_DICT, + JSON_TYPED_DICT_DECODED, + JSON_TYPED_DICT_NESTED, + JSON_TYPED_DICT_NESTED_DECODED, +) from tests.common.utils import json_case_path, load_json_case @@ -158,7 +172,10 @@ def test_json_decimals(json_impl: SupportsJson) -> None: # serialize out of local context s = json_impl.dumps(doc) # full precision. you need to quantize yourself if you need it - assert s == '{"decimal":"99999999999999999999999999999999999999999999999999999999999999999999999999.999"}' + assert ( + s + == '{"decimal":"99999999999999999999999999999999999999999999999999999999999999999999999999.999"}' + ) @pytest.mark.parametrize("json_impl", _JSON_IMPL) @@ -199,18 +216,27 @@ def test_json_pendulum(json_impl: SupportsJson) -> None: @pytest.mark.parametrize("json_impl", _JSON_IMPL) def test_json_named_tuple(json_impl: SupportsJson) -> None: - assert json_impl.dumps(NamedTupleTest("STR", Decimal("1.3333"))) == '{"str_field":"STR","dec_field":"1.3333"}' + assert ( + json_impl.dumps(NamedTupleTest("STR", Decimal("1.3333"))) + == '{"str_field":"STR","dec_field":"1.3333"}' + ) with io.BytesIO() as b: json_impl.typed_dump(NamedTupleTest("STR", Decimal("1.3333")), b) - assert b.getvalue().decode("utf-8") == '{"str_field":"STR","dec_field":"\uF0261.3333"}' + assert b.getvalue().decode("utf-8") == '{"str_field":"STR","dec_field":"\uf0261.3333"}' @pytest.mark.parametrize("json_impl", _JSON_IMPL) def test_data_class(json_impl: SupportsJson) -> None: - assert json_impl.dumps(DataClassTest(str_field="AAA")) == '{"str_field":"AAA","int_field":5,"dec_field":"0.5"}' + assert ( + json_impl.dumps(DataClassTest(str_field="AAA")) + == '{"str_field":"AAA","int_field":5,"dec_field":"0.5"}' + ) with io.BytesIO() as b: json_impl.typed_dump(DataClassTest(str_field="AAA"), b) - assert b.getvalue().decode("utf-8") == '{"str_field":"AAA","int_field":5,"dec_field":"\uF0260.5"}' + assert ( + b.getvalue().decode("utf-8") + == '{"str_field":"AAA","int_field":5,"dec_field":"\uf0260.5"}' + ) @pytest.mark.parametrize("json_impl", _JSON_IMPL) @@ -246,7 +272,7 @@ def test_json_typed_encode(json_impl: SupportsJson) -> None: assert d["decimal"][0] == _DECIMAL assert d["wei"][0] == _WEI # decode all - d_d = {k: custom_pua_decode(v) for k,v in d.items()} + d_d = {k: custom_pua_decode(v) for k, v in d.items()} assert d_d == JSON_TYPED_DICT_DECODED @@ -261,7 +287,6 @@ def test_pua_detection(json_impl: SupportsJson) -> None: assert not may_have_pua(content_b) - def test_load_and_compare_all_impls() -> None: with open(json_case_path("rasa_event_bot_metadata"), "rb") as f: content_b = f.read() @@ -272,6 +297,6 @@ def test_load_and_compare_all_impls() -> None: # same docs, same output for idx in range(0, len(docs) - 1): - assert docs[idx] == docs[idx+1] - assert dump_s[idx] == dump_s[idx+1] - assert 
dump_b[idx] == dump_b[idx+1] + assert docs[idx] == docs[idx + 1] + assert dump_s[idx] == dump_s[idx + 1] + assert dump_b[idx] == dump_b[idx + 1] diff --git a/tests/common/test_pipeline_state.py b/tests/common/test_pipeline_state.py index cce610839f..2c6a89b978 100644 --- a/tests/common/test_pipeline_state.py +++ b/tests/common/test_pipeline_state.py @@ -11,7 +11,7 @@ def test_delete_source_state_keys() -> None: "a": {"b": {"c": 1}}, "x": {"y": {"c": 2}}, "y": {"x": {"a": 3}}, - "resources": {"some_data": {"incremental": {"last_value": 123}}} + "resources": {"some_data": {"incremental": {"last_value": 123}}}, } state = deepcopy(_fake_source_state) @@ -54,12 +54,12 @@ def test_get_matching_resources() -> None: # with state argument results = ps._get_matching_resources(pattern, _fake_source_state) - assert sorted(results) == ['events_a', 'events_b'] + assert sorted(results) == ["events_a", "events_b"] # with state context with mock.patch.object(ps, "source_state", autospec=True, return_value=_fake_source_state): results = ps._get_matching_resources(pattern, _fake_source_state) - assert sorted(results) == ['events_a', 'events_b'] + assert sorted(results) == ["events_a", "events_b"] # no resources key results = ps._get_matching_resources(pattern, {}) diff --git a/tests/common/test_time.py b/tests/common/test_time.py index 56c6849ab8..72a9098e4d 100644 --- a/tests/common/test_time.py +++ b/tests/common/test_time.py @@ -3,7 +3,12 @@ from pendulum.tz import UTC from dlt.common import pendulum -from dlt.common.time import timestamp_before, timestamp_within, ensure_pendulum_datetime, ensure_pendulum_date +from dlt.common.time import ( + timestamp_before, + timestamp_within, + ensure_pendulum_datetime, + ensure_pendulum_date, +) from dlt.common.typing import TAnyDateTime @@ -72,9 +77,7 @@ def test_before() -> None: @pytest.mark.parametrize("date_value, expected", test_params) -def test_ensure_pendulum_datetime( - date_value: TAnyDateTime, expected: pendulum.DateTime -) -> None: +def test_ensure_pendulum_datetime(date_value: TAnyDateTime, expected: pendulum.DateTime) -> None: dt = ensure_pendulum_datetime(date_value) assert dt == expected # always UTC @@ -86,4 +89,6 @@ def test_ensure_pendulum_datetime( def test_ensure_pendulum_date_utc() -> None: # when converting from datetimes make sure to shift to UTC before doing date assert ensure_pendulum_date("2021-01-01T00:00:00+05:00") == pendulum.date(2020, 12, 31) - assert ensure_pendulum_date(datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone(timedelta(hours=8)))) == pendulum.date(2020, 12, 31) \ No newline at end of file + assert ensure_pendulum_date( + datetime(2021, 1, 1, 0, 0, 0, tzinfo=timezone(timedelta(hours=8))) + ) == pendulum.date(2020, 12, 31) diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index 41d3d8d274..fff053c2d3 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -1,10 +1,34 @@ - -from typing import List, Literal, Mapping, MutableMapping, MutableSequence, NewType, Sequence, TypeVar, TypedDict, Optional, Union -from dlt.common.configuration.specs.base_configuration import BaseConfiguration, get_config_if_union_hint +from typing import ( + List, + Literal, + Mapping, + MutableMapping, + MutableSequence, + NewType, + Sequence, + TypeVar, + TypedDict, + Optional, + Union, +) +from dlt.common.configuration.specs.base_configuration import ( + BaseConfiguration, + get_config_if_union_hint, +) from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults -from dlt.common.typing 
import StrAny, extract_inner_type, extract_union_types, is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, is_typeddict, is_union_type - +from dlt.common.typing import ( + StrAny, + extract_inner_type, + extract_union_types, + is_dict_generic_type, + is_list_generic_type, + is_literal_type, + is_newtype_type, + is_optional_type, + is_typeddict, + is_union_type, +) class TTestTyDi(TypedDict): @@ -91,6 +115,9 @@ def test_get_config_if_union() -> None: assert get_config_if_union_hint(Union[BaseException, str, StrAny]) is None # type: ignore[arg-type] assert get_config_if_union_hint(Union[BaseConfiguration, str, StrAny]) is BaseConfiguration # type: ignore[arg-type] assert get_config_if_union_hint(Union[str, BaseConfiguration, StrAny]) is BaseConfiguration # type: ignore[arg-type] - assert get_config_if_union_hint( - Union[GcpServiceAccountCredentialsWithoutDefaults, StrAny, str] # type: ignore[arg-type] - ) is GcpServiceAccountCredentialsWithoutDefaults + assert ( + get_config_if_union_hint( + Union[GcpServiceAccountCredentialsWithoutDefaults, StrAny, str] # type: ignore[arg-type] + ) + is GcpServiceAccountCredentialsWithoutDefaults + ) diff --git a/tests/common/test_utils.py b/tests/common/test_utils.py index d51f54d6d1..a989daaa04 100644 --- a/tests/common/test_utils.py +++ b/tests/common/test_utils.py @@ -5,9 +5,20 @@ from typing import Dict from dlt.common.runners import Venv -from dlt.common.utils import (graph_find_scc_nodes, flatten_list_of_str_or_dicts, digest128, graph_edges_to_nodes, map_nested_in_place, - reveal_pseudo_secret, obfuscate_pseudo_secret, get_module_name, concat_strings_with_limit, increase_row_count, - merge_row_count, extend_list_deduplicated) +from dlt.common.utils import ( + graph_find_scc_nodes, + flatten_list_of_str_or_dicts, + digest128, + graph_edges_to_nodes, + map_nested_in_place, + reveal_pseudo_secret, + obfuscate_pseudo_secret, + get_module_name, + concat_strings_with_limit, + increase_row_count, + merge_row_count, + extend_list_deduplicated, +) def test_flatten_list_of_str_or_dicts() -> None: @@ -21,30 +32,27 @@ def test_flatten_list_of_str_or_dicts() -> None: def test_digest128_length() -> None: - assert len(digest128("hash it")) == 120/6 + assert len(digest128("hash it")) == 120 / 6 def test_map_dicts_in_place() -> None: - _d = { - "a": "1", - "b": ["a", "b", ["a", "b"], {"a": "c"}], - "c": { - "d": "e", - "e": ["a", 2] - } + _d = {"a": "1", "b": ["a", "b", ["a", "b"], {"a": "c"}], "c": {"d": "e", "e": ["a", 2]}} + exp_d = { + "a": "11", + "b": ["aa", "bb", ["aa", "bb"], {"a": "cc"}], + "c": {"d": "ee", "e": ["aa", 4]}, } - exp_d = {'a': '11', 'b': ['aa', 'bb', ['aa', 'bb'], {'a': 'cc'}], 'c': {'d': 'ee', 'e': ['aa', 4]}} - assert map_nested_in_place(lambda v: v*2, _d) == exp_d + assert map_nested_in_place(lambda v: v * 2, _d) == exp_d # in place assert _d == exp_d _l = ["a", "b", ["a", "b"], {"a": "c"}] exp_l = ["aa", "bb", ["aa", "bb"], {"a": "cc"}] - assert map_nested_in_place(lambda v: v*2, _l) == exp_l + assert map_nested_in_place(lambda v: v * 2, _l) == exp_l assert _l == exp_l with pytest.raises(ValueError): - map_nested_in_place(lambda v: v*2, "a") + map_nested_in_place(lambda v: v * 2, "a") def test_pseudo_obfuscation() -> None: @@ -79,9 +87,25 @@ def test_concat_strings_with_limit() -> None: assert list(concat_strings_with_limit(philosopher, ";\n", 15)) == ["Bertrand Russell"] # only two strings will be merged (22 chars total) - philosophers = ["Bertrand Russell", "Ludwig Wittgenstein", "G.E. 
Moore", "J.L. Mackie", "Alfred Tarski"] - moore_merged = ['Bertrand Russell', 'Ludwig Wittgenstein', 'G.E. Moore J.L. Mackie', 'Alfred Tarski'] - moore_merged_2 = ['Bertrand Russell', 'Ludwig Wittgenstein', 'G.E. Moore;\nJ.L. Mackie', 'Alfred Tarski'] + philosophers = [ + "Bertrand Russell", + "Ludwig Wittgenstein", + "G.E. Moore", + "J.L. Mackie", + "Alfred Tarski", + ] + moore_merged = [ + "Bertrand Russell", + "Ludwig Wittgenstein", + "G.E. Moore J.L. Mackie", + "Alfred Tarski", + ] + moore_merged_2 = [ + "Bertrand Russell", + "Ludwig Wittgenstein", + "G.E. Moore;\nJ.L. Mackie", + "Alfred Tarski", + ] assert list(concat_strings_with_limit(philosophers, " ", 22)) == moore_merged # none will be merged assert list(concat_strings_with_limit(philosophers, ";\n", 22)) == philosophers @@ -94,7 +118,7 @@ def test_concat_strings_with_limit() -> None: def test_find_scc_nodes() -> None: - edges = [('A', 'B'), ('B', 'C'), ('D', 'E'), ('F', 'G'), ('G', 'H'), ('I', 'I'), ('J', 'J')] + edges = [("A", "B"), ("B", "C"), ("D", "E"), ("F", "G"), ("G", "H"), ("I", "I"), ("J", "J")] def _comp(s): return sorted([tuple(sorted(c)) for c in s]) @@ -113,8 +137,28 @@ def _comp(s): def test_graph_edges_to_nodes() -> None: - edges = [('A', 'B'), ('A', 'C'), ('B', 'C'), ('D', 'E'), ('F', 'G'), ('G', 'H'), ('I', 'I'), ('J', 'J')] - graph = {"A": {"B", "C"}, "B": {"C"}, "C": set(), "D": {"E"}, "E": set(), "F": {"G"}, "G": {"H"}, "H": set(), "I": set(), "J": set()} + edges = [ + ("A", "B"), + ("A", "C"), + ("B", "C"), + ("D", "E"), + ("F", "G"), + ("G", "H"), + ("I", "I"), + ("J", "J"), + ] + graph = { + "A": {"B", "C"}, + "B": {"C"}, + "C": set(), + "D": {"E"}, + "E": set(), + "F": {"G"}, + "G": {"H"}, + "H": set(), + "I": set(), + "J": set(), + } g1 = graph_edges_to_nodes(edges) for perm_edges in itertools.permutations(edges): @@ -126,7 +170,7 @@ def test_graph_edges_to_nodes() -> None: # test a few edge cases assert graph_edges_to_nodes([]) == {} # ignores double edge - assert graph_edges_to_nodes([('A', 'B'), ('A', 'B')]) == {'A': {'B'}, 'B': set()} + assert graph_edges_to_nodes([("A", "B"), ("A", "B")]) == {"A": {"B"}, "B": set()} def test_increase_row_counts() -> None: @@ -135,21 +179,13 @@ def test_increase_row_counts() -> None: increase_row_count(counts, "table2", 0) increase_row_count(counts, "table3", 10) - assert counts == { - "table1": 1, - "table2": 0, - "table3": 10 - } + assert counts == {"table1": 1, "table2": 0, "table3": 10} increase_row_count(counts, "table1", 2) increase_row_count(counts, "table2", 3) increase_row_count(counts, "table3", 4) - assert counts == { - "table1": 3, - "table2": 3, - "table3": 14 - } + assert counts == {"table1": 3, "table2": 3, "table3": 14} def test_merge_row_counts() -> None: @@ -158,30 +194,33 @@ def test_merge_row_counts() -> None: "table2": 3, } - merge_row_count(rc1, { - "table2": 5, - "table3": 20, - }) - assert rc1 == { - "table1": 3, - "table2": 8, - "table3": 20 - } - merge_row_count(rc1, { - "table2": 5, - "table3": 20, - "table4": 2 - }) - assert rc1 == { - "table1": 3, - "table2": 13, - "table3": 40, - "table4": 2 - } + merge_row_count( + rc1, + { + "table2": 5, + "table3": 20, + }, + ) + assert rc1 == {"table1": 3, "table2": 8, "table3": 20} + merge_row_count(rc1, {"table2": 5, "table3": 20, "table4": 2}) + assert rc1 == {"table1": 3, "table2": 13, "table3": 40, "table4": 2} def test_extend_list_deduplicated() -> None: - assert extend_list_deduplicated(["one", "two", "three"], ["four", "five", "six"]) == ["one", "two", "three", "four", "five", "six"] - assert 
extend_list_deduplicated(["one", "two", "three", "six"], ["two", "four", "five", "six"]) == ["one", "two", "three", "six", "four", "five"] - assert extend_list_deduplicated(["one", "two", "three"], ["one", "two", "three"]) == ["one", "two", "three"] + assert extend_list_deduplicated(["one", "two", "three"], ["four", "five", "six"]) == [ + "one", + "two", + "three", + "four", + "five", + "six", + ] + assert extend_list_deduplicated( + ["one", "two", "three", "six"], ["two", "four", "five", "six"] + ) == ["one", "two", "three", "six", "four", "five"] + assert extend_list_deduplicated(["one", "two", "three"], ["one", "two", "three"]) == [ + "one", + "two", + "three", + ] assert extend_list_deduplicated([], ["one", "two", "three"]) == ["one", "two", "three"] diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index f274c82014..533b91808c 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -11,12 +11,13 @@ from dlt.common.validation import validate_dict, validate_dict_ignoring_xkeys - TLiteral = Literal["uno", "dos", "tres"] + class TDict(TypedDict): field: TLiteral + class TTestRecord(TypedDict): f_bool: bool f_str: str @@ -38,28 +39,12 @@ class TTestRecord(TypedDict): f_optional_union: Optional[Union[TLiteral, TDict]] -TEST_COL: TColumnSchema = { - "name": "col1", - "data_type": "bigint", - "nullable": False - } +TEST_COL: TColumnSchema = {"name": "col1", "data_type": "bigint", "nullable": False} TEST_COL_LIST: List[TColumnSchema] = [ - { - "name": "col1", - "data_type": "bigint", - "nullable": False - }, - { - "name": "col2", - "data_type": "double", - "nullable": False - }, - { - "name": "col3", - "data_type": "bool", - "nullable": False - } + {"name": "col1", "data_type": "bigint", "nullable": False}, + {"name": "col2", "data_type": "double", "nullable": False}, + {"name": "col3", "data_type": "bool", "nullable": False}, ] TEST_DOC: TTestRecord = { @@ -72,7 +57,7 @@ class TTestRecord(TypedDict): "f_seq_simple": ["x", "y"], "f_seq_optional_str": ["opt1", "opt2"], "f_seq_of_optional_int": [1, 2, 3], - "f_list_of_dict": TEST_COL_LIST, + "f_list_of_dict": TEST_COL_LIST, "f_dict_simple": {"col1": "map_me"}, "f_map_simple": {"col1": "map_me"}, "f_map_of_dict": {"col1": deepcopy(TEST_COL)}, @@ -80,16 +65,19 @@ class TTestRecord(TypedDict): "f_literal": "uno", "f_literal_optional": "dos", "f_seq_literal": ["uno", "dos", "tres"], - "f_optional_union": {"field": "uno"} + "f_optional_union": {"field": "uno"}, } + @pytest.fixture def test_doc() -> TTestRecord: return deepcopy(TEST_DOC) def test_validate_schema_cases() -> None: - with open("tests/common/cases/schemas/eth/ethereum_schema_v8.yml", mode="r", encoding="utf-8") as f: + with open( + "tests/common/cases/schemas/eth/ethereum_schema_v8.yml", mode="r", encoding="utf-8" + ) as f: schema_dict: TStoredSchema = yaml.safe_load(f) validate_dict_ignoring_xkeys( @@ -243,7 +231,7 @@ def test_nested_union(test_doc: TTestRecord) -> None: with pytest.raises(DictValidationException) as e: validate_dict(TTestRecord, test_doc, ".") assert e.value.field == "f_optional_union" - assert e.value.value == {'field': 'not valid'} + assert e.value.value == {"field": "not valid"} test_doc["f_optional_union"] = "dos" validate_dict(TTestRecord, test_doc, ".") @@ -252,4 +240,4 @@ def test_nested_union(test_doc: TTestRecord) -> None: with pytest.raises(DictValidationException) as e: validate_dict(TTestRecord, test_doc, ".") assert e.value.field == "f_optional_union" - assert e.value.value == "blah" \ No newline at 
end of file + assert e.value.value == "blah" diff --git a/tests/common/test_wei.py b/tests/common/test_wei.py index 8ee47d11c0..1f15978ddc 100644 --- a/tests/common/test_wei.py +++ b/tests/common/test_wei.py @@ -7,9 +7,12 @@ def test_init() -> None: assert Wei.from_int256(10**18, decimals=18) == 1 # make sure the wei scale is supported assert Wei.from_int256(1, decimals=18) == Decimal("0.000000000000000001") - assert Wei.from_int256(2**256-1) == 2**256-1 - assert str(Wei.from_int256(2**256-1, decimals=18)) == "115792089237316195423570985008687907853269984665640564039457.584007913129639935" - assert str(Wei.from_int256(2**256-1)) == str(2**256-1) + assert Wei.from_int256(2**256 - 1) == 2**256 - 1 + assert ( + str(Wei.from_int256(2**256 - 1, decimals=18)) + == "115792089237316195423570985008687907853269984665640564039457.584007913129639935" + ) + assert str(Wei.from_int256(2**256 - 1)) == str(2**256 - 1) assert type(Wei.from_int256(1)) is Wei @@ -30,6 +33,14 @@ def test_wei_variant() -> None: # we get variant value when we call Wei assert Wei(578960446186580977117854925043439539266)() == 578960446186580977117854925043439539266 - assert Wei(578960446186580977117854925043439539267)() == ("str", "578960446186580977117854925043439539267") - assert Wei(-578960446186580977117854925043439539267)() == -578960446186580977117854925043439539267 - assert Wei(-578960446186580977117854925043439539268)() == ("str", "-578960446186580977117854925043439539268") + assert Wei(578960446186580977117854925043439539267)() == ( + "str", + "578960446186580977117854925043439539267", + ) + assert ( + Wei(-578960446186580977117854925043439539267)() == -578960446186580977117854925043439539267 + ) + assert Wei(-578960446186580977117854925043439539268)() == ( + "str", + "-578960446186580977117854925043439539268", + ) diff --git a/tests/common/utils.py b/tests/common/utils.py index db9a8318fb..0235d18bbe 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -18,7 +18,9 @@ # for import schema tests, change when upgrading the schema version IMPORTED_VERSION_HASH_ETH_V8 = "C5An8WClbavalXDdNSqXbdI7Swqh/mTWMcwWKCF//EE=" # test sentry DSN -TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" +TEST_SENTRY_DSN = ( + "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" +) # preserve secrets path to be able to restore it SECRET_STORAGE_PATH = environ_provider.SECRET_STORAGE_PATH @@ -42,11 +44,7 @@ def yml_case_path(name: str) -> str: def row_to_column_schemas(row: StrAny) -> TTableSchemaColumns: - return {k: { - "name": k, - "data_type": "text", - "nullable": False - } for k in row.keys()} + return {k: {"name": k, "data_type": "text", "nullable": False} for k in row.keys()} @pytest.fixture(autouse=True) @@ -56,13 +54,17 @@ def restore_secret_storage_path() -> None: def load_secret(name: str) -> str: environ_provider.SECRET_STORAGE_PATH = "./tests/common/cases/secrets/%s" - secret, _ = environ_provider.EnvironProvider().get_value(name, environ_provider.TSecretValue, None) + secret, _ = environ_provider.EnvironProvider().get_value( + name, environ_provider.TSecretValue, None + ) if not secret: raise FileNotFoundError(environ_provider.SECRET_STORAGE_PATH % name) return secret -def modify_and_commit_file(repo_path: str, file_name: str, content: str = "NEW README CONTENT") -> Tuple[str, Commit]: +def modify_and_commit_file( + repo_path: str, file_name: str, content: str = "NEW README CONTENT" +) -> Tuple[str, Commit]: file_path = 
os.path.join(repo_path, file_name) with open(file_path, "w", encoding="utf-8") as f: diff --git a/tests/conftest.py b/tests/conftest.py index 8a14fa1550..7e12990fd0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,12 +4,26 @@ from typing import List # patch which providers to enable -from dlt.common.configuration.providers import ConfigProvider, EnvironProvider, SecretsTomlProvider, ConfigTomlProvider -from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext, ConfigProvidersConfiguration +from dlt.common.configuration.providers import ( + ConfigProvider, + EnvironProvider, + SecretsTomlProvider, + ConfigTomlProvider, +) +from dlt.common.configuration.specs.config_providers_context import ( + ConfigProvidersContext, + ConfigProvidersConfiguration, +) + def initial_providers() -> List[ConfigProvider]: # do not read the global config - return [EnvironProvider(), SecretsTomlProvider(project_dir="tests/.dlt", add_global_config=False), ConfigTomlProvider(project_dir="tests/.dlt", add_global_config=False)] + return [ + EnvironProvider(), + SecretsTomlProvider(project_dir="tests/.dlt", add_global_config=False), + ConfigTomlProvider(project_dir="tests/.dlt", add_global_config=False), + ] + ConfigProvidersContext.initial_providers = initial_providers # type: ignore[method-assign] # also disable extras @@ -26,29 +40,41 @@ def pytest_configure(config): from dlt.common.storages import configuration as storage_configuration test_storage_root = "_storage" - run_configuration.RunConfiguration.config_files_storage_path = os.path.join(test_storage_root, "config/") - run_configuration.RunConfiguration.dlthub_telemetry_segment_write_key = "TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB" + run_configuration.RunConfiguration.config_files_storage_path = os.path.join( + test_storage_root, "config/" + ) + run_configuration.RunConfiguration.dlthub_telemetry_segment_write_key = ( + "TLJiyRkGVZGCi2TtjClamXpFcxAA1rSB" + ) delattr(run_configuration.RunConfiguration, "__init__") run_configuration.RunConfiguration = dataclasses.dataclass(run_configuration.RunConfiguration, init=True, repr=False) # type: ignore # push telemetry to CI - storage_configuration.LoadStorageConfiguration.load_volume_path = os.path.join(test_storage_root, "load") + storage_configuration.LoadStorageConfiguration.load_volume_path = os.path.join( + test_storage_root, "load" + ) delattr(storage_configuration.LoadStorageConfiguration, "__init__") storage_configuration.LoadStorageConfiguration = dataclasses.dataclass(storage_configuration.LoadStorageConfiguration, init=True, repr=False) # type: ignore[misc, call-overload] - storage_configuration.NormalizeStorageConfiguration.normalize_volume_path = os.path.join(test_storage_root, "normalize") + storage_configuration.NormalizeStorageConfiguration.normalize_volume_path = os.path.join( + test_storage_root, "normalize" + ) # delete __init__, otherwise it will not be recreated by dataclass delattr(storage_configuration.NormalizeStorageConfiguration, "__init__") storage_configuration.NormalizeStorageConfiguration = dataclasses.dataclass(storage_configuration.NormalizeStorageConfiguration, init=True, repr=False) # type: ignore[misc, call-overload] - storage_configuration.SchemaStorageConfiguration.schema_volume_path = os.path.join(test_storage_root, "schemas") + storage_configuration.SchemaStorageConfiguration.schema_volume_path = os.path.join( + test_storage_root, "schemas" + ) delattr(storage_configuration.SchemaStorageConfiguration, "__init__") 
storage_configuration.SchemaStorageConfiguration = dataclasses.dataclass(storage_configuration.SchemaStorageConfiguration, init=True, repr=False) # type: ignore[misc, call-overload] - - assert run_configuration.RunConfiguration.config_files_storage_path == os.path.join(test_storage_root, "config/") - assert run_configuration.RunConfiguration().config_files_storage_path == os.path.join(test_storage_root, "config/") - + assert run_configuration.RunConfiguration.config_files_storage_path == os.path.join( + test_storage_root, "config/" + ) + assert run_configuration.RunConfiguration().config_files_storage_path == os.path.join( + test_storage_root, "config/" + ) # path pipeline instance id up to millisecond from dlt.common import pendulum diff --git a/tests/destinations/test_path_utils.py b/tests/destinations/test_path_utils.py index 4317da59b6..1cf2b17d76 100644 --- a/tests/destinations/test_path_utils.py +++ b/tests/destinations/test_path_utils.py @@ -18,9 +18,11 @@ def test_create_path() -> None: "table_name": "table_name", "load_id": "load_id", "file_id": "file_id", - "ext": "ext" + "ext": "ext", } - path = path_utils.create_path("{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", **path_vars) + path = path_utils.create_path( + "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", **path_vars + ) assert path == "schema_name/table_name/load_id.file_id.ext" # extension gets added automatically @@ -29,14 +31,23 @@ def test_create_path() -> None: def test_get_table_prefix_layout() -> None: - - prefix_layout = path_utils.get_table_prefix_layout("{schema_name}/{table_name}/{load_id}.{file_id}.{ext}") + prefix_layout = path_utils.get_table_prefix_layout( + "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}" + ) assert prefix_layout == "{schema_name}/{table_name}/" - assert prefix_layout.format(schema_name="my_schema", table_name="my_table") == "my_schema/my_table/" + assert ( + prefix_layout.format(schema_name="my_schema", table_name="my_table") + == "my_schema/my_table/" + ) - prefix_layout = path_utils.get_table_prefix_layout("some_random{schema_name}/stuff_in_between/{table_name}/{load_id}") + prefix_layout = path_utils.get_table_prefix_layout( + "some_random{schema_name}/stuff_in_between/{table_name}/{load_id}" + ) assert prefix_layout == "some_random{schema_name}/stuff_in_between/{table_name}/" - assert prefix_layout.format(schema_name="my_schema", table_name="my_table") == "some_randommy_schema/stuff_in_between/my_table/" + assert ( + prefix_layout.format(schema_name="my_schema", table_name="my_table") + == "some_randommy_schema/stuff_in_between/my_table/" + ) # disallow missing table_name with pytest.raises(CantExtractTablePrefix): @@ -48,7 +59,10 @@ def test_get_table_prefix_layout() -> None: # disallow any placeholders before table name (ie. 
Athena) with pytest.raises(CantExtractTablePrefix): - path_utils.get_table_prefix_layout("{schema_name}some_random{table_name}/stuff_in_between/", supported_prefix_placeholders=[]) + path_utils.get_table_prefix_layout( + "{schema_name}some_random{table_name}/stuff_in_between/", + supported_prefix_placeholders=[], + ) # disallow table_name without following separator with pytest.raises(CantExtractTablePrefix): diff --git a/tests/extract/cases/eth_source/source.py b/tests/extract/cases/eth_source/source.py index 08adb79a22..4410954f0b 100644 --- a/tests/extract/cases/eth_source/source.py +++ b/tests/extract/cases/eth_source/source.py @@ -1,6 +1,7 @@ from typing import Any import dlt + @dlt.source def ethereum() -> Any: # this just tests if the schema "ethereum" was loaded diff --git a/tests/extract/cases/section_source/external_resources.py b/tests/extract/cases/section_source/external_resources.py index 0a991d7438..07d3767e0a 100644 --- a/tests/extract/cases/section_source/external_resources.py +++ b/tests/extract/cases/section_source/external_resources.py @@ -6,19 +6,27 @@ @dlt.source def with_external(source_val: str = dlt.config.value): - @dlt.resource def inner_resource(val): yield val - return dlt.resource([source_val], name="source_val"), inner_resource(source_val), init_resource_f_2, resource_f_2 + return ( + dlt.resource([source_val], name="source_val"), + inner_resource(source_val), + init_resource_f_2, + resource_f_2, + ) @dlt.source def with_bound_external(source_val: str = dlt.config.value): - @dlt.resource def inner_resource(val): yield val - return dlt.resource([source_val], name="source_val"), inner_resource(source_val), init_resource_f_2(), resource_f_2() \ No newline at end of file + return ( + dlt.resource([source_val], name="source_val"), + inner_resource(source_val), + init_resource_f_2(), + resource_f_2(), + ) diff --git a/tests/extract/cases/section_source/named_module.py b/tests/extract/cases/section_source/named_module.py index 4a46ad0e19..c7580982b6 100644 --- a/tests/extract/cases/section_source/named_module.py +++ b/tests/extract/cases/section_source/named_module.py @@ -7,6 +7,7 @@ def source_f_1(val: str = dlt.config.value): return dlt.resource([val], name="f_1") + @dlt.resource def resource_f_2(val: str = dlt.config.value): yield [val] diff --git a/tests/extract/conftest.py b/tests/extract/conftest.py index f5dc47f54b..17f93b0ba5 100644 --- a/tests/extract/conftest.py +++ b/tests/extract/conftest.py @@ -1 +1,7 @@ -from tests.utils import duckdb_pipeline_location, autouse_test_storage, preserve_environ, patch_home_dir, wipe_pipeline \ No newline at end of file +from tests.utils import ( + duckdb_pipeline_location, + autouse_test_storage, + preserve_environ, + patch_home_dir, + wipe_pipeline, +) diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 27cdc3d22d..d1ff98fc26 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -22,10 +22,21 @@ from dlt.cli.source_detection import detect_source_configs from dlt.extract import DltResource, DltSource -from dlt.extract.exceptions import (DynamicNameNotStandaloneResource, InvalidResourceDataTypeFunctionNotAGenerator, - InvalidResourceDataTypeIsNone, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, - PipeGenInvalid, PipeNotBoundToData, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, - SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable) +from dlt.extract.exceptions import ( + 
DynamicNameNotStandaloneResource, + InvalidResourceDataTypeFunctionNotAGenerator, + InvalidResourceDataTypeIsNone, + InvalidResourceDataTypeMultiplePipes, + ParametrizedResourceUnbound, + PipeGenInvalid, + PipeNotBoundToData, + ResourceFunctionExpected, + ResourceInnerCallableConfigWrapDisallowed, + SourceDataIsNone, + SourceIsAClassTypeError, + SourceNotAFunction, + SourceSchemaNotAvailable, +) from dlt.extract.typing import TableNameMeta from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V8 @@ -41,12 +52,10 @@ def empty() -> None: with pytest.raises(SourceDataIsNone): dlt.source(empty)() - @dlt.source def deco_empty() -> None: pass - with pytest.raises(SourceDataIsNone): deco_empty() @@ -79,7 +88,6 @@ def test_load_schema_for_callable() -> None: def test_unbound_parametrized_transformer() -> None: - empty_pipe = DltResource.Empty._pipe assert empty_pipe.is_empty assert not empty_pipe.is_data_bound @@ -123,7 +131,7 @@ def test_transformer_no_parens() -> None: bound_r = dlt.resource([1, 2, 3], name="data") @dlt.transformer - def empty_t_1(item, meta = None): + def empty_t_1(item, meta=None): yield "a" * item assert list(bound_r | empty_t_1) == ["a", "aa", "aaa"] @@ -158,7 +166,6 @@ def accept_meta(item, meta=None, **kwargs): def test_source_name_is_invalid_schema_name() -> None: - def camelCase(): return dlt.resource([1, 2, 3], name="resource") @@ -183,10 +190,13 @@ def camelCase(): def test_resource_name_is_invalid_table_name_and_columns() -> None: - @dlt.source def camelCase(): - return dlt.resource([1, 2, 3], name="Resource !", columns={"KA!AX": {"name": "DIF!", "nullable": False, "data_type": "text"}}) + return dlt.resource( + [1, 2, 3], + name="Resource !", + columns={"KA!AX": {"name": "DIF!", "nullable": False, "data_type": "text"}}, + ) s = camelCase() assert s.resources["Resource !"].selected @@ -201,10 +211,9 @@ def camelCase(): def test_columns_argument() -> None: - @dlt.resource(name="user", columns={"tags": {"data_type": "complex", "x-extra": "x-annotation"}}) # type: ignore[typeddict-unknown-key] def get_users(): - yield {"u": "u", "tags": [1, 2 ,3]} + yield {"u": "u", "tags": [1, 2, 3]} t = get_users().compute_table_schema() @@ -229,20 +238,35 @@ def get_users(): def test_apply_hints_columns() -> None: @dlt.resource(name="user", columns={"tags": {"data_type": "complex", "primary_key": True}}) def get_users(): - yield {"u": "u", "tags": [1, 2 ,3]} + yield {"u": "u", "tags": [1, 2, 3]} users = get_users() assert users.columns == {"tags": {"data_type": "complex", "name": "tags", "primary_key": True}} - assert cast(TTableSchemaColumns, users.columns)["tags"] == users.compute_table_schema()["columns"]["tags"] + assert ( + cast(TTableSchemaColumns, users.columns)["tags"] + == users.compute_table_schema()["columns"]["tags"] + ) # columns property can be changed in place cast(TTableSchemaColumns, users.columns)["tags"]["data_type"] = "text" assert users.compute_table_schema()["columns"]["tags"]["data_type"] == "text" # apply column definition - it should be merged with defaults - users.apply_hints(columns={"tags": {"primary_key": False, "data_type": "text"}, "things": new_column("things", nullable=False)}) - assert cast(TTableSchemaColumns, users.columns)["tags"] == {"data_type": "text", "name": "tags", "primary_key": False} - assert cast(TTableSchemaColumns, users.columns)["things"] == {"name": "things", "nullable": False} + users.apply_hints( + columns={ + "tags": {"primary_key": False, "data_type": "text"}, + "things": new_column("things", nullable=False), + } + ) + assert 
cast(TTableSchemaColumns, users.columns)["tags"] == { + "data_type": "text", + "name": "tags", + "primary_key": False, + } + assert cast(TTableSchemaColumns, users.columns)["things"] == { + "name": "things", + "nullable": False, + } # delete columns by passing empty users.apply_hints(columns={}) @@ -306,6 +330,7 @@ def some_data(): def test_source_sections() -> None: # source in __init__.py of module from tests.extract.cases.section_source import init_source_f_1, init_resource_f_2 + # source in file module with name override from tests.extract.cases.section_source.named_module import source_f_1, resource_f_2 @@ -335,21 +360,27 @@ def test_source_sections() -> None: assert list(resource_f_2()) == ["NAME OVERRIDDEN LEVEL"] # values in function name section - os.environ[f"{known_sections.SOURCES.upper()}__SECTION_SOURCE__INIT_SOURCE_F_1__VAL"] = "SECTION INIT_SOURCE_F_1 LEVEL" + os.environ[f"{known_sections.SOURCES.upper()}__SECTION_SOURCE__INIT_SOURCE_F_1__VAL"] = ( + "SECTION INIT_SOURCE_F_1 LEVEL" + ) assert list(init_source_f_1()) == ["SECTION INIT_SOURCE_F_1 LEVEL"] - os.environ[f"{known_sections.SOURCES.upper()}__SECTION_SOURCE__INIT_RESOURCE_F_2__VAL"] = "SECTION INIT_RESOURCE_F_2 LEVEL" + os.environ[f"{known_sections.SOURCES.upper()}__SECTION_SOURCE__INIT_RESOURCE_F_2__VAL"] = ( + "SECTION INIT_RESOURCE_F_2 LEVEL" + ) assert list(init_resource_f_2()) == ["SECTION INIT_RESOURCE_F_2 LEVEL"] - os.environ[f"{known_sections.SOURCES.upper()}__NAME_OVERRIDDEN__SOURCE_F_1__VAL"] = "NAME SOURCE_F_1 LEVEL" + os.environ[f"{known_sections.SOURCES.upper()}__NAME_OVERRIDDEN__SOURCE_F_1__VAL"] = ( + "NAME SOURCE_F_1 LEVEL" + ) assert list(source_f_1()) == ["NAME SOURCE_F_1 LEVEL"] - os.environ[f"{known_sections.SOURCES.upper()}__NAME_OVERRIDDEN__RESOURCE_F_2__VAL"] = "NAME RESOURCE_F_2 LEVEL" + os.environ[f"{known_sections.SOURCES.upper()}__NAME_OVERRIDDEN__RESOURCE_F_2__VAL"] = ( + "NAME RESOURCE_F_2 LEVEL" + ) assert list(resource_f_2()) == ["NAME RESOURCE_F_2 LEVEL"] def test_source_explicit_section() -> None: - @dlt.source(section="custom_section", schema=Schema("custom_section")) def with_section(secret=dlt.secrets.value): - @dlt.resource def mod_state(): dlt.current.source_state()["val"] = secret @@ -366,7 +397,6 @@ def mod_state(): def test_resource_section() -> None: - r = dlt.resource([1, 2, 3], name="T") assert r.name == "T" assert r.section is None @@ -379,14 +409,23 @@ def _inner_gen(): assert r.section == "test_decorators" from tests.extract.cases.section_source.external_resources import init_resource_f_2 + assert init_resource_f_2.name == "init_resource_f_2" assert init_resource_f_2.section == "section_source" def test_resources_injected_sections() -> None: - from tests.extract.cases.section_source.external_resources import with_external, with_bound_external, init_resource_f_2, resource_f_2 + from tests.extract.cases.section_source.external_resources import ( + with_external, + with_bound_external, + init_resource_f_2, + resource_f_2, + ) + # standalone resources must accept the injected sections for lookups - os.environ["SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL"] = "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL" + os.environ["SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL"] = ( + "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL" + ) os.environ["SOURCES__EXTERNAL_RESOURCES__VAL"] = "SOURCES__EXTERNAL_RESOURCES__VAL" os.environ["SOURCES__SECTION_SOURCE__VAL"] = "SOURCES__SECTION_SOURCE__VAL" os.environ["SOURCES__NAME_OVERRIDDEN__VAL"] = "SOURCES__NAME_OVERRIDDEN__VAL" @@ -401,44 +440,59 @@ def 
test_resources_injected_sections() -> None: "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", "SOURCES__EXTERNAL_RESOURCES__VAL", - "SOURCES__EXTERNAL_RESOURCES__VAL" + "SOURCES__EXTERNAL_RESOURCES__VAL", ] # this source will bind external resources before returning them (that is: calling them and obtaining generators) # the iterator in the source will force its sections so external resource sections are not used s = with_bound_external() - assert list(s) == list([ - "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", - "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", - "SOURCES__EXTERNAL_RESOURCES__VAL", - "SOURCES__EXTERNAL_RESOURCES__VAL" - ]) + assert list(s) == list( + [ + "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", + "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", + "SOURCES__EXTERNAL_RESOURCES__VAL", + "SOURCES__EXTERNAL_RESOURCES__VAL", + ] + ) # inject the source sections like the Pipeline object would s = with_external() assert s.name == "with_external" assert s.section == "external_resources" # from module name hosting the function - with inject_section(ConfigSectionContext(pipeline_name="injected_external", sections=("sources", s.section, s.name))): + with inject_section( + ConfigSectionContext( + pipeline_name="injected_external", sections=("sources", s.section, s.name) + ) + ): # now the external sources must adopt the injected namespace - assert(list(s)) == [ + assert (list(s)) == [ "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", "SOURCES__EXTERNAL_RESOURCES__VAL", - "SOURCES__EXTERNAL_RESOURCES__VAL" + "SOURCES__EXTERNAL_RESOURCES__VAL", ] # now with environ values that specify source/resource name: the module of the source, the name of the resource - os.environ["SOURCES__EXTERNAL_RESOURCES__INIT_RESOURCE_F_2__VAL"] = "SOURCES__EXTERNAL_RESOURCES__INIT_RESOURCE_F_2__VAL" - os.environ["SOURCES__EXTERNAL_RESOURCES__RESOURCE_F_2__VAL"] = "SOURCES__EXTERNAL_RESOURCES__RESOURCE_F_2__VAL" + os.environ["SOURCES__EXTERNAL_RESOURCES__INIT_RESOURCE_F_2__VAL"] = ( + "SOURCES__EXTERNAL_RESOURCES__INIT_RESOURCE_F_2__VAL" + ) + os.environ["SOURCES__EXTERNAL_RESOURCES__RESOURCE_F_2__VAL"] = ( + "SOURCES__EXTERNAL_RESOURCES__RESOURCE_F_2__VAL" + ) s = with_external() - with inject_section(ConfigSectionContext(pipeline_name="injected_external", sections=("sources", s.section, s.name))): + with inject_section( + ConfigSectionContext( + pipeline_name="injected_external", sections=("sources", s.section, s.name) + ) + ): # now the external sources must adopt the injected namespace - assert(list(s)) == [ + assert (list(s)) == [ "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", "SOURCES__EXTERNAL_RESOURCES__SOURCE_VAL", "SOURCES__EXTERNAL_RESOURCES__INIT_RESOURCE_F_2__VAL", - "SOURCES__EXTERNAL_RESOURCES__RESOURCE_F_2__VAL" + "SOURCES__EXTERNAL_RESOURCES__RESOURCE_F_2__VAL", ] + def test_source_schema_context() -> None: import dlt @@ -489,7 +543,6 @@ def created_global(): def test_source_state_context() -> None: - @dlt.resource(selected=False) def main(): state = dlt.current.state() @@ -497,14 +550,14 @@ def main(): # increase the multiplier each time state is obtained state["mark"] *= 2 yield [1, 2, 3] - assert dlt.state()["mark"] == mark*2 + assert dlt.state()["mark"] == mark * 2 @dlt.transformer(data_from=main) def feeding(item): # we must have state assert dlt.current.source_state()["mark"] > 1 mark = dlt.current.source_state()["mark"] - yield from map(lambda i: i*mark, item) + yield from map(lambda i: i * mark, item) @dlt.source def 
pass_the_state(): @@ -520,7 +573,6 @@ def pass_the_state(): def test_source_schema_modified() -> None: - @dlt.source def schema_test(): return dlt.resource(["A", "B"], name="alpha") @@ -538,13 +590,12 @@ def standalone_resource(secret=dlt.secrets.value, config=dlt.config.value, opt: def test_spec_generation() -> None: - # inner resource cannot take configuration with pytest.raises(ResourceInnerCallableConfigWrapDisallowed) as py_ex: @dlt.resource(write_disposition="merge", primary_key="id") - def inner_resource(initial_id = dlt.config.value): + def inner_resource(initial_id=dlt.config.value): yield [{"id": 1, "name": "row1"}, {"id": 1, "name": "row2"}] assert py_ex.value.resource_name == "inner_resource" @@ -573,7 +624,6 @@ def not_args_r(): def test_sources_no_arguments() -> None: - @dlt.source def no_args(): return dlt.resource([1, 2], name="data") @@ -602,7 +652,6 @@ def not_args_r_i(): def test_resource_sets_invalid_write_disposition() -> None: - @dlt.resource(write_disposition="xxxx") # type: ignore[call-overload] def invalid_disposition(): yield from [1, 2, 3] @@ -614,14 +663,12 @@ def invalid_disposition(): def test_custom_source_impl() -> None: - class TypedSource(DltSource): def users(self, mode: str) -> DltResource: return self.resources["users"](mode) @dlt.source(_impl_cls=TypedSource) def all_users(): - @dlt.resource def users(mode: str): yield mode @@ -640,7 +687,6 @@ def standalone_signature(init: int, secret_end: int = dlt.secrets.value): def test_standalone_resource() -> None: - # wrapped flag will not create the resource but just simple function wrapper that must be called before use @dlt.resource(standalone=True) def nice_signature(init: int): @@ -724,7 +770,12 @@ def test_standalone_transformer() -> None: # test configuration os.environ["SOURCES__TEST_DECORATORS__STANDALONE_SIGNATURE__SECRET_END"] = "5" os.environ["SOURCES__TEST_DECORATORS__STANDALONE_TRANSFORMER_RETURNS__INIT"] = "2" - assert list(standalone_signature(1) | standalone_transformer_returns()) == ["AA", "AAAA", "AAAAAA", "AAAAAAAA"] + assert list(standalone_signature(1) | standalone_transformer_returns()) == [ + "AA", + "AAAA", + "AAAAAA", + "AAAAAAAA", + ] @dlt.transformer(standalone=True, name=lambda args: args["res_name"]) @@ -739,9 +790,14 @@ def test_standalone_resource_with_name() -> None: # still the config comes via the function name os.environ["SOURCES__TEST_DECORATORS__STANDALONE_TX_WITH_NAME__INIT"] = "2" - assert list(dlt.resource([1, 2, 3], name="x") | my_tx) == ['my_txmy_tx', 'my_txmy_txmy_txmy_tx', 'my_txmy_txmy_txmy_txmy_txmy_tx'] + assert list(dlt.resource([1, 2, 3], name="x") | my_tx) == [ + "my_txmy_tx", + "my_txmy_txmy_txmy_tx", + "my_txmy_txmy_txmy_txmy_txmy_tx", + ] with pytest.raises(DynamicNameNotStandaloneResource): + @dlt.resource(standalone=False, name=lambda args: args["res_name"]) # type: ignore[call-overload] def standalone_name(): yield "A" @@ -767,7 +823,6 @@ def test_resource_rename_credentials_separation(): def test_class_source() -> None: - class _Source: def __init__(self, elems: int) -> None: self.elems = elems @@ -781,10 +836,11 @@ def __call__(self, more: int = 1): schema = s.discover_schema() assert schema.name == "_Source" assert "_list" in schema.tables - assert list(s) == ['A', 'V', 'A', 'V', 'A', 'V', 'A', 'V'] + assert list(s) == ["A", "V", "A", "V", "A", "V", "A", "V"] # CAN'T decorate classes themselves with pytest.raises(SourceIsAClassTypeError): + @dlt.source(name="planB") class _SourceB: def __init__(self, elems: int) -> None: diff --git 
a/tests/extract/test_extract.py b/tests/extract/test_extract.py index a045dd4f3c..b5edf1b5ed 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -10,7 +10,6 @@ def test_extract_select_tables() -> None: - def expect_tables(resource: DltResource) -> dlt.Schema: # delete files clean_test_storage() @@ -29,9 +28,8 @@ def expect_tables(resource: DltResource) -> dlt.Schema: storage.commit_extract_files(extract_id) # check resulting files assert len(storage.list_files_to_normalize_sorted()) == 2 - expect_extracted_file(storage, "selectables", "odd_table", json.dumps([1,3,5,7,9])) - expect_extracted_file(storage, "selectables", "even_table", json.dumps([0,2,4,6,8])) - + expect_extracted_file(storage, "selectables", "odd_table", json.dumps([1, 3, 5, 7, 9])) + expect_extracted_file(storage, "selectables", "even_table", json.dumps([0, 2, 4, 6, 8])) # delete files clean_test_storage() @@ -46,7 +44,7 @@ def expect_tables(resource: DltResource) -> dlt.Schema: assert "odd_table" in source.schema._schema_tables storage.commit_extract_files(extract_id) assert len(storage.list_files_to_normalize_sorted()) == 1 - expect_extracted_file(storage, "selectables", "odd_table", json.dumps([1,3,5,7,9])) + expect_extracted_file(storage, "selectables", "odd_table", json.dumps([1, 3, 5, 7, 9])) return schema @@ -65,7 +63,7 @@ def table_with_name_selectable(_range): @dlt.resource(table_name=n_f) def table_name_with_lambda(_range): - yield list(range(_range)) + yield list(range(_range)) schema = expect_tables(table_name_with_lambda) assert "table_name_with_lambda" not in schema.tables @@ -80,12 +78,14 @@ def input_gen(): yield from [1, 2, 3] input_r = DltResource.from_data(input_gen) - source = DltSource(dlt.Schema("selectables"), "module", [input_r, input_r.with_name("gen_clone")]) + source = DltSource( + dlt.Schema("selectables"), "module", [input_r, input_r.with_name("gen_clone")] + ) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() extract(extract_id, source, storage) # both tables got generated - assert "input_gen" in source.schema._schema_tables + assert "input_gen" in source.schema._schema_tables assert "gen_clone" in source.schema._schema_tables @@ -94,12 +94,14 @@ def input_gen(): yield from [1, 2, 3] def tx_step(item): - return item*2 + return item * 2 input_r = DltResource.from_data(input_gen) input_tx = DltResource.from_data(tx_step, data_from=DltResource.Empty) - source = DltSource(dlt.Schema("selectables"), "module", [input_r, (input_r | input_tx).with_name("tx_clone")]) + source = DltSource( + dlt.Schema("selectables"), "module", [input_r, (input_r | input_tx).with_name("tx_clone")] + ) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() extract(extract_id, source, storage) diff --git a/tests/extract/test_extract_pipe.py b/tests/extract/test_extract_pipe.py index a4e894bf94..4ad6cb6f72 100644 --- a/tests/extract/test_extract_pipe.py +++ b/tests/extract/test_extract_pipe.py @@ -15,7 +15,6 @@ def test_next_item_mode() -> None: - def nested_gen_level_2(): yield from [88, None, 89] @@ -23,25 +22,25 @@ def nested_gen(): yield from [55, 56, None, 77, nested_gen_level_2()] def source_gen1(): - yield from [1, 2, nested_gen(), 3,4] + yield from [1, 2, nested_gen(), 3, 4] def source_gen2(): yield from range(11, 16) def source_gen3(): - yield from range(20,22) + yield from range(20, 22) def get_pipes(): return [ Pipe.from_data("data1", source_gen1()), Pipe.from_data("data2", 
source_gen2()), Pipe.from_data("data3", source_gen3()), - ] + ] # default mode is "fifo" _l = list(PipeIterator.from_pipes(get_pipes(), next_item_mode="fifo")) # items will be in order of the pipes, nested iterator items appear inline - assert [pi.item for pi in _l] == [1, 2, 55, 56, 77, 88, 89, 3, 4, 11, 12, 13, 14, 15, 20, 21] + assert [pi.item for pi in _l] == [1, 2, 55, 56, 77, 88, 89, 3, 4, 11, 12, 13, 14, 15, 20, 21] # round robin mode _l = list(PipeIterator.from_pipes(get_pipes(), next_item_mode="round_robin")) @@ -50,7 +49,6 @@ def get_pipes(): def test_rotation_on_none() -> None: - global gen_1_started global gen_2_started global gen_3_started @@ -85,7 +83,7 @@ def get_pipes(): Pipe.from_data("data1", source_gen1()), Pipe.from_data("data2", source_gen2()), Pipe.from_data("data3", source_gen3()), - ] + ] # round robin mode _l = list(PipeIterator.from_pipes(get_pipes(), next_item_mode="round_robin")) @@ -136,7 +134,7 @@ def test_insert_remove_step() -> None: pp = Pipe.from_data("data", data) def tx(item): - yield item*2 + yield item * 2 # create pipe with transformer p = Pipe.from_data("tx", tx, parent=pp) @@ -188,7 +186,7 @@ def item_meta_step(item, meta): p.remove_step(0) assert p._gen_idx == 0 _l = list(PipeIterator.from_pipe(p)) - assert [pi.item for pi in _l] == [0.5, 1, 3/2] + assert [pi.item for pi in _l] == [0.5, 1, 3 / 2] # remove all remaining txs p.remove_step(1) pp.remove_step(1) @@ -210,7 +208,7 @@ def item_meta_step(item, meta): def tx_minus(item, meta): assert meta is None - yield item*-4 + yield item * -4 p.replace_gen(tx_minus) _l = list(PipeIterator.from_pipe(p)) @@ -233,8 +231,8 @@ def test_pipe_propagate_meta() -> None: p = Pipe.from_data("data", iter(meta_data)) def item_meta_step(item: int, meta): - assert _meta[item-1] == meta - return item*2 + assert _meta[item - 1] == meta + return item * 2 p.append_step(item_meta_step) # type: ignore[arg-type] _l = list(PipeIterator.from_pipe(p)) @@ -247,19 +245,19 @@ def item_meta_step(item: int, meta): # does not take meta def transformer(item): - yield item*item + yield item * item def item_meta_step_trans(item: int, meta): # reverse all transformations on item - meta_idx = int(item**0.5//2) - assert _meta[meta_idx-1] == meta - return item*2 + meta_idx = int(item**0.5 // 2) + assert _meta[meta_idx - 1] == meta + return item * 2 t = Pipe("tran", [transformer], parent=p) t.append_step(item_meta_step_trans) # type: ignore[arg-type] _l = list(PipeIterator.from_pipe(t)) # item got propagated through transformation -> transformer -> transformation - assert [int((pi.item//2)**0.5//2) for pi in _l] == data # type: ignore[operator] + assert [int((pi.item // 2) ** 0.5 // 2) for pi in _l] == data # type: ignore[operator] assert [pi.meta for pi in _l] == _meta # same but with the fork step @@ -270,7 +268,7 @@ def item_meta_step_trans(item: int, meta): # do not yield parents _l = list(PipeIterator.from_pipes([p, t], yield_parents=False)) # same result - assert [int((pi.item//2)**0.5//2) for pi in _l] == data # type: ignore[operator] + assert [int((pi.item // 2) ** 0.5 // 2) for pi in _l] == data # type: ignore[operator] assert [pi.meta for pi in _l] == _meta # same but yield parents @@ -281,11 +279,11 @@ def item_meta_step_trans(item: int, meta): _l = list(PipeIterator.from_pipes([p, t], yield_parents=True)) # same result for transformer tran_l = [pi for pi in _l if pi.pipe.name == t.name] - assert [int((pi.item//2)**0.5//2) for pi in tran_l] == data # type: ignore[operator] + assert [int((pi.item // 2) ** 0.5 // 2) for pi in tran_l] 
== data # type: ignore[operator] assert [pi.meta for pi in tran_l] == _meta data_l = [pi for pi in _l if pi.pipe.name is p.name] # data pipe went only through one transformation - assert [int(pi.item//2) for pi in data_l] == data # type: ignore[operator] + assert [int(pi.item // 2) for pi in data_l] == data # type: ignore[operator] assert [pi.meta for pi in data_l] == _meta @@ -297,9 +295,9 @@ def test_pipe_transformation_changes_meta() -> None: p = Pipe.from_data("data", iter(meta_data)) def item_meta_step(item: int, meta): - assert _meta[item-1] == meta + assert _meta[item - 1] == meta # return meta, it should overwrite existing one - return DataItemWithMeta("X" + str(item), item*2) + return DataItemWithMeta("X" + str(item), item * 2) p.append_step(item_meta_step) # type: ignore[arg-type] _l = list(PipeIterator.from_pipe(p)) @@ -309,10 +307,10 @@ def item_meta_step(item: int, meta): # also works for deferred transformations @dlt.defer def item_meta_step_defer(item: int, meta): - assert _meta[item-1] == meta + assert _meta[item - 1] == meta sleep(item * 0.2) # return meta, it should overwrite existing one - return DataItemWithMeta("X" + str(item), item*2) + return DataItemWithMeta("X" + str(item), item * 2) p = Pipe.from_data("data", iter(meta_data)) p.append_step(item_meta_step_defer) # type: ignore[arg-type] @@ -322,9 +320,9 @@ def item_meta_step_defer(item: int, meta): # also works for yielding transformations def item_meta_step_flat(item: int, meta): - assert _meta[item-1] == meta + assert _meta[item - 1] == meta # return meta, it should overwrite existing one - yield DataItemWithMeta("X" + str(item), item*2) + yield DataItemWithMeta("X" + str(item), item * 2) p = Pipe.from_data("data", iter(meta_data)) p.append_step(item_meta_step_flat) # type: ignore[arg-type] @@ -334,10 +332,10 @@ def item_meta_step_flat(item: int, meta): # also works for async async def item_meta_step_async(item: int, meta): - assert _meta[item-1] == meta + assert _meta[item - 1] == meta await asyncio.sleep(item * 0.2) # this returns awaitable - return DataItemWithMeta("X" + str(item), item*2) + return DataItemWithMeta("X" + str(item), item * 2) p = Pipe.from_data("data", iter(meta_data)) p.append_step(item_meta_step_async) # type: ignore[arg-type] @@ -348,7 +346,7 @@ async def item_meta_step_async(item: int, meta): # also lets the transformer return meta def transformer(item: int): - yield DataItemWithMeta("X" + str(item), item*2) + yield DataItemWithMeta("X" + str(item), item * 2) p = Pipe.from_data("data", iter(meta_data)) t = Pipe("tran", [transformer], parent=p) # type: ignore[list-item] # TODO: typealias not working? 
@@ -446,14 +444,30 @@ def test_yield_map_step() -> None: p = Pipe.from_data("data", [1, 2, 3]) # this creates number of rows as passed by the data p.append_step(YieldMapItem(lambda item: (yield from [f"item_{x}" for x in range(item)]))) - assert _f_items(list(PipeIterator.from_pipe(p))) == ["item_0", "item_0", "item_1", "item_0", "item_1", "item_2"] + assert _f_items(list(PipeIterator.from_pipe(p))) == [ + "item_0", + "item_0", + "item_1", + "item_0", + "item_1", + "item_2", + ] data = [1, 2, 3] meta = ["A", "B", "C"] # package items into meta wrapper meta_data = [DataItemWithMeta(m, d) for m, d in zip(meta, data)] p = Pipe.from_data("data", meta_data) - p.append_step(YieldMapItem(lambda item, meta: (yield from [f"item_{meta}_{x}" for x in range(item)]))) - assert _f_items(list(PipeIterator.from_pipe(p))) == ["item_A_0", "item_B_0", "item_B_1", "item_C_0", "item_C_1", "item_C_2"] + p.append_step( + YieldMapItem(lambda item, meta: (yield from [f"item_{meta}_{x}" for x in range(item)])) + ) + assert _f_items(list(PipeIterator.from_pipe(p))) == [ + "item_A_0", + "item_B_0", + "item_B_1", + "item_C_0", + "item_C_1", + "item_C_2", + ] def test_pipe_copy_on_fork() -> None: @@ -517,9 +531,8 @@ def test_clone_single_pipe() -> None: def test_clone_pipes() -> None: - def pass_gen(item, meta): - yield item*2 + yield item * 2 data = [1, 2, 3] p1 = Pipe("p1", [data]) @@ -559,13 +572,14 @@ def assert_cloned_pipes(pipes: List[Pipe], cloned_pipes: List[Pipe]) -> None: # must yield same data for pipe, cloned_pipe in zip(pipes, cloned_pipes): - assert _f_items(list(PipeIterator.from_pipe(pipe))) == _f_items(list(PipeIterator.from_pipe(cloned_pipe))) + assert _f_items(list(PipeIterator.from_pipe(pipe))) == _f_items( + list(PipeIterator.from_pipe(cloned_pipe)) + ) def test_circular_deps() -> None: - def pass_gen(item, meta): - yield item*2 + yield item * 2 c_p1_p3 = Pipe("c_p1_p3", [pass_gen]) c_p1_p4 = Pipe("c_p1_p4", [pass_gen], parent=c_p1_p3) @@ -641,7 +655,6 @@ def raise_gen(item: int): def test_close_on_sync_exception() -> None: - def long_gen(): global close_pipe_got_exit, close_pipe_yielding @@ -668,7 +681,9 @@ def assert_pipes_closed(raise_gen, long_gen) -> None: close_pipe_yielding = False pit: PipeIterator = None - with PipeIterator.from_pipe(Pipe.from_data("failing", raise_gen, parent=Pipe.from_data("endless", long_gen()))) as pit: + with PipeIterator.from_pipe( + Pipe.from_data("failing", raise_gen, parent=Pipe.from_data("endless", long_gen())) + ) as pit: with pytest.raises(ResourceExtractionError) as py_ex: list(pit) assert isinstance(py_ex.value.__cause__, RuntimeError) @@ -680,7 +695,9 @@ def assert_pipes_closed(raise_gen, long_gen) -> None: close_pipe_got_exit = False close_pipe_yielding = False - pit = ManagedPipeIterator.from_pipe(Pipe.from_data("failing", raise_gen, parent=Pipe.from_data("endless", long_gen()))) + pit = ManagedPipeIterator.from_pipe( + Pipe.from_data("failing", raise_gen, parent=Pipe.from_data("endless", long_gen())) + ) with pytest.raises(ResourceExtractionError) as py_ex: list(pit) assert isinstance(py_ex.value.__cause__, RuntimeError) diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index ec28018add..c37663604b 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -19,7 +19,10 @@ from dlt.extract import DltSource from dlt.sources.helpers.transform import take_first -from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing +from 
dlt.extract.incremental.exceptions import ( + IncrementalCursorPathMissing, + IncrementalPrimaryKeyMissing, +) from dlt.pipeline.exceptions import PipelineStepFailed from tests.extract.utils import AssertItems, data_item_to_list @@ -29,68 +32,71 @@ @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_single_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: data = [ - {'created_at': 425}, - {'created_at': 426}, + {"created_at": 425}, + {"created_at": 426}, ] source_items = data_to_item_format(item_type, data) + @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at')): + def some_data(created_at=dlt.sources.incremental("created_at")): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data()) - s = some_data.state['incremental']['created_at'] - assert s['last_value'] == 426 + s = some_data.state["incremental"]["created_at"] + assert s["last_value"] == 426 @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_single_items_last_value_state_is_updated_transformer(item_type: TDataItemFormat) -> None: data = [ - {'created_at': 425}, - {'created_at': 426}, + {"created_at": 425}, + {"created_at": 426}, ] source_items = data_to_item_format(item_type, data) @dlt.transformer - def some_data(item, created_at=dlt.sources.incremental('created_at')): + def some_data(item, created_at=dlt.sources.incremental("created_at")): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) - p.extract(dlt.resource([1,2,3], name="table") | some_data()) + p.extract(dlt.resource([1, 2, 3], name="table") | some_data()) - s = some_data().state['incremental']['created_at'] - assert s['last_value'] == 426 + s = some_data().state["incremental"]["created_at"] + assert s["last_value"] == 426 @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_batch_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: - data1 = [{'created_at': i} for i in range(5)] - data2 = [{'created_at': i} for i in range(5, 10)] + data1 = [{"created_at": i} for i in range(5)] + data2 = [{"created_at": i} for i in range(5, 10)] source_items1 = data_to_item_format(item_type, data1) source_items2 = data_to_item_format(item_type, data2) @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at')): + def some_data(created_at=dlt.sources.incremental("created_at")): yield source_items1 yield source_items2 p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data()) - s = p.state["sources"][p.default_schema_name]['resources']['some_data']['incremental']['created_at'] - assert s['last_value'] == 9 + s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][ + "created_at" + ] + assert s["last_value"] == 9 @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_last_value_access_in_resource(item_type: TDataItemFormat) -> None: values = [] - data = [{'created_at': i} for i in range(6)] + data = [{"created_at": i} for i in range(6)] source_items = data_to_item_format(item_type, data) @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at')): + def some_data(created_at=dlt.sources.incremental("created_at")): values.append(created_at.last_value) yield source_items @@ -104,30 +110,33 @@ def some_data(created_at=dlt.sources.incremental('created_at')): @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_unique_keys_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ - {'created_at': 1, 'id': 'a'}, - 
{'created_at': 2, 'id': 'b'}, - {'created_at': 3, 'id': 'c'}, - {'created_at': 3, 'id': 'd'}, - {'created_at': 3, 'id': 'e'}, + {"created_at": 1, "id": "a"}, + {"created_at": 2, "id": "b"}, + {"created_at": 3, "id": "c"}, + {"created_at": 3, "id": "d"}, + {"created_at": 3, "id": "e"}, ] data2 = [ - {'created_at': 3, 'id': 'c'}, - {'created_at': 3, 'id': 'd'}, - {'created_at': 3, 'id': 'e'}, - {'created_at': 3, 'id': 'f'}, - {'created_at': 4, 'id': 'g'}, + {"created_at": 3, "id": "c"}, + {"created_at": 3, "id": "d"}, + {"created_at": 3, "id": "e"}, + {"created_at": 3, "id": "f"}, + {"created_at": 4, "id": "g"}, ] source_items1 = data_to_item_format(item_type, data1) source_items2 = data_to_item_format(item_type, data2) - @dlt.resource(primary_key='id') - def some_data(created_at=dlt.sources.incremental('created_at')): + + @dlt.resource(primary_key="id") + def some_data(created_at=dlt.sources.incremental("created_at")): if created_at.last_value is None: yield from source_items1 else: yield from source_items2 - p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + p = dlt.pipeline( + pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:") + ) p.run(some_data()).raise_on_failed_jobs() p.run(some_data()).raise_on_failed_jobs() @@ -135,37 +144,39 @@ def some_data(created_at=dlt.sources.incremental('created_at')): with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: rows = cur.fetchall() - assert rows == [(1, 'a'), (2, 'b'), (3, 'c'), (3, 'd'), (3, 'e'), (3, 'f'), (4, 'g')] + assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_unique_rows_by_hash_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ - {'created_at': 1, 'id': 'a'}, - {'created_at': 2, 'id': 'b'}, - {'created_at': 3, 'id': 'c'}, - {'created_at': 3, 'id': 'd'}, - {'created_at': 3, 'id': 'e'}, + {"created_at": 1, "id": "a"}, + {"created_at": 2, "id": "b"}, + {"created_at": 3, "id": "c"}, + {"created_at": 3, "id": "d"}, + {"created_at": 3, "id": "e"}, ] data2 = [ - {'created_at': 3, 'id': 'c'}, - {'created_at': 3, 'id': 'd'}, - {'created_at': 3, 'id': 'e'}, - {'created_at': 3, 'id': 'f'}, - {'created_at': 4, 'id': 'g'}, + {"created_at": 3, "id": "c"}, + {"created_at": 3, "id": "d"}, + {"created_at": 3, "id": "e"}, + {"created_at": 3, "id": "f"}, + {"created_at": 4, "id": "g"}, ] source_items1 = data_to_item_format(item_type, data1) source_items2 = data_to_item_format(item_type, data2) @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at')): + def some_data(created_at=dlt.sources.incremental("created_at")): if created_at.last_value is None: yield from source_items1 else: yield from source_items2 - p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + p = dlt.pipeline( + pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:") + ) p.run(some_data()).raise_on_failed_jobs() p.run(some_data()).raise_on_failed_jobs() @@ -173,30 +184,30 @@ def some_data(created_at=dlt.sources.incremental('created_at')): with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: rows = cur.fetchall() - assert rows == [(1, 'a'), (2, 'b'), (3, 'c'), (3, 'd'), (3, 'e'), (3, 'f'), (4, 'g')] + assert rows == [(1, "a"), (2, "b"), (3, "c"), (3, "d"), (3, "e"), (3, "f"), (4, "g")] def 
test_nested_cursor_path() -> None: @dlt.resource - def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): - yield {'data': {'items': [{'created_at': 2}]}} + def some_data(created_at=dlt.sources.incremental("data.items[0].created_at")): + yield {"data": {"items": [{"created_at": 2}]}} p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data()) - s = p.state["sources"][p.default_schema_name]['resources']['some_data']['incremental']['data.items[0].created_at'] - assert s['last_value'] == 2 + s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][ + "data.items[0].created_at" + ] + assert s["last_value"] == 2 @pytest.mark.parametrize("item_type", ["arrow", "pandas"]) def test_nested_cursor_path_arrow_fails(item_type: TDataItemFormat) -> None: - data = [ - {'data': {'items': [{'created_at': 2}]}} - ] + data = [{"data": {"items": [{"created_at": 2}]}}] source_items = data_to_item_format(item_type, data) @dlt.resource - def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): + def some_data(created_at=dlt.sources.incremental("data.items[0].created_at")): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) @@ -211,52 +222,61 @@ def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_explicit_initial_value(item_type: TDataItemFormat) -> None: @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at')): + def some_data(created_at=dlt.sources.incremental("created_at")): data = [{"created_at": created_at.last_value}] yield from data_to_item_format(item_type, data) p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data(created_at=4242)) - s = p.state["sources"][p.default_schema_name]['resources']['some_data']['incremental']['created_at'] - assert s['last_value'] == 4242 + s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][ + "created_at" + ] + assert s["last_value"] == 4242 @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_explicit_incremental_instance(item_type: TDataItemFormat) -> None: - data = [{'inserted_at': 242, 'some_uq': 444}] + data = [{"inserted_at": 242, "some_uq": 444}] source_items = data_to_item_format(item_type, data) - @dlt.resource(primary_key='some_uq') - def some_data(incremental=dlt.sources.incremental('created_at', initial_value=0)): - assert incremental.cursor_path == 'inserted_at' + @dlt.resource(primary_key="some_uq") + def some_data(incremental=dlt.sources.incremental("created_at", initial_value=0)): + assert incremental.cursor_path == "inserted_at" assert incremental.initial_value == 241 yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) - p.extract(some_data(incremental=dlt.sources.incremental('inserted_at', initial_value=241))) + p.extract(some_data(incremental=dlt.sources.incremental("inserted_at", initial_value=241))) @dlt.resource -def some_data_from_config(call_no: int, item_type: TDataItemFormat, created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value): - assert created_at.cursor_path == 'created_at' +def some_data_from_config( + call_no: int, + item_type: TDataItemFormat, + created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value, +): + assert created_at.cursor_path == "created_at" # start value will update to the last_value on next call if call_no == 1: - assert created_at.initial_value == '2022-02-03T00:00:00Z' - assert created_at.start_value == 
'2022-02-03T00:00:00Z' + assert created_at.initial_value == "2022-02-03T00:00:00Z" + assert created_at.start_value == "2022-02-03T00:00:00Z" if call_no == 2: - assert created_at.initial_value == '2022-02-03T00:00:00Z' - assert created_at.start_value == '2022-02-03T00:00:01Z' - data = [{'created_at': '2022-02-03T00:00:01Z'}] + assert created_at.initial_value == "2022-02-03T00:00:00Z" + assert created_at.start_value == "2022-02-03T00:00:01Z" + data = [{"created_at": "2022-02-03T00:00:01Z"}] source_items = data_to_item_format(item_type, data) yield from source_items @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_optional_incremental_from_config(item_type: TDataItemFormat) -> None: - - os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__CURSOR_PATH'] = 'created_at' - os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2022-02-03T00:00:00Z' + os.environ["SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__CURSOR_PATH"] = ( + "created_at" + ) + os.environ["SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__INITIAL_VALUE"] = ( + "2022-02-03T00:00:00Z" + ) p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data_from_config(1, item_type)) @@ -266,7 +286,7 @@ def test_optional_incremental_from_config(item_type: TDataItemFormat) -> None: @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_optional_incremental_not_passed(item_type: TDataItemFormat) -> None: """Resource still runs when no incremental is passed""" - data = [1,2,3] + data = [1, 2, 3] source_items = data_to_item_format(item_type, data) @dlt.resource @@ -283,8 +303,10 @@ class OptionalIncrementalConfig(BaseConfiguration): @dlt.resource(spec=OptionalIncrementalConfig) -def optional_incremental_arg_resource(item_type: TDataItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None) -> Any: - data = [1,2,3] +def optional_incremental_arg_resource( + item_type: TDataItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None +) -> Any: + data = [1, 2, 3] source_items = data_to_item_format(item_type, data) assert incremental is None yield source_items @@ -298,15 +320,17 @@ def test_optional_arg_from_spec_not_passed(item_type: TDataItemFormat) -> None: @configspec class SomeDataOverrideConfiguration(BaseConfiguration): - created_at: dlt.sources.incremental = dlt.sources.incremental('created_at', initial_value='2022-02-03T00:00:00Z') # type: ignore[type-arg] + created_at: dlt.sources.incremental = dlt.sources.incremental("created_at", initial_value="2022-02-03T00:00:00Z") # type: ignore[type-arg] # provide what to inject via spec. 
the spec contain the default @dlt.resource(spec=SomeDataOverrideConfiguration) -def some_data_override_config(item_type: TDataItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value): - assert created_at.cursor_path == 'created_at' - assert created_at.initial_value == '2000-02-03T00:00:00Z' - data = [{'created_at': '2023-03-03T00:00:00Z'}] +def some_data_override_config( + item_type: TDataItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value +): + assert created_at.cursor_path == "created_at" + assert created_at.initial_value == "2000-02-03T00:00:00Z" + data = [{"created_at": "2023-03-03T00:00:00Z"}] source_items = data_to_item_format(item_type, data) yield from source_items @@ -315,7 +339,7 @@ def some_data_override_config(item_type: TDataItemFormat, created_at: dlt.source def test_override_initial_value_from_config(item_type: TDataItemFormat) -> None: # use the shortest possible config version # os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_OVERRIDE_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' - os.environ['CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' + os.environ["CREATED_AT__INITIAL_VALUE"] = "2000-02-03T00:00:00Z" p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data_override_config(item_type)) @@ -323,75 +347,85 @@ def test_override_initial_value_from_config(item_type: TDataItemFormat) -> None: @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_override_primary_key_in_pipeline(item_type: TDataItemFormat) -> None: - """Primary key hint passed to pipeline is propagated through apply_hints - """ - data = [ - {'created_at': 22, 'id': 2, 'other_id': 5}, - {'created_at': 22, 'id': 2, 'other_id': 6} - ] + """Primary key hint passed to pipeline is propagated through apply_hints""" + data = [{"created_at": 22, "id": 2, "other_id": 5}, {"created_at": 22, "id": 2, "other_id": 6}] source_items = data_to_item_format(item_type, data) - @dlt.resource(primary_key='id') - def some_data(created_at=dlt.sources.incremental('created_at')): + @dlt.resource(primary_key="id") + def some_data(created_at=dlt.sources.incremental("created_at")): # TODO: this only works because incremental instance is shared across many copies of the resource - assert some_data.incremental.primary_key == ['id', 'other_id'] + assert some_data.incremental.primary_key == ["id", "other_id"] yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) - p.extract(some_data, primary_key=['id', 'other_id']) + p.extract(some_data, primary_key=["id", "other_id"]) @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_composite_primary_key(item_type: TDataItemFormat) -> None: data = [ - {'created_at': 1, 'isrc': 'AAA', 'market': 'DE'}, - {'created_at': 2, 'isrc': 'BBB', 'market': 'DE'}, - {'created_at': 2, 'isrc': 'CCC', 'market': 'US'}, - {'created_at': 2, 'isrc': 'AAA', 'market': 'DE'}, - {'created_at': 2, 'isrc': 'CCC', 'market': 'DE'}, - {'created_at': 2, 'isrc': 'DDD', 'market': 'DE'}, - {'created_at': 1, 'isrc': 'CCC', 'market': 'DE'}, + {"created_at": 1, "isrc": "AAA", "market": "DE"}, + {"created_at": 2, "isrc": "BBB", "market": "DE"}, + {"created_at": 2, "isrc": "CCC", "market": "US"}, + {"created_at": 2, "isrc": "AAA", "market": "DE"}, + {"created_at": 2, "isrc": "CCC", "market": "DE"}, + {"created_at": 2, "isrc": "DDD", "market": "DE"}, + {"created_at": 1, "isrc": "CCC", "market": "DE"}, ] source_items = data_to_item_format(item_type, data) - @dlt.resource(primary_key=['isrc', 'market']) - def 
some_data(created_at=dlt.sources.incremental('created_at')): + @dlt.resource(primary_key=["isrc", "market"]) + def some_data(created_at=dlt.sources.incremental("created_at")): yield from source_items - p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) + p = dlt.pipeline( + pipeline_name=uniq_id(), destination="duckdb", credentials=duckdb.connect(":memory:") + ) p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: - with c.execute_query("SELECT created_at, isrc, market FROM some_data order by created_at, isrc, market") as cur: + with c.execute_query( + "SELECT created_at, isrc, market FROM some_data order by created_at, isrc, market" + ) as cur: rows = cur.fetchall() - expected = {(1, 'AAA', 'DE'), (2, 'AAA', 'DE'), (2, 'BBB', 'DE'), (2, 'CCC', 'DE'), (2, 'CCC', 'US'), (2, 'DDD', 'DE'), (1, 'CCC', 'DE')} + expected = { + (1, "AAA", "DE"), + (2, "AAA", "DE"), + (2, "BBB", "DE"), + (2, "CCC", "DE"), + (2, "CCC", "US"), + (2, "DDD", "DE"), + (1, "CCC", "DE"), + } assert set(rows) == expected @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_last_value_func_min(item_type: TDataItemFormat) -> None: data = [ - {'created_at': 10}, - {'created_at': 11}, - {'created_at': 9}, - {'created_at': 10}, - {'created_at': 8}, - {'created_at': 22}, + {"created_at": 10}, + {"created_at": 11}, + {"created_at": 9}, + {"created_at": 10}, + {"created_at": 8}, + {"created_at": 22}, ] source_items = data_to_item_format(item_type, data) @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at', last_value_func=min)): + def some_data(created_at=dlt.sources.incremental("created_at", last_value_func=min)): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data()) - s = p.state["sources"][p.default_schema_name]['resources']['some_data']['incremental']['created_at'] + s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][ + "created_at" + ] - assert s['last_value'] == 8 + assert s["last_value"] == 8 def test_last_value_func_custom() -> None: @@ -399,39 +433,43 @@ def last_value(values): return max(values) + 1 @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at', last_value_func=last_value)): - yield {'created_at': 9} - yield {'created_at': 10} + def some_data(created_at=dlt.sources.incremental("created_at", last_value_func=last_value)): + yield {"created_at": 9} + yield {"created_at": 10} p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data()) - s = p.state["sources"][p.default_schema_name]['resources']['some_data']['incremental']['created_at'] - assert s['last_value'] == 11 + s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][ + "created_at" + ] + assert s["last_value"] == 11 @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_cursor_datetime_type(item_type: TDataItemFormat) -> None: initial_value = pendulum.now() data = [ - {'created_at': initial_value + timedelta(minutes=1)}, - {'created_at': initial_value + timedelta(minutes=3)}, - {'created_at': initial_value + timedelta(minutes=2)}, - {'created_at': initial_value + timedelta(minutes=4)}, - {'created_at': initial_value + timedelta(minutes=2)}, + {"created_at": initial_value + timedelta(minutes=1)}, + {"created_at": initial_value + timedelta(minutes=3)}, + {"created_at": initial_value + timedelta(minutes=2)}, + {"created_at": initial_value + timedelta(minutes=4)}, + {"created_at": initial_value + 
timedelta(minutes=2)}, ] source_items = data_to_item_format(item_type, data) @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at', initial_value)): + def some_data(created_at=dlt.sources.incremental("created_at", initial_value)): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data()) - s = p.state["sources"][p.default_schema_name]['resources']['some_data']['incremental']['created_at'] - assert s['last_value'] == initial_value + timedelta(minutes=4) + s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][ + "created_at" + ] + assert s["last_value"] == initial_value + timedelta(minutes=4) @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) @@ -439,21 +477,23 @@ def test_descending_order_unique_hashes(item_type: TDataItemFormat) -> None: """Resource returns items in descending order but using `max` last value function. Only hash matching last_value are stored. """ - data = [{'created_at': i} for i in reversed(range(15, 25))] + data = [{"created_at": i} for i in reversed(range(15, 25))] source_items = data_to_item_format(item_type, data) @dlt.resource - def some_data(created_at=dlt.sources.incremental('created_at', 20)): + def some_data(created_at=dlt.sources.incremental("created_at", 20)): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(some_data()) - s = p.state["sources"][p.default_schema_name]['resources']['some_data']['incremental']['created_at'] + s = p.state["sources"][p.default_schema_name]["resources"]["some_data"]["incremental"][ + "created_at" + ] - last_hash = digest128(json.dumps({'created_at': 24})) + last_hash = digest128(json.dumps({"created_at": 24})) - assert s['unique_hashes'] == [last_hash] + assert s["unique_hashes"] == [last_hash] # make sure nothing is returned on a next run, source will use state from the active pipeline assert list(some_data()) == [] @@ -472,7 +512,7 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): p = dlt.pipeline(pipeline_name=uniq_id()) p.run(some_data, destination="duckdb") # check if default schema contains normalized PK - assert p.default_schema.tables["some_data"]['columns']["del_ta"]['primary_key'] is True + assert p.default_schema.tables["some_data"]["columns"]["del_ta"]["primary_key"] is True with p.sql_client() as c: with c.execute_query("SELECT del_ta FROM some_data") as cur: rows = cur.fetchall() @@ -494,7 +534,6 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_missing_primary_key(item_type: TDataItemFormat) -> None: - @dlt.resource(primary_key="DELTA") def some_data(last_timestamp=dlt.sources.incremental("ts")): data = [{"delta": i, "ts": pendulum.now().add(days=i).timestamp()} for i in range(-10, 10)] @@ -509,6 +548,7 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_missing_cursor_field(item_type: TDataItemFormat) -> None: os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately + @dlt.resource def some_data(last_timestamp=dlt.sources.incremental("item.timestamp")): data = [{"delta": i, "ts": pendulum.now().add(days=i).timestamp()} for i in range(-10, 10)] @@ -527,40 +567,42 @@ def some_data(last_timestamp=dlt.sources.incremental("item.timestamp")): def test_json_path_cursor() -> None: - @dlt.resource def some_data(last_timestamp=dlt.sources.incremental("item.timestamp|modifiedAt")): - yield [{ - "delta": i, - "item": 
{ - "timestamp": pendulum.now().add(days=i).timestamp() - } - } for i in range(-10, 10)] - - yield [{ - "delta": i, - "item": { - "modifiedAt": pendulum.now().add(days=i).timestamp() - } - } for i in range(-10, 10)] + yield [ + {"delta": i, "item": {"timestamp": pendulum.now().add(days=i).timestamp()}} + for i in range(-10, 10) + ] + + yield [ + {"delta": i, "item": {"modifiedAt": pendulum.now().add(days=i).timestamp()}} + for i in range(-10, 10) + ] # path should match both timestamp and modifiedAt in item list(some_data) def test_remove_incremental_with_explicit_none() -> None: - @dlt.resource - def some_data_optional(last_timestamp: Optional[dlt.sources.incremental[float]] = dlt.sources.incremental("item.timestamp")): + def some_data_optional( + last_timestamp: Optional[dlt.sources.incremental[float]] = dlt.sources.incremental( + "item.timestamp" + ), + ): assert last_timestamp is None yield 1 + # we disable incremental by typing the argument as optional assert list(some_data_optional(last_timestamp=None)) == [1] @dlt.resource(standalone=True) - def some_data(last_timestamp: dlt.sources.incremental[float] = dlt.sources.incremental("item.timestamp")): + def some_data( + last_timestamp: dlt.sources.incremental[float] = dlt.sources.incremental("item.timestamp"), + ): assert last_timestamp is None yield 1 + # we'll get the value error with pytest.raises(ValueError): assert list(some_data(last_timestamp=None)) == [1] @@ -571,8 +613,13 @@ def test_filter_processed_items(item_type: TDataItemFormat) -> None: """Checks if already processed items are filtered out""" @dlt.resource - def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): - data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] + def standalone_some_data( + item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp") + ): + data = [ + {"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} + for i in range(-10, 10) + ] source_items = data_to_item_format(item_type, data) yield from source_items @@ -589,9 +636,12 @@ def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dl assert all(v["delta"] >= 0 for v in values) # provide the initial value, use min function - values = list(standalone_some_data( - item_type, last_timestamp=dlt.sources.incremental("timestamp", pendulum.now().timestamp(), min) - )) + values = list( + standalone_some_data( + item_type, + last_timestamp=dlt.sources.incremental("timestamp", pendulum.now().timestamp(), min), + ) + ) values = data_item_to_list(item_type, values) assert len(values) == 10 # the minimum element @@ -602,10 +652,9 @@ def test_start_value_set_to_last_value() -> None: p = dlt.pipeline(pipeline_name=uniq_id()) now = pendulum.now() - @dlt.resource def some_data(step, last_timestamp=dlt.sources.incremental("ts")): - expected_last = now.add(days=step-1) + expected_last = now.add(days=step - 1) if step == -10: assert last_timestamp.start_value is None @@ -628,9 +677,9 @@ def some_data(step, last_timestamp=dlt.sources.incremental("ts")): p.run(r, destination="duckdb") -@pytest.mark.parametrize("item_type", set(ALL_DATA_ITEM_FORMATS) - {'json'}) +@pytest.mark.parametrize("item_type", set(ALL_DATA_ITEM_FORMATS) - {"json"}) def test_start_value_set_to_last_value_arrow(item_type: TDataItemFormat) -> None: - p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb') + p = dlt.pipeline(pipeline_name=uniq_id(), 
destination="duckdb") now = pendulum.now() data = [{"delta": i, "ts": now.add(days=i)} for i in range(-10, 10)] @@ -643,13 +692,13 @@ def some_data(first: bool, last_timestamp=dlt.sources.incremental("ts")): else: # print(last_timestamp.initial_value) # print(now.add(days=step-1).timestamp()) - assert last_timestamp.start_value == last_timestamp.last_value == data[-1]['ts'] + assert last_timestamp.start_value == last_timestamp.last_value == data[-1]["ts"] yield from source_items # after all yielded if first: assert last_timestamp.start_value is None else: - assert last_timestamp.start_value == data[-1]['ts'] == last_timestamp.last_value + assert last_timestamp.start_value == data[-1]["ts"] == last_timestamp.last_value p.run(some_data(True)) p.run(some_data(False)) @@ -661,8 +710,13 @@ def test_replace_resets_state(item_type: TDataItemFormat) -> None: now = pendulum.now() @dlt.resource - def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): - data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] + def standalone_some_data( + item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp") + ): + data = [ + {"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} + for i in range(-10, 10) + ] source_items = data_to_item_format(item_type, data) yield from source_items @@ -674,6 +728,7 @@ def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dl assert len(info.loads_ids) == 1 parent_r = standalone_some_data(item_type, now) + @dlt.transformer(data_from=parent_r, write_disposition="append") def child(item): state = resource_state("child") @@ -715,10 +770,12 @@ def child(item): # print(s.state) # state was reset (child is replace but parent is append! so it will not generate any more items due to incremental # so child will reset itself on replace and never set the state...) 
- assert 'child' not in s.state['resources'] + assert "child" not in s.state["resources"] # there will be a load package to reset the state but also a load package to update the child table - assert len(info.load_packages[0].jobs['completed_jobs']) == 2 - assert {job.job_file_info.table_name for job in info.load_packages[0].jobs['completed_jobs'] } == {"_dlt_pipeline_state", "child"} + assert len(info.load_packages[0].jobs["completed_jobs"]) == 2 + assert { + job.job_file_info.table_name for job in info.load_packages[0].jobs["completed_jobs"] + } == {"_dlt_pipeline_state", "child"} # now we add child that has parent_r as parent but we add another instance of standalone_some_data explicitly # so we have a resource with the same name as child parent but the pipe instance is different @@ -730,12 +787,13 @@ def child(item): @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_incremental_as_transform(item_type: TDataItemFormat) -> None: - now = pendulum.now().timestamp() @dlt.resource def some_data(): - last_value: dlt.sources.incremental[float] = dlt.sources.incremental.from_existing_state("some_data", "ts") + last_value: dlt.sources.incremental[float] = dlt.sources.incremental.from_existing_state( + "some_data", "ts" + ) assert last_value.initial_value == now assert last_value.start_value == now assert last_value.cursor_path == "ts" @@ -768,7 +826,6 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_apply_hints_incremental(item_type: TDataItemFormat) -> None: - p = dlt.pipeline(pipeline_name=uniq_id()) data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] source_items = data_to_item_format(item_type, data) @@ -796,7 +853,7 @@ def some_data(created_at: Optional[dlt.sources.incremental[int]] = None): assert r.state["incremental"]["created_at"]["last_value"] == 1 @dlt.resource - def some_data_w_default(created_at = dlt.sources.incremental("created_at", last_value_func=min)): + def some_data_w_default(created_at=dlt.sources.incremental("created_at", last_value_func=min)): yield source_items # default is overridden by apply hints @@ -822,12 +879,12 @@ def some_data_no_incremental(): def test_last_value_func_on_dict() -> None: - """Test last value which is a dictionary""" + def by_event_type(event): last_value = None if len(event) == 1: - item, = event + (item,) = event else: item, last_value = event @@ -836,12 +893,18 @@ def by_event_type(event): else: last_value = dict(last_value) item_type = item["type"] - last_value[item_type] = max(item["created_at"], last_value.get(item_type, "1970-01-01T00:00:00Z")) + last_value[item_type] = max( + item["created_at"], last_value.get(item_type, "1970-01-01T00:00:00Z") + ) return last_value - @dlt.resource(primary_key="id", table_name=lambda i: i['type']) - def _get_shuffled_events(last_created_at = dlt.sources.incremental("$", last_value_func=by_event_type)): - with open("tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8") as f: + @dlt.resource(primary_key="id", table_name=lambda i: i["type"]) + def _get_shuffled_events( + last_created_at=dlt.sources.incremental("$", last_value_func=by_event_type) + ): + with open( + "tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8" + ) as f: yield json.load(f) with Container().injectable_context(StateInjectableContext(state={})): @@ -866,15 +929,22 @@ def test_timezone_naive_datetime() -> None: pendulum_start_dt = 
pendulum.instance(start_dt) # With timezone @dlt.resource - def some_data(updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sources.incremental('updated_at', pendulum_start_dt)): - data = [{'updated_at': start_dt + timedelta(hours=1)}, {'updated_at': start_dt + timedelta(hours=2)}] + def some_data( + updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sources.incremental( + "updated_at", pendulum_start_dt + ) + ): + data = [ + {"updated_at": start_dt + timedelta(hours=1)}, + {"updated_at": start_dt + timedelta(hours=2)}, + ] yield data pipeline = dlt.pipeline(pipeline_name=uniq_id()) resource = some_data() pipeline.extract(resource) # last value has timezone added - last_value = resource.state['incremental']['updated_at']['last_value'] + last_value = resource.state["incremental"]["updated_at"]["last_value"] assert isinstance(last_value, pendulum.DateTime) assert last_value.tzname() == "UTC" @@ -882,11 +952,13 @@ def some_data(updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sourc @dlt.resource def endless_sequence( item_type: TDataItemFormat, - updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=1) + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", initial_value=1 + ), ) -> Any: max_values = 20 start = updated_at.last_value - data = [{'updated_at': i} for i in range(start, start + max_values)] + data = [{"updated_at": i} for i in range(start, start + max_values)] source_items = data_to_item_format(item_type, data) yield from source_items @@ -895,7 +967,7 @@ def endless_sequence( def test_chunked_ranges(item_type: TDataItemFormat) -> None: """Load chunked ranges with end value along with incremental""" - pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') + pipeline = dlt.pipeline(pipeline_name="incremental_" + uniq_id(), destination="duckdb") chunks = [ # Load some start/end ranges in and out of order @@ -914,23 +986,32 @@ def test_chunked_ranges(item_type: TDataItemFormat) -> None: for start, end in chunks: pipeline.run( - endless_sequence(item_type, updated_at=dlt.sources.incremental(initial_value=start, end_value=end)), - write_disposition='append' + endless_sequence( + item_type, updated_at=dlt.sources.incremental(initial_value=start, end_value=end) + ), + write_disposition="append", ) - expected_range = list(chain( - range(10, 20), - range(20, 30), - range(40, 50), - range(50, 60), - range(60, 61), - range(62, 70), - range(70, 89), - range(89, 109), - )) + expected_range = list( + chain( + range(10, 20), + range(20, 30), + range(40, 50), + range(50, 60), + range(60, 61), + range(62, 70), + range(70, 89), + range(89, 109), + ) + ) with pipeline.sql_client() as client: - items = [row[0] for row in client.execute_sql("SELECT updated_at FROM endless_sequence ORDER BY updated_at")] + items = [ + row[0] + for row in client.execute_sql( + "SELECT updated_at FROM endless_sequence ORDER BY updated_at" + ) + ] assert items == expected_range @@ -938,57 +1019,73 @@ def test_chunked_ranges(item_type: TDataItemFormat) -> None: @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_end_value_with_batches(item_type: TDataItemFormat) -> None: """Ensure incremental with end_value works correctly when resource yields lists instead of single items""" + @dlt.resource def batched_sequence( - updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=1) + updated_at: dlt.sources.incremental[int] = 
dlt.sources.incremental( + "updated_at", initial_value=1 + ) ) -> Any: start = updated_at.last_value - data = [{'updated_at': i} for i in range(start, start + 12)] + data = [{"updated_at": i} for i in range(start, start + 12)] yield data_to_item_format(item_type, data) - data = [{'updated_at': i} for i in range(start+12, start + 20)] + data = [{"updated_at": i} for i in range(start + 12, start + 20)] yield data_to_item_format(item_type, data) - pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') + pipeline = dlt.pipeline(pipeline_name="incremental_" + uniq_id(), destination="duckdb") pipeline.run( batched_sequence(updated_at=dlt.sources.incremental(initial_value=1, end_value=10)), - write_disposition='append' + write_disposition="append", ) with pipeline.sql_client() as client: - items = [row[0] for row in client.execute_sql("SELECT updated_at FROM batched_sequence ORDER BY updated_at")] + items = [ + row[0] + for row in client.execute_sql( + "SELECT updated_at FROM batched_sequence ORDER BY updated_at" + ) + ] assert items == list(range(1, 10)) pipeline.run( batched_sequence(updated_at=dlt.sources.incremental(initial_value=10, end_value=14)), - write_disposition='append' + write_disposition="append", ) with pipeline.sql_client() as client: - items = [row[0] for row in client.execute_sql("SELECT updated_at FROM batched_sequence ORDER BY updated_at")] + items = [ + row[0] + for row in client.execute_sql( + "SELECT updated_at FROM batched_sequence ORDER BY updated_at" + ) + ] assert items == list(range(1, 14)) @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_load_with_end_value_does_not_write_state(item_type: TDataItemFormat) -> None: - """When loading chunk with initial/end value range. The resource state is untouched. - """ - pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') + """When loading chunk with initial/end value range. 
The resource state is untouched.""" + pipeline = dlt.pipeline(pipeline_name="incremental_" + uniq_id(), destination="duckdb") - pipeline.extract(endless_sequence(item_type, updated_at=dlt.sources.incremental(initial_value=20, end_value=30))) + pipeline.extract( + endless_sequence( + item_type, updated_at=dlt.sources.incremental(initial_value=20, end_value=30) + ) + ) - assert pipeline.state.get('sources') is None + assert pipeline.state.get("sources") is None @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_end_value_initial_value_errors(item_type: TDataItemFormat) -> None: @dlt.resource def some_data( - updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at') + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at"), ) -> Any: - yield {'updated_at': 1} + yield {"updated_at": 1} # end_value without initial_value with pytest.raises(ConfigurationValueError) as ex: @@ -1000,33 +1097,55 @@ def some_data( with pytest.raises(ConfigurationValueError) as ex: list(some_data(updated_at=dlt.sources.incremental(initial_value=42, end_value=22))) - assert str(ex.value).startswith("Incremental 'initial_value' (42) is higher than 'end_value` (22)") + assert str(ex.value).startswith( + "Incremental 'initial_value' (42) is higher than 'end_value` (22)" + ) # max function and end_value higher than initial_value with pytest.raises(ConfigurationValueError) as ex: - list(some_data(updated_at=dlt.sources.incremental(initial_value=22, end_value=42, last_value_func=min))) + list( + some_data( + updated_at=dlt.sources.incremental( + initial_value=22, end_value=42, last_value_func=min + ) + ) + ) - assert str(ex.value).startswith("Incremental 'initial_value' (22) is lower than 'end_value` (42).") + assert str(ex.value).startswith( + "Incremental 'initial_value' (22) is lower than 'end_value` (42)." 
+ ) def custom_last_value(items): return max(items) # custom function which evaluates end_value lower than initial with pytest.raises(ConfigurationValueError) as ex: - list(some_data(updated_at=dlt.sources.incremental(initial_value=42, end_value=22, last_value_func=custom_last_value))) + list( + some_data( + updated_at=dlt.sources.incremental( + initial_value=42, end_value=22, last_value_func=custom_last_value + ) + ) + ) - assert "The result of 'custom_last_value([end_value, initial_value])' must equal 'end_value'" in str(ex.value) + assert ( + "The result of 'custom_last_value([end_value, initial_value])' must equal 'end_value'" + in str(ex.value) + ) @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_out_of_range_flags(item_type: TDataItemFormat) -> None: """Test incremental.start_out_of_range / end_out_of_range flags are set when items are filtered out""" + @dlt.resource def descending( - updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=10) + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", initial_value=10 + ) ) -> Any: for chunk in chunks(list(reversed(range(48))), 10): - data = [{'updated_at': i} for i in chunk] + data = [{"updated_at": i} for i in chunk] yield data_to_item_format(item_type, data) # Assert flag is set only on the first item < initial_value if all(item > 9 for item in chunk): @@ -1037,10 +1156,12 @@ def descending( @dlt.resource def ascending( - updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=22, end_value=45) + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", initial_value=22, end_value=45 + ) ) -> Any: for chunk in chunks(list(range(22, 500)), 10): - data = [{'updated_at': i} for i in chunk] + data = [{"updated_at": i} for i in chunk] yield data_to_item_format(item_type, data) # Flag is set only when end_value is reached if all(item < 45 for item in chunk): @@ -1049,15 +1170,16 @@ def ascending( assert updated_at.end_out_of_range is True return - @dlt.resource def descending_single_item( - updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=10) + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", initial_value=10 + ) ) -> Any: for i in reversed(range(14)): - data = [{'updated_at': i}] + data = [{"updated_at": i}] yield from data_to_item_format(item_type, data) - yield {'updated_at': i} + yield {"updated_at": i} if i >= 10: assert updated_at.start_out_of_range is False else: @@ -1066,10 +1188,12 @@ def descending_single_item( @dlt.resource def ascending_single_item( - updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=10, end_value=22) + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", initial_value=10, end_value=22 + ) ) -> Any: for i in range(10, 500): - data = [{'updated_at': i}] + data = [{"updated_at": i}] yield from data_to_item_format(item_type, data) if i < 22: assert updated_at.end_out_of_range is False @@ -1077,7 +1201,7 @@ def ascending_single_item( assert updated_at.end_out_of_range is True return - pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') + pipeline = dlt.pipeline(pipeline_name="incremental_" + uniq_id(), destination="duckdb") pipeline.extract(descending()) @@ -1087,19 +1211,27 @@ def ascending_single_item( pipeline.extract(ascending_single_item()) + 
@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_get_incremental_value_type(item_type: TDataItemFormat) -> None: assert dlt.sources.incremental("id").get_incremental_value_type() is Any assert dlt.sources.incremental("id", initial_value=0).get_incremental_value_type() is int assert dlt.sources.incremental("id", initial_value=None).get_incremental_value_type() is Any assert dlt.sources.incremental[int]("id").get_incremental_value_type() is int - assert dlt.sources.incremental[pendulum.DateTime]("id").get_incremental_value_type() is pendulum.DateTime + assert ( + dlt.sources.incremental[pendulum.DateTime]("id").get_incremental_value_type() + is pendulum.DateTime + ) # typing has precedence assert dlt.sources.incremental[pendulum.DateTime]("id", initial_value=1).get_incremental_value_type() is pendulum.DateTime # type: ignore[arg-type] # pass default value @dlt.resource - def test_type(updated_at = dlt.sources.incremental[str]("updated_at", allow_external_schedulers=True)): # noqa: B008 + def test_type( + updated_at=dlt.sources.incremental[str]( # noqa: B008 + "updated_at", allow_external_schedulers=True + ) + ): data = [{"updated_at": d} for d in [1, 2, 3]] yield data_to_item_format(item_type, data) @@ -1109,7 +1241,11 @@ def test_type(updated_at = dlt.sources.incremental[str]("updated_at", allow_exte # use annotation @dlt.resource - def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): + def test_type_2( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", allow_external_schedulers=True + ) + ): data = [{"updated_at": d} for d in [1, 2, 3]] yield data_to_item_format(item_type, data) @@ -1129,7 +1265,9 @@ def test_type_3(updated_at: dlt.sources.incremental[int]): # pass explicit value overriding default that is typed @dlt.resource - def test_type_4(updated_at = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): + def test_type_4( + updated_at=dlt.sources.incremental("updated_at", allow_external_schedulers=True) + ): data = [{"updated_at": d} for d in [1, 2, 3]] yield data_to_item_format(item_type, data) @@ -1139,7 +1277,9 @@ def test_type_4(updated_at = dlt.sources.incremental("updated_at", allow_externa # no generic type information @dlt.resource - def test_type_5(updated_at = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): + def test_type_5( + updated_at=dlt.sources.incremental("updated_at", allow_external_schedulers=True) + ): data = [{"updated_at": d} for d in [1, 2, 3]] yield data_to_item_format(item_type, data) @@ -1151,33 +1291,45 @@ def test_type_5(updated_at = dlt.sources.incremental("updated_at", allow_externa @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_join_env_scheduler(item_type: TDataItemFormat) -> None: @dlt.resource - def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): + def test_type_2( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", allow_external_schedulers=True + ) + ): data = [{"updated_at": d} for d in [1, 2, 3]] yield data_to_item_format(item_type, data) result = list(test_type_2()) - assert data_item_to_list(item_type, result) == [{'updated_at': 1}, {'updated_at': 2}, {'updated_at': 3}] + assert data_item_to_list(item_type, result) == [ + {"updated_at": 1}, + {"updated_at": 2}, + {"updated_at": 3}, + ] # set start and end values 
os.environ["DLT_START_VALUE"] = "2" result = list(test_type_2()) - assert data_item_to_list(item_type, result) == [{'updated_at': 2}, {'updated_at': 3}] + assert data_item_to_list(item_type, result) == [{"updated_at": 2}, {"updated_at": 3}] os.environ["DLT_END_VALUE"] = "3" result = list(test_type_2()) - assert data_item_to_list(item_type, result) == [{'updated_at': 2}] + assert data_item_to_list(item_type, result) == [{"updated_at": 2}] @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_join_env_scheduler_pipeline(item_type: TDataItemFormat) -> None: @dlt.resource - def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): + def test_type_2( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental( + "updated_at", allow_external_schedulers=True + ) + ): data = [{"updated_at": d} for d in [1, 2, 3]] yield data_to_item_format(item_type, data) - pip_1_name = 'incremental_' + uniq_id() - pipeline = dlt.pipeline(pipeline_name=pip_1_name, destination='duckdb') + pip_1_name = "incremental_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pip_1_name, destination="duckdb") r = test_type_2() - r.add_step(AssertItems([{'updated_at': 2}, {'updated_at': 3}], item_type)) + r.add_step(AssertItems([{"updated_at": 2}, {"updated_at": 3}], item_type)) os.environ["DLT_START_VALUE"] = "2" pipeline.extract(r) # state is saved next extract has no items @@ -1188,18 +1340,20 @@ def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.increment # setting end value will stop using state os.environ["DLT_END_VALUE"] = "3" r = test_type_2() - r.add_step(AssertItems([{'updated_at': 2}], item_type)) + r.add_step(AssertItems([{"updated_at": 2}], item_type)) pipeline.extract(r) r = test_type_2() os.environ["DLT_START_VALUE"] = "1" - r.add_step(AssertItems([{'updated_at': 1}, {'updated_at': 2}], item_type)) + r.add_step(AssertItems([{"updated_at": 1}, {"updated_at": 2}], item_type)) pipeline.extract(r) @pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) def test_allow_external_schedulers(item_type: TDataItemFormat) -> None: @dlt.resource() - def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at")): + def test_type_2( + updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at"), + ): data = [{"updated_at": d} for d in [1, 2, 3]] yield data_to_item_format(item_type, data) diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index aae95e0a3f..0a2742f04d 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -12,10 +12,17 @@ from dlt.extract import DltResource, DltSource, Incremental from dlt.extract.source import DltResourceDict -from dlt.extract.exceptions import (DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, - InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, - InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, - ParametrizedResourceUnbound, ResourcesNotFoundError) +from dlt.extract.exceptions import ( + DataItemRequiredForDynamicTableHints, + InconsistentTableTemplate, + InvalidParentResourceDataType, + InvalidParentResourceIsAFunction, + InvalidResourceDataTypeMultiplePipes, + InvalidTransformerDataTypeGeneratorFunctionRequired, + InvalidTransformerGeneratorFunction, + ParametrizedResourceUnbound, + ResourcesNotFoundError, +) from dlt.extract.pipe import Pipe @@ -25,8 +32,7 
@@ def test_call_data_resource() -> None: def test_parametrized_resource() -> None: - - def parametrized(p1, /, p2, *, p3 = None): + def parametrized(p1, /, p2, *, p3=None): assert p1 == "p1" assert p2 == 1 assert p3 is None @@ -71,8 +77,7 @@ def parametrized(p1, /, p2, *, p3 = None): def test_parametrized_transformer() -> None: - - def good_transformer(item, /, p1, p2, *, p3 = None): + def good_transformer(item, /, p1, p2, *, p3=None): assert p1 == "p1" assert p2 == 2 assert p3 is None @@ -140,9 +145,9 @@ def bad_transformer_3(*, item): def assert_items(_items: TDataItems) -> None: # 2 items yielded * p2=2 - assert len(_items) == 2*2 - assert _items[0] == {'wrap': 'itemX', 'mark': 'p1', 'iter': 0} - assert _items[3] == {'wrap': 'itemY', 'mark': 'p1', 'iter': 1} + assert len(_items) == 2 * 2 + assert _items[0] == {"wrap": "itemX", "mark": "p1", "iter": 0} + assert _items[3] == {"wrap": "itemY", "mark": "p1", "iter": 1} assert_items(items) @@ -154,7 +159,6 @@ def assert_items(_items: TDataItems) -> None: def test_resource_bind_when_in_source() -> None: - @dlt.resource def parametrized(_range: int): yield list(range(_range)) @@ -191,7 +195,6 @@ def test_source(): def test_resource_bind_call_forms() -> None: - @dlt.resource def returns_res(_input): # resource returning resource @@ -232,7 +235,6 @@ def regular(_input): b_returns_pipe = returns_pipe("ABCA") assert len(b_returns_pipe._pipe) == 1 - @dlt.source def test_source(): return returns_res, returns_pipe, regular @@ -245,7 +247,7 @@ def test_source(): assert s.regular._pipe is not regular._pipe # will repeat each string 3 times - s.regular.add_map(lambda i: i*3) + s.regular.add_map(lambda i: i * 3) assert len(regular._pipe) == 2 assert len(s.regular._pipe) == 3 @@ -256,14 +258,14 @@ def test_source(): assert list(s.regular) == ["AAA", "AAA", "AAA"] # binding resource that returns resource will replace the object content, keeping the object id - s.returns_res.add_map(lambda i: i*3) + s.returns_res.add_map(lambda i: i * 3) s.returns_res.bind(["X", "Y", "Z"]) # got rid of all mapping and filter functions assert len(s.returns_res._pipe) == 1 assert list(s.returns_res) == ["X", "Y", "Z"] # same for resource returning pipe - s.returns_pipe.add_map(lambda i: i*3) + s.returns_pipe.add_map(lambda i: i * 3) s.returns_pipe.bind(["X", "Y", "M"]) # got rid of all mapping and filter functions assert len(s.returns_pipe._pipe) == 1 @@ -271,12 +273,11 @@ def test_source(): # s.regular is exhausted so set it again # add lambda that after filtering for A, will multiply it by 4 - s.resources["regular"] = regular.add_map(lambda i: i*4)(["A", "Y"]) - assert list(s) == ['X', 'Y', 'Z', 'X', 'Y', 'M', 'AAAA'] + s.resources["regular"] = regular.add_map(lambda i: i * 4)(["A", "Y"]) + assert list(s) == ["X", "Y", "Z", "X", "Y", "M", "AAAA"] def test_call_clone_separate_pipe() -> None: - all_yields = [] def some_data_gen(param: str): @@ -297,14 +298,13 @@ def some_data(param: str): def test_resource_bind_lazy_eval() -> None: - @dlt.resource def needs_param(param): yield from range(param) @dlt.transformer(data_from=needs_param(3)) def tx_form(item, multi): - yield item*multi + yield item * multi @dlt.transformer(data_from=tx_form(2)) def tx_form_fin(item, div): @@ -312,7 +312,7 @@ def tx_form_fin(item, div): @dlt.transformer(data_from=needs_param) def tx_form_dir(item, multi): - yield item*multi + yield item * multi # tx_form takes data from needs_param(3) which is lazily evaluated assert list(tx_form(2)) == [0, 2, 4] @@ -320,8 +320,8 @@ def tx_form_dir(item, multi): 
assert list(tx_form(2)) == [0, 2, 4] # same for tx_form_fin - assert list(tx_form_fin(3)) == [0, 2/3, 4/3] - assert list(tx_form_fin(3)) == [0, 2/3, 4/3] + assert list(tx_form_fin(3)) == [0, 2 / 3, 4 / 3] + assert list(tx_form_fin(3)) == [0, 2 / 3, 4 / 3] # binding `needs_param`` in place will not affect the tx_form and tx_form_fin (they operate on copies) needs_param.bind(4) @@ -335,7 +335,6 @@ def tx_form_dir(item, multi): def test_transformer_preliminary_step() -> None: - def yield_twice(item): yield item.upper() yield item.upper() @@ -344,13 +343,20 @@ def yield_twice(item): # filter out small caps and insert this before the head tx_stage.add_filter(lambda letter: letter.isupper(), 0) # be got filtered out before duplication - assert list(dlt.resource(["A", "b", "C"], name="data") | tx_stage) == ['A', 'A', 'C', 'C'] + assert list(dlt.resource(["A", "b", "C"], name="data") | tx_stage) == ["A", "A", "C", "C"] # filter after duplication tx_stage = dlt.transformer()(yield_twice)() tx_stage.add_filter(lambda letter: letter.isupper()) # nothing is filtered out: on duplicate we also capitalize so filter does not trigger - assert list(dlt.resource(["A", "b", "C"], name="data") | tx_stage) == ['A', 'A', 'B', 'B', 'C', 'C'] + assert list(dlt.resource(["A", "b", "C"], name="data") | tx_stage) == [ + "A", + "A", + "B", + "B", + "C", + "C", + ] def test_set_table_name() -> None: @@ -363,7 +369,6 @@ def test_set_table_name() -> None: def test_select_resources() -> None: - @dlt.source def test_source(no_resources): for i in range(no_resources): @@ -389,7 +394,11 @@ def test_source(no_resources): s_sel = s.with_resources("resource_1", "resource_7") # returns a clone assert s is not s_sel - assert list(s_sel.selected_resources) == ["resource_1", "resource_7"] == list(s_sel.resources.selected) + assert ( + list(s_sel.selected_resources) + == ["resource_1", "resource_7"] + == list(s_sel.resources.selected) + ) assert list(s_sel.resources) == all_resource_names info = str(s_sel) assert "resource resource_0 is not selected" in info @@ -407,7 +416,6 @@ def test_source(no_resources): def test_clone_source() -> None: @dlt.source def test_source(no_resources): - def _gen(i): yield "A" * i @@ -426,7 +434,7 @@ def _gen(i): # but we keep pipe names assert s.resources[name].name == clone_s.resources[name].name - assert list(s) == ['', 'A', 'AA', 'AAA'] + assert list(s) == ["", "A", "AA", "AAA"] # we expired generators assert list(clone_s) == [] @@ -434,7 +442,6 @@ def _gen(i): @dlt.source # type: ignore[no-redef] def test_source(no_resources): - def _gen(i): yield "A" * i @@ -449,15 +456,13 @@ def _gen(i): clone_s.resources[name].bind(idx) # now thanks to late eval both sources evaluate separately - assert list(s) == ['', 'A', 'AA', 'AAA'] - assert list(clone_s) == ['', 'A', 'AA', 'AAA'] + assert list(s) == ["", "A", "AA", "AAA"] + assert list(clone_s) == ["", "A", "AA", "AAA"] def test_multiple_parametrized_transformers() -> None: - @dlt.source def _source(test_set: int = 1): - @dlt.resource(selected=False) def _r1(): yield ["a", "b", "c"] @@ -468,7 +473,7 @@ def _t1(items, suffix): @dlt.transformer(data_from=_t1) def _t2(items, mul): - yield items*mul + yield items * mul if test_set == 1: return _r1, _t1, _t2 @@ -481,8 +486,7 @@ def _t2(items, mul): # true pipelining fun return _r1() | _t1("2") | _t2(2) - - expected_data = ['a_2', 'b_2', 'c_2', 'a_2', 'b_2', 'c_2'] + expected_data = ["a_2", "b_2", "c_2", "a_2", "b_2", "c_2"] # this s contains all resources s = _source(1) @@ -543,7 +547,6 @@ def _t2(items, mul): 
def test_extracted_resources_selector() -> None: @dlt.source def _source(test_set: int = 1): - @dlt.resource(selected=False, write_disposition="append") def _r1(): yield ["a", "b", "c"] @@ -554,7 +557,7 @@ def _t1(items, suffix): @dlt.transformer(data_from=_r1, write_disposition="merge") def _t2(items, mul): - yield items*mul + yield items * mul if test_set == 1: return _r1, _t1, _t2 @@ -592,10 +595,8 @@ def _t2(items, mul): def test_source_decompose() -> None: - @dlt.source def _source(): - @dlt.resource(selected=True) def _r_init(): yield ["-", "x", "!"] @@ -610,18 +611,18 @@ def _t1(items, suffix): @dlt.transformer(data_from=_r1) def _t2(items, mul): - yield items*mul + yield items * mul @dlt.transformer(data_from=_r1) def _t3(items, mul): for item in items: - yield item.upper()*mul + yield item.upper() * mul # add something to init @dlt.transformer(data_from=_r_init) def _t_init_post(items): for item in items: - yield item*2 + yield item * 2 @dlt.resource def _r_isolee(): @@ -644,7 +645,14 @@ def _r_isolee(): # keeps order of resources inside # here we didn't eliminate (_r_init, _r_init) as this not impacts decomposition, however this edge is not necessary - assert _source().resources.selected_dag == [("_r_init", "_r_init"), ("_r_init", "_t_init_post"), ('_r1', '_t1'), ('_r1', '_t2'), ('_r1', '_t3'), ('_r_isolee', '_r_isolee')] + assert _source().resources.selected_dag == [ + ("_r_init", "_r_init"), + ("_r_init", "_t_init_post"), + ("_r1", "_t1"), + ("_r1", "_t2"), + ("_r1", "_t3"), + ("_r_isolee", "_r_isolee"), + ] components = _source().decompose("scc") # first element contains _r_init assert "_r_init" in components[0].resources.selected.keys() @@ -688,7 +696,6 @@ def _gen(): @dlt.resource def res_in_res(table_name, w_d): - def _gen(s): yield from s @@ -696,7 +703,6 @@ def _gen(s): def test_resource_returning_resource() -> None: - @dlt.source def source_r_in_r(): yield res_in_res @@ -729,6 +735,7 @@ def test_source_resource_attrs_with_conflicting_attrs() -> None: """Resource names that conflict with DltSource attributes do not work with attribute access""" dlt.pipeline(full_refresh=True) # Create pipeline so state property can be accessed names = ["state", "resources", "schema", "name", "clone"] + @dlt.source def test_source() -> Iterator[DltResource]: for name in names: @@ -745,13 +752,19 @@ def test_source() -> Iterator[DltResource]: def test_add_transform_steps() -> None: # add all step types, using indexes. 
final steps # gen -> map that converts to str and multiplies character -> filter str of len 2 -> yield all characters in str separately - r = dlt.resource([1, 2, 3, 4], name="all").add_limit(3).add_yield_map(lambda i: (yield from i)).add_map(lambda i: str(i) * i, 1).add_filter(lambda i: len(i) == 2, 2) + r = ( + dlt.resource([1, 2, 3, 4], name="all") + .add_limit(3) + .add_yield_map(lambda i: (yield from i)) + .add_map(lambda i: str(i) * i, 1) + .add_filter(lambda i: len(i) == 2, 2) + ) assert list(r) == ["2", "2"] def test_add_transform_steps_pipe() -> None: r = dlt.resource([1, 2, 3], name="all") | (lambda i: str(i) * i) | (lambda i: (yield from i)) - assert list(r) == ['1', '2', '2', '3', '3', '3'] + assert list(r) == ["1", "2", "2", "3", "3", "3"] def test_limit_infinite_counter() -> None: @@ -760,7 +773,6 @@ def test_limit_infinite_counter() -> None: def test_limit_source() -> None: - def mul_c(item): yield from "A" * (item + 2) @@ -772,11 +784,10 @@ def infinite_source(): yield r | dlt.transformer(name=f"mul_c_{idx}")(mul_c) # transformer is not limited to 2 elements, infinite resource is, we have 3 resources - assert list(infinite_source().add_limit(2)) == ['A', 'A', 0, 'A', 'A', 'A', 1] * 3 + assert list(infinite_source().add_limit(2)) == ["A", "A", 0, "A", "A", "A", 1] * 3 def test_source_state() -> None: - @dlt.source def test_source(expected_state): assert source_state() == expected_state @@ -786,17 +797,16 @@ def test_source(expected_state): test_source({}).state dlt.pipeline(full_refresh=True) - assert test_source({}).state == {} + assert test_source({}).state == {} # inject state to see if what we write in state is there with Container().injectable_context(StateInjectableContext(state={})) as state: test_source({}).state["value"] = 1 # type: ignore[index] test_source({"value": 1}) - assert state.state == {'sources': {'test_source': {'value': 1}}} + assert state.state == {"sources": {"test_source": {"value": 1}}} def test_resource_state() -> None: - @dlt.resource def test_resource(): yield [1, 2, 3] @@ -827,10 +837,14 @@ def test_source(): # resource section is current module print(state.state) # the resource that is a part of the source will create a resource state key in the source state key - assert state.state["sources"]["schema_section"] == {'resources': {'test_resource': {'in-source': True}}} - assert s.state == {'resources': {'test_resource': {'in-source': True}}} + assert state.state["sources"]["schema_section"] == { + "resources": {"test_resource": {"in-source": True}} + } + assert s.state == {"resources": {"test_resource": {"in-source": True}}} # the standalone resource will create key which is default schema name - assert state.state["sources"][p._make_schema_with_default_name().name] == {'resources': {'test_resource': {'direct': True}}} + assert state.state["sources"][p._make_schema_with_default_name().name] == { + "resources": {"test_resource": {"direct": True}} + } # def test_add_resources_to_source_simple() -> None: @@ -842,7 +856,7 @@ def input_gen(): yield from [1, 2, 3] def tx_step(item): - return item*2 + return item * 2 res_dict = DltResourceDict("source", "section") input_r = DltResource.from_data(input_gen) @@ -872,10 +886,9 @@ def tx_step(item): assert input_r_orig_pipe == input_r._pipe assert input_tx_orig_pipe == input_tx._pipe - # add all together res_dict = DltResourceDict("source", "section") - res_dict.add(input_r , input_r | input_tx) + res_dict.add(input_r, input_r | input_tx) assert res_dict._new_pipes == [] assert 
res_dict._suppress_clone_on_setitem is False assert res_dict["input_gen"]._pipe is res_dict["tx_step"]._pipe.parent @@ -883,7 +896,6 @@ def tx_step(item): assert input_r_orig_pipe == input_r._pipe assert input_tx_orig_pipe == input_tx._pipe - # replace existing resource which has the old pipe res_dict["input_gen"] = input_r # an existing clone got assigned @@ -900,8 +912,6 @@ def tx_step(item): assert input_r_orig_pipe == input_r._pipe assert input_tx_orig_pipe == input_tx._pipe - - # can't set with different name than resource really has with pytest.raises(ValueError): res_dict["input_gen_x"] = input_r.with_name("uniq") @@ -917,7 +927,6 @@ def test_add_transformer_to_source(add_mode: str) -> None: def number_gen(init): yield from range(init, init + 5) - @dlt.source def number_source(): return number_gen @@ -926,7 +935,7 @@ def number_source(): @dlt.transformer def multiplier(item): - return item*2 + return item * 2 mul_pipe = source.numbers | multiplier() @@ -951,7 +960,6 @@ def test_unknown_resource_access() -> None: def number_gen(init): yield from range(init, init + 5) - @dlt.source def number_source(): return number_gen @@ -1009,7 +1017,6 @@ def multiplier(number, mul): def test_source_multiple_iterations() -> None: - def some_data(): yield [1, 2, 3] yield [1, 2, 3] @@ -1024,23 +1031,31 @@ def some_data(): def test_exhausted_property() -> None: - # this example will be exhausted after iteration def open_generator_data(): yield from [1, 2, 3, 4] + s = DltSource(Schema("source"), "module", [dlt.resource(open_generator_data())]) assert s.exhausted is False assert next(iter(s)) == 1 assert s.exhausted is True # lists will not exhaust - s = DltSource(Schema("source"), "module", [dlt.resource([1, 2, 3, 4], table_name="table", name="resource")]) + s = DltSource( + Schema("source"), + "module", + [dlt.resource([1, 2, 3, 4], table_name="table", name="resource")], + ) assert s.exhausted is False assert next(iter(s)) == 1 assert s.exhausted is False # iterators will not exhaust - s = DltSource(Schema("source"), "module", [dlt.resource(iter([1, 2, 3, 4]), table_name="table", name="resource")]) + s = DltSource( + Schema("source"), + "module", + [dlt.resource(iter([1, 2, 3, 4]), table_name="table", name="resource")], + ) assert s.exhausted is False assert next(iter(s)) == 1 assert s.exhausted is False @@ -1048,23 +1063,31 @@ def open_generator_data(): # having on exhausted generator resource will make the whole source exhausted def open_generator_data(): # type: ignore[no-redef] yield from [1, 2, 3, 4] - s = DltSource(Schema("source"), "module", [ dlt.resource([1, 2, 3, 4], table_name="table", name="resource"), dlt.resource(open_generator_data())]) + + s = DltSource( + Schema("source"), + "module", + [ + dlt.resource([1, 2, 3, 4], table_name="table", name="resource"), + dlt.resource(open_generator_data()), + ], + ) assert s.exhausted is False # execute the whole source list(s) assert s.exhausted is True - # source with transformers also exhausts @dlt.source def mysource(): r = dlt.resource(itertools.count(start=1), name="infinity").add_limit(5) yield r yield r | dlt.transformer(name="double")(lambda x: x * 2) + s = mysource() assert s.exhausted is False - assert next(iter(s)) == 2 # transformer is returned befor resource + assert next(iter(s)) == 2 # transformer is returned befor resource assert s.exhausted is True @@ -1077,7 +1100,6 @@ def _r1(): def _t1(items, suffix): yield list(map(lambda i: i + "_" + suffix, items)) - r1 = _r1() r1_clone = r1.with_name("r1_clone") # new name of resource and 
pipe @@ -1100,8 +1122,8 @@ def _t1(items, suffix): assert bound_t1_clone_2._pipe.parent is bound_t1_clone._pipe.parent # evaluate transformers - assert list(bound_t1_clone) == ['a_ax', 'b_ax', 'c_ax'] - assert list(bound_t1_clone_2) == ['a_ax_2', 'b_ax_2', 'c_ax_2'] + assert list(bound_t1_clone) == ["a_ax", "b_ax", "c_ax"] + assert list(bound_t1_clone_2) == ["a_ax_2", "b_ax_2", "c_ax_2"] # clone pipes (bound transformer) pipe_r1 = _r1() @@ -1144,7 +1166,13 @@ def _t1(items, suffix): def test_apply_hints() -> None: def empty_gen(): yield [1, 2, 3] - empty_table_schema = {"name": "empty_gen", 'columns': {}, 'resource': 'empty_gen', 'write_disposition': 'append'} + + empty_table_schema = { + "name": "empty_gen", + "columns": {}, + "resource": "empty_gen", + "write_disposition": "append", + } empty = DltResource.from_data(empty_gen) @@ -1164,19 +1192,44 @@ def empty_gen(): empty_r.write_disposition = "append" assert empty_r.compute_table_schema()["write_disposition"] == "append" - empty_r.apply_hints(table_name="table", parent_table_name="parent", primary_key=["a", "b"], merge_key=["c", "a"], schema_contract="freeze") + empty_r.apply_hints( + table_name="table", + parent_table_name="parent", + primary_key=["a", "b"], + merge_key=["c", "a"], + schema_contract="freeze", + ) table = empty_r.compute_table_schema() - assert table["columns"]["a"] == {'merge_key': True, 'name': 'a', 'nullable': False, 'primary_key': True} - assert table["columns"]["b"] == {'name': 'b', 'nullable': False, 'primary_key': True} - assert table["columns"]["c"] == {'merge_key': True, 'name': 'c', 'nullable': False} + assert table["columns"]["a"] == { + "merge_key": True, + "name": "a", + "nullable": False, + "primary_key": True, + } + assert table["columns"]["b"] == {"name": "b", "nullable": False, "primary_key": True} + assert table["columns"]["c"] == {"merge_key": True, "name": "c", "nullable": False} assert table["name"] == "table" assert table["parent"] == "parent" assert empty_r.table_name == "table" assert table["schema_contract"] == "freeze" # reset - empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}, incremental=Incremental.EMPTY, schema_contract={}) - assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append', 'original_columns': {}} + empty_r.apply_hints( + table_name="", + parent_table_name="", + primary_key=[], + merge_key="", + columns={}, + incremental=Incremental.EMPTY, + schema_contract={}, + ) + assert empty_r._table_schema_template == { + "columns": {}, + "incremental": None, + "validator": None, + "write_disposition": "append", + "original_columns": {}, + } table = empty_r.compute_table_schema() assert table["name"] == "empty_gen" assert "parent" not in table @@ -1185,11 +1238,20 @@ def empty_gen(): # combine columns with primary key empty_r = empty() - empty_r.apply_hints(columns={"tags": {"data_type": "complex", "primary_key": False}}, primary_key="tags", merge_key="tags") + empty_r.apply_hints( + columns={"tags": {"data_type": "complex", "primary_key": False}}, + primary_key="tags", + merge_key="tags", + ) # primary key not set here assert empty_r.columns["tags"] == {"data_type": "complex", "name": "tags", "primary_key": False} # only in the computed table - assert empty_r.compute_table_schema()["columns"]["tags"] == {"data_type": "complex", "name": "tags", "primary_key": True, "merge_key": True} + assert empty_r.compute_table_schema()["columns"]["tags"] == { + "data_type": "complex", + 
"name": "tags", + "primary_key": True, + "merge_key": True, + } def test_apply_dynamic_hints() -> None: @@ -1214,17 +1276,23 @@ def empty_gen(): # try write disposition and primary key empty_r.apply_hints(primary_key=lambda ev: ev["pk"], write_disposition=lambda ev: ev["wd"]) - table = empty_r.compute_table_schema({"t": "table", "p": "parent", "pk": ["a", "b"], "wd": "skip"}) + table = empty_r.compute_table_schema( + {"t": "table", "p": "parent", "pk": ["a", "b"], "wd": "skip"} + ) assert table["write_disposition"] == "skip" assert "a" in table["columns"] # validate fails with pytest.raises(DictValidationException): - empty_r.compute_table_schema({"t": "table", "p": "parent", "pk": ["a", "b"], "wd": "x-skip"}) + empty_r.compute_table_schema( + {"t": "table", "p": "parent", "pk": ["a", "b"], "wd": "x-skip"} + ) # dynamic columns empty_r.apply_hints(columns=lambda ev: ev["c"]) - table = empty_r.compute_table_schema({"t": "table", "p": "parent", "pk": ["a", "b"], "wd": "skip", "c": [{"name": "tags"}]}) + table = empty_r.compute_table_schema( + {"t": "table", "p": "parent", "pk": ["a", "b"], "wd": "skip", "c": [{"name": "tags"}]} + ) assert table["columns"]["tags"] == {"name": "tags"} @@ -1233,7 +1301,7 @@ def input_gen(): yield from [1, 2, 3] def tx_step(item): - return item*2 + return item * 2 input_r = DltResource.from_data(input_gen) input_r_clone = input_r.with_name("input_gen_2") @@ -1250,17 +1318,23 @@ def tx_step(item): assert list(source) == [1, 2, 3, 1, 2, 3] # cloned from fresh resource - source = DltSource(Schema("dupes"), "module", [DltResource.from_data(input_gen), DltResource.from_data(input_gen).with_name("gen_2")]) + source = DltSource( + Schema("dupes"), + "module", + [DltResource.from_data(input_gen), DltResource.from_data(input_gen).with_name("gen_2")], + ) assert list(source) == [1, 2, 3, 1, 2, 3] # clone transformer input_r = DltResource.from_data(input_gen) input_tx = DltResource.from_data(tx_step, data_from=DltResource.Empty) - source = DltSource(Schema("dupes"), "module", [input_r, (input_r | input_tx).with_name("tx_clone")]) + source = DltSource( + Schema("dupes"), "module", [input_r, (input_r | input_tx).with_name("tx_clone")] + ) pipes = source.resources.pipes assert len(pipes) == 2 assert source.resources[pipes[0].name] == source.input_gen assert source.resources[pipes[1].name] == source.tx_clone selected_pipes = source.resources.selected_pipes assert len(selected_pipes) == 2 - assert list(source) == [1, 2, 3, 2, 4, 6] \ No newline at end of file + assert list(source) == [1, 2, 3, 2, 4, 6] diff --git a/tests/extract/test_utils.py b/tests/extract/test_utils.py index ad5584bab6..0ed352b5fc 100644 --- a/tests/extract/test_utils.py +++ b/tests/extract/test_utils.py @@ -11,7 +11,7 @@ def test_column_schema_from_list() -> None: result = ensure_table_schema_columns_hint(TABLE_UPDATE) for col in TABLE_UPDATE: - assert result[col['name']] == col # type: ignore[index] + assert result[col["name"]] == col # type: ignore[index] def test_dynamic_columns_schema_from_list() -> None: @@ -23,7 +23,7 @@ def dynamic_columns(item: Dict[str, Any]) -> List[TColumnSchema]: result = result_func({}) # type: ignore[operator] for col in TABLE_UPDATE: - assert result[col['name']] == col + assert result[col["name"]] == col def test_dynamic_columns_schema_from_pydantic() -> None: @@ -38,5 +38,5 @@ def dynamic_columns(item: Dict[str, Any]) -> Type[BaseModel]: result = result_func({}) # type: ignore[operator] - assert result['a']['data_type'] == 'bigint' - assert result['b']['data_type'] == 'text' 
+ assert result["a"]["data_type"] == "bigint" + assert result["b"]["data_type"] == "text" diff --git a/tests/extract/test_validation.py b/tests/extract/test_validation.py index db39530567..45d75e0b92 100644 --- a/tests/extract/test_validation.py +++ b/tests/extract/test_validation.py @@ -60,7 +60,6 @@ def some_data() -> t.Iterator[TDataItems]: @pytest.mark.parametrize("yield_list", [True, False]) def test_remove_validator(yield_list: bool) -> None: - @dlt.resource(columns=SimpleModel) def some_data() -> t.Iterator[TDataItems]: items = [{"a": 1, "b": "2"}, {"a": 2, "b": "3"}] @@ -78,7 +77,6 @@ def some_data() -> t.Iterator[TDataItems]: @pytest.mark.parametrize("yield_list", [True, False]) def test_replace_validator_model(yield_list: bool) -> None: - @dlt.resource(columns=SimpleModel) def some_data() -> t.Iterator[TDataItems]: items = [{"a": 1, "b": "2"}, {"a": 2, "b": "3"}] @@ -99,7 +97,9 @@ class AnotherModel(BaseModel): data = list(resource) # Items are validated with the new model - assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) + assert json.dumpb(data) == json.dumpb( + [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + ) # Ensure only one validator is applied in steps steps = resource._pipe.steps @@ -112,7 +112,6 @@ class AnotherModel(BaseModel): @pytest.mark.parametrize("yield_list", [True, False]) def test_validator_property_setter(yield_list: bool) -> None: - @dlt.resource(columns=SimpleModel) def some_data() -> t.Iterator[TDataItems]: items = [{"a": 1, "b": "2"}, {"a": 2, "b": "3"}] @@ -123,7 +122,9 @@ def some_data() -> t.Iterator[TDataItems]: resource = some_data() - assert isinstance(resource.validator, PydanticValidator) and resource.validator.model.__name__.startswith(SimpleModel.__name__) + assert isinstance( + resource.validator, PydanticValidator + ) and resource.validator.model.__name__.startswith(SimpleModel.__name__) class AnotherModel(BaseModel): a: int @@ -132,11 +133,15 @@ class AnotherModel(BaseModel): resource.validator = PydanticValidator(AnotherModel, column_mode="freeze", data_mode="freeze") - assert resource.validator and resource.validator.model.__name__.startswith(AnotherModel.__name__) + assert resource.validator and resource.validator.model.__name__.startswith( + AnotherModel.__name__ + ) data = list(resource) # Items are validated with the new model - assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) + assert json.dumpb(data) == json.dumpb( + [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + ) @pytest.mark.parametrize("yield_list", [True, False]) @@ -151,7 +156,11 @@ def some_data() -> t.Iterator[TDataItems]: yield from items # some_data must have default Pydantic schema contract - assert some_data().schema_contract == {"tables": "evolve", "columns": "discard_value", "data_type": "freeze"} + assert some_data().schema_contract == { + "tables": "evolve", + "columns": "discard_value", + "data_type": "freeze", + } # extraction fails with ValidationError with pytest.raises(ResourceExtractionError) as exinfo: @@ -178,8 +187,8 @@ def some_data_extra() -> t.Iterator[TDataItems]: pipeline = dlt.pipeline() with pytest.raises(PipelineStepFailed) as py_ex: pipeline.extract(some_data_extra()) - assert isinstance(py_ex.value.__cause__, ResourceExtractionError) - assert isinstance(py_ex.value.__cause__.__cause__, DataValidationError) + assert isinstance(py_ex.value.__cause__, ResourceExtractionError) + assert 
isinstance(py_ex.value.__cause__.__cause__, DataValidationError) val_ex = py_ex.value.__cause__.__cause__ assert val_ex.table_name == "some_data_extra" assert val_ex.contract_entity == "data_type" # extra field is the cause @@ -188,7 +197,6 @@ def some_data_extra() -> t.Iterator[TDataItems]: @pytest.mark.parametrize("yield_list", [True, False]) def test_validation_with_contracts(yield_list: bool) -> None: - def some_data() -> t.Iterator[TDataItems]: # yield item that fails schema validation items = [{"a": 1, "b": "z"}, {"a": "not_int", "b": "x"}, {"c": "not_int"}] @@ -231,7 +239,11 @@ def some_data() -> t.Iterator[TDataItems]: with pytest.raises(NotImplementedError): # pydantic data_type cannot be discard_value dlt.resource(some_data(), schema_contract="discard_value", columns=SimpleModel) - r = dlt.resource(some_data(), schema_contract={"columns": "discard_value", "data_type": "evolve"}, columns=SimpleModel) + r = dlt.resource( + some_data(), + schema_contract={"columns": "discard_value", "data_type": "evolve"}, + columns=SimpleModel, + ) validator = r.validator # type: ignore[assignment] assert validator.column_mode == "discard_value" assert validator.data_mode == "evolve" diff --git a/tests/extract/utils.py b/tests/extract/utils.py index 006816b5cd..ee1eddeef5 100644 --- a/tests/extract/utils.py +++ b/tests/extract/utils.py @@ -10,9 +10,16 @@ from tests.utils import TDataItemFormat -def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_name: str, content: str) -> None: +def expect_extracted_file( + storage: ExtractorStorage, schema_name: str, table_name: str, content: str +) -> None: files = storage.list_files_to_normalize_sorted() - gen = (file for file in files if storage.get_schema_name(file) == schema_name and storage.parse_normalize_file_name(file).table_name == table_name) + gen = ( + file + for file in files + if storage.get_schema_name(file) == schema_name + and storage.parse_normalize_file_name(file).table_name == table_name + ) file = next(gen, None) if file is None: raise FileNotFoundError(storage.build_extracted_file_stem(schema_name, table_name, "***")) @@ -29,11 +36,11 @@ def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_nam class AssertItems(ItemTransform[TDataItem]): - def __init__(self, expected_items: Any, item_type: TDataItemFormat = "json") -> None: - self.expected_items = expected_items - self.item_type = item_type + def __init__(self, expected_items: Any, item_type: TDataItemFormat = "json") -> None: + self.expected_items = expected_items + self.item_type = item_type - def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: + def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: assert data_item_to_list(self.item_type, item) == self.expected_items return item diff --git a/tests/helpers/airflow_tests/conftest.py b/tests/helpers/airflow_tests/conftest.py index 023aab88c2..3d040b4a11 100644 --- a/tests/helpers/airflow_tests/conftest.py +++ b/tests/helpers/airflow_tests/conftest.py @@ -1,2 +1,2 @@ from tests.helpers.airflow_tests.utils import initialize_airflow_db -from tests.utils import preserve_environ, autouse_test_storage, TEST_STORAGE_ROOT, patch_home_dir \ No newline at end of file +from tests.utils import preserve_environ, autouse_test_storage, TEST_STORAGE_ROOT, patch_home_dir diff --git a/tests/helpers/airflow_tests/test_airflow_provider.py b/tests/helpers/airflow_tests/test_airflow_provider.py index 447006932b..68e426deb9 100644 --- 
a/tests/helpers/airflow_tests/test_airflow_provider.py +++ b/tests/helpers/airflow_tests/test_airflow_provider.py @@ -12,7 +12,7 @@ from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.configuration.providers.toml import SECRETS_TOML_KEY -DEFAULT_DATE = pendulum.datetime(2023, 4, 18, tz='Europe/Berlin') +DEFAULT_DATE = pendulum.datetime(2023, 4, 18, tz="Europe/Berlin") # Test data SECRETS_TOML_CONTENT = """ [sources] @@ -21,7 +21,6 @@ def test_airflow_secrets_toml_provider() -> None: - @dag(start_date=DEFAULT_DATE) def test_dag(): from dlt.common.configuration.providers.airflow import AirflowSecretsTomlProvider @@ -33,18 +32,17 @@ def test_dag(): @task() def test_task(): - provider = AirflowSecretsTomlProvider() - api_key, _ = provider.get_value('api_key', str, None, 'sources') + api_key, _ = provider.get_value("api_key", str, None, "sources") # There's no pytest context here in the task, so we need to return # the results as a dict and assert them in the test function. # See ti.xcom_pull() below. return { - 'name': provider.name, - 'supports_secrets': provider.supports_secrets, - 'api_key_from_provider': api_key, + "name": provider.name, + "supports_secrets": provider.supports_secrets, + "api_key_from_provider": api_key, } test_task() @@ -61,12 +59,12 @@ def test_task(): ti.run() # print(task_def.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)) - result = ti.xcom_pull(task_ids='test_task') + result = ti.xcom_pull(task_ids="test_task") assert ti.state == State.SUCCESS - assert result['name'] == 'Airflow Secrets TOML Provider' - assert result['supports_secrets'] - assert result['api_key_from_provider'] == 'test_value' + assert result["name"] == "Airflow Secrets TOML Provider" + assert result["supports_secrets"] + assert result["api_key_from_provider"] == "test_value" def test_airflow_secrets_toml_provider_import_dlt_dag() -> None: @@ -86,7 +84,7 @@ def test_dag(): @task() def test_task(): return { - 'api_key_from_provider': api_key, + "api_key_from_provider": api_key, } test_task() @@ -103,10 +101,10 @@ def test_task(): ti.run() # print(task_def.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)) - result = ti.xcom_pull(task_ids='test_task') + result = ti.xcom_pull(task_ids="test_task") assert ti.state == State.SUCCESS - assert result['api_key_from_provider'] == 'test_value' + assert result["api_key_from_provider"] == "test_value" def test_airflow_secrets_toml_provider_import_dlt_task() -> None: @@ -114,7 +112,6 @@ def test_airflow_secrets_toml_provider_import_dlt_task() -> None: @dag(start_date=DEFAULT_DATE) def test_dag(): - @task() def test_task(): Variable.set(SECRETS_TOML_KEY, SECRETS_TOML_CONTENT) @@ -125,7 +122,7 @@ def test_task(): api_key = secrets["sources.api_key"] return { - 'api_key_from_provider': api_key, + "api_key_from_provider": api_key, } test_task() @@ -142,14 +139,14 @@ def test_task(): ti.run() # print(task_def.run(start_date=DEFAULT_DATE, end_date=DEFAULT_DATE)) - result = ti.xcom_pull(task_ids='test_task') + result = ti.xcom_pull(task_ids="test_task") assert ti.state == State.SUCCESS - assert result['api_key_from_provider'] == 'test_value' + assert result["api_key_from_provider"] == "test_value" def test_airflow_secrets_toml_provider_is_loaded(): - dag = DAG(dag_id='test_dag', start_date=DEFAULT_DATE) + dag = DAG(dag_id="test_dag", start_date=DEFAULT_DATE) def test_task(): from dlt.common.configuration.providers.airflow import AirflowSecretsTomlProvider @@ -177,13 +174,11 @@ def test_task(): # the results as 
a dict and assert them in the test function. # See ti.xcom_pull() below. return { - 'airflow_secrets_toml_provider_is_loaded': astp_is_loaded, - 'api_key_from_provider': api_key, + "airflow_secrets_toml_provider_is_loaded": astp_is_loaded, + "api_key_from_provider": api_key, } - task = PythonOperator( - task_id='test_task', python_callable=test_task, dag=dag - ) + task = PythonOperator(task_id="test_task", python_callable=test_task, dag=dag) dag.create_dagrun( state=DagRunState.RUNNING, @@ -196,15 +191,15 @@ def test_task(): ti.run() - result = ti.xcom_pull(task_ids='test_task') + result = ti.xcom_pull(task_ids="test_task") assert ti.state == State.SUCCESS - assert result['airflow_secrets_toml_provider_is_loaded'] - assert result['api_key_from_provider'] == 'test_value' + assert result["airflow_secrets_toml_provider_is_loaded"] + assert result["api_key_from_provider"] == "test_value" def test_airflow_secrets_toml_provider_missing_variable(): - dag = DAG(dag_id='test_dag', start_date=DEFAULT_DATE) + dag = DAG(dag_id="test_dag", start_date=DEFAULT_DATE) def test_task(): from dlt.common.configuration.specs import config_providers_context @@ -213,14 +208,14 @@ def test_task(): # Make sure the variable is not set Variable.delete(SECRETS_TOML_KEY) providers = config_providers_context._extra_providers() - provider = next(provider for provider in providers if isinstance(provider, AirflowSecretsTomlProvider)) + provider = next( + provider for provider in providers if isinstance(provider, AirflowSecretsTomlProvider) + ) return { - 'airflow_secrets_toml': provider._toml.as_string(), + "airflow_secrets_toml": provider._toml.as_string(), } - task = PythonOperator( - task_id='test_task', python_callable=test_task, dag=dag - ) + task = PythonOperator(task_id="test_task", python_callable=test_task, dag=dag) dag.create_dagrun( state=DagRunState.RUNNING, @@ -233,20 +228,20 @@ def test_task(): ti.run() - result = ti.xcom_pull(task_ids='test_task') + result = ti.xcom_pull(task_ids="test_task") assert ti.state == State.SUCCESS - assert result['airflow_secrets_toml'] == "" + assert result["airflow_secrets_toml"] == "" def test_airflow_secrets_toml_provider_invalid_content(): - dag = DAG(dag_id='test_dag', start_date=DEFAULT_DATE) + dag = DAG(dag_id="test_dag", start_date=DEFAULT_DATE) def test_task(): import tomlkit from dlt.common.configuration.providers.airflow import AirflowSecretsTomlProvider - Variable.set(SECRETS_TOML_KEY, 'invalid_content') + Variable.set(SECRETS_TOML_KEY, "invalid_content") # There's no pytest context here in the task, so we need # to catch the exception manually and return the result @@ -258,12 +253,10 @@ def test_task(): exception_raised = True return { - 'exception_raised': exception_raised, + "exception_raised": exception_raised, } - task = PythonOperator( - task_id='test_task', python_callable=test_task, dag=dag - ) + task = PythonOperator(task_id="test_task", python_callable=test_task, dag=dag) dag.create_dagrun( state=DagRunState.RUNNING, @@ -276,7 +269,7 @@ def test_task(): ti.run() - result = ti.xcom_pull(task_ids='test_task') + result = ti.xcom_pull(task_ids="test_task") assert ti.state == State.SUCCESS - assert result['exception_raised'] + assert result["exception_raised"] diff --git a/tests/helpers/airflow_tests/test_airflow_wrapper.py b/tests/helpers/airflow_tests/test_airflow_wrapper.py index e6b622c1c4..ad6631d1fc 100644 --- a/tests/helpers/airflow_tests/test_airflow_wrapper.py +++ b/tests/helpers/airflow_tests/test_airflow_wrapper.py @@ -18,21 +18,20 @@ from tests.utils 
import TEST_STORAGE_ROOT -DEFAULT_DATE = pendulum.datetime(2023, 4, 18, tz='Europe/Berlin') +DEFAULT_DATE = pendulum.datetime(2023, 4, 18, tz="Europe/Berlin") default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 0, - 'max_active_runs': 1 + "owner": "airflow", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, + "max_active_runs": 1, } @dlt.source def mock_data_source(): - @dlt.resource(selected=True) def _r_init(): yield ["-", "x", "!"] @@ -47,18 +46,18 @@ def _t1(items, suffix): @dlt.transformer(data_from=_r1) def _t2(items, mul): - yield items*mul + yield items * mul @dlt.transformer(data_from=_r1) def _t3(items, mul): for item in items: - yield item.upper()*mul + yield item.upper() * mul # add something to init @dlt.transformer(data_from=_r_init) def _t_init_post(items): for item in items: - yield item*2 + yield item * 2 @dlt.resource def _r_isolee(): @@ -69,7 +68,6 @@ def _r_isolee(): @dlt.source(section="mock_data_source_state") def mock_data_source_state(): - @dlt.resource(selected=True) def _r_init(): dlt.current.source_state()["counter"] = 1 @@ -94,7 +92,7 @@ def _t2(items, mul): dlt.current.source_state()["counter"] += 1 dlt.current.resource_state("_r1")["counter"] += 1 dlt.current.resource_state()["counter"] = 1 - yield items*mul + yield items * mul @dlt.transformer(data_from=_r1) def _t3(items, mul): @@ -102,13 +100,13 @@ def _t3(items, mul): dlt.current.resource_state("_r1")["counter"] += 1 dlt.current.resource_state()["counter"] = 1 for item in items: - yield item.upper()*mul + yield item.upper() * mul # add something to init @dlt.transformer(data_from=_r_init) def _t_init_post(items): for item in items: - yield item*2 + yield item * 2 @dlt.resource def _r_isolee(): @@ -121,53 +119,83 @@ def _r_isolee(): def test_regular_run() -> None: # run the pipeline normally pipeline_standalone = dlt.pipeline( - pipeline_name="pipeline_standalone", dataset_name="mock_data_" + uniq_id(), destination="duckdb", credentials=":pipeline:") + pipeline_name="pipeline_standalone", + dataset_name="mock_data_" + uniq_id(), + destination="duckdb", + credentials=":pipeline:", + ) pipeline_standalone.run(mock_data_source()) - pipeline_standalone_counts = load_table_counts(pipeline_standalone, *[t["name"] for t in pipeline_standalone.default_schema.data_tables()]) + pipeline_standalone_counts = load_table_counts( + pipeline_standalone, *[t["name"] for t in pipeline_standalone.default_schema.data_tables()] + ) tasks_list: List[PythonOperator] = None - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_regular(): nonlocal tasks_list - tasks = PipelineTasksGroup("pipeline_dag_regular", local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False) + tasks = PipelineTasksGroup( + "pipeline_dag_regular", local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False + ) pipeline_dag_regular = dlt.pipeline( - pipeline_name="pipeline_dag_regular", dataset_name="mock_data_" + uniq_id(), destination="duckdb", credentials=":pipeline:") - tasks_list = tasks.add_run(pipeline_dag_regular, mock_data_source(), decompose="none", trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name="pipeline_dag_regular", + dataset_name="mock_data_" + uniq_id(), + destination="duckdb", + credentials=":pipeline:", + ) + tasks_list = tasks.add_run( + 
pipeline_dag_regular, + mock_data_source(), + decompose="none", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) dag_def: DAG = dag_regular() assert len(tasks_list) == 1 # composite task name - assert tasks_list[0].task_id == "pipeline_dag_regular.mock_data_source__r_init-_t_init_post-_t1-_t2-2-more" + assert ( + tasks_list[0].task_id + == "pipeline_dag_regular.mock_data_source__r_init-_t_init_post-_t1-_t2-2-more" + ) dag_def.test() # we should be able to attach to pipeline state created within Airflow pipeline_dag_regular = dlt.attach(pipeline_name="pipeline_dag_regular") - pipeline_dag_regular_counts = load_table_counts(pipeline_dag_regular, *[t["name"] for t in pipeline_dag_regular.default_schema.data_tables()]) + pipeline_dag_regular_counts = load_table_counts( + pipeline_dag_regular, + *[t["name"] for t in pipeline_dag_regular.default_schema.data_tables()], + ) # same data should be loaded assert pipeline_dag_regular_counts == pipeline_standalone_counts quackdb_path = os.path.join(TEST_STORAGE_ROOT, "pipeline_dag_decomposed.duckdb") - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_decomposed(): nonlocal tasks_list - tasks = PipelineTasksGroup("pipeline_dag_decomposed", local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False) + tasks = PipelineTasksGroup( + "pipeline_dag_decomposed", local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False + ) # set duckdb to be outside of pipeline folder which is dropped on each task pipeline_dag_decomposed = dlt.pipeline( - pipeline_name="pipeline_dag_decomposed", dataset_name="mock_data_" + uniq_id(), destination="duckdb", credentials=quackdb_path) - tasks_list = tasks.add_run(pipeline_dag_decomposed, mock_data_source(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name="pipeline_dag_decomposed", + dataset_name="mock_data_" + uniq_id(), + destination="duckdb", + credentials=quackdb_path, + ) + tasks_list = tasks.add_run( + pipeline_dag_decomposed, + mock_data_source(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) dag_def = dag_decomposed() assert len(tasks_list) == 3 @@ -177,7 +205,10 @@ def dag_decomposed(): assert tasks_list[2].task_id == "pipeline_dag_decomposed.mock_data_source__r_isolee" dag_def.test() pipeline_dag_decomposed = dlt.attach(pipeline_name="pipeline_dag_decomposed") - pipeline_dag_decomposed_counts = load_table_counts(pipeline_dag_decomposed, *[t["name"] for t in pipeline_dag_decomposed.default_schema.data_tables()]) + pipeline_dag_decomposed_counts = load_table_counts( + pipeline_dag_decomposed, + *[t["name"] for t in pipeline_dag_decomposed.default_schema.data_tables()], + ) assert pipeline_dag_decomposed_counts == pipeline_standalone_counts @@ -200,7 +231,6 @@ def dag_decomposed(): def test_run_with_retry() -> None: - retries = 2 now = pendulum.now() @@ -212,19 +242,22 @@ def _fail_3(): raise Exception(f"Failed on retry #{retries}") yield from "ABC" - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_fail_3(): # by default we do not retry so this will fail - tasks = PipelineTasksGroup("pipeline_fail_3", local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False) + tasks = PipelineTasksGroup( + "pipeline_fail_3", 
local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False + ) pipeline_fail_3 = dlt.pipeline( - pipeline_name="pipeline_fail_3", dataset_name="mock_data_" + uniq_id(), destination="duckdb", credentials=":pipeline:") - tasks.add_run(pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name="pipeline_fail_3", + dataset_name="mock_data_" + uniq_id(), + destination="duckdb", + credentials=":pipeline:", + ) + tasks.add_run( + pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True + ) dag_def: DAG = dag_fail_3() ti = get_task_run(dag_def, "pipeline_fail_3.pipeline_fail_3", now) @@ -233,19 +266,25 @@ def dag_fail_3(): ti._run_raw_task() assert pip_ex.value.step == "extract" - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_fail_4(): # by default we do not retry extract so we fail - tasks = PipelineTasksGroup("pipeline_fail_3", retry_policy=DEFAULT_RETRY_BACKOFF, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False) + tasks = PipelineTasksGroup( + "pipeline_fail_3", + retry_policy=DEFAULT_RETRY_BACKOFF, + local_data_folder=TEST_STORAGE_ROOT, + wipe_local_data=False, + ) pipeline_fail_3 = dlt.pipeline( - pipeline_name="pipeline_fail_3", dataset_name="mock_data_" + uniq_id(), destination="duckdb", credentials=":pipeline:") - tasks.add_run(pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name="pipeline_fail_3", + dataset_name="mock_data_" + uniq_id(), + destination="duckdb", + credentials=":pipeline:", + ) + tasks.add_run( + pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True + ) dag_def = dag_fail_4() ti = get_task_run(dag_def, "pipeline_fail_3.pipeline_fail_3", now) @@ -255,19 +294,26 @@ def dag_fail_4(): ti._run_raw_task() assert pip_ex.value.step == "extract" - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_fail_5(): # this will retry - tasks = PipelineTasksGroup("pipeline_fail_3", retry_policy=DEFAULT_RETRY_BACKOFF, retry_pipeline_steps=("load", "extract"), local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=False) + tasks = PipelineTasksGroup( + "pipeline_fail_3", + retry_policy=DEFAULT_RETRY_BACKOFF, + retry_pipeline_steps=("load", "extract"), + local_data_folder=TEST_STORAGE_ROOT, + wipe_local_data=False, + ) pipeline_fail_3 = dlt.pipeline( - pipeline_name="pipeline_fail_3", dataset_name="mock_data_" + uniq_id(), destination="duckdb", credentials=":pipeline:") - tasks.add_run(pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name="pipeline_fail_3", + dataset_name="mock_data_" + uniq_id(), + destination="duckdb", + credentials=":pipeline:", + ) + tasks.add_run( + pipeline_fail_3, _fail_3, trigger_rule="all_done", retries=0, provide_context=True + ) dag_def = dag_fail_5() ti = get_task_run(dag_def, "pipeline_fail_3.pipeline_fail_3", now) @@ -277,22 +323,30 @@ def dag_fail_5(): def test_run_decomposed_with_state_wipe() -> None: - dataset_name = "mock_data_" + uniq_id() pipeline_name = "pipeline_dag_regular_" + uniq_id() - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) 
def dag_regular(): - tasks = PipelineTasksGroup(pipeline_name, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=True, save_load_info=True, save_trace_info=True) + tasks = PipelineTasksGroup( + pipeline_name, + local_data_folder=TEST_STORAGE_ROOT, + wipe_local_data=True, + save_load_info=True, + save_trace_info=True, + ) pipeline_dag_regular = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") - tasks.add_run(pipeline_dag_regular, mock_data_source_state(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) + tasks.add_run( + pipeline_dag_regular, + mock_data_source_state(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) dag_def: DAG = dag_regular() dag_def.test() @@ -302,7 +356,8 @@ def dag_regular(): dlt.attach(pipeline_name=pipeline_name) pipeline_dag_regular = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") + pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) pipeline_dag_regular.sync_destination() # print(pipeline_dag_regular.state) # now source can attach to state in the pipeline @@ -311,9 +366,9 @@ def dag_regular(): # end state was increased twice (in init and in isolee at the end) assert post_source.state["end_counter"] == 2 # the source counter was increased in init, _r1 and in 3 transformers * 3 items - assert post_source.state["counter"] == 1 + 1 + 3*3 + assert post_source.state["counter"] == 1 + 1 + 3 * 3 # resource counter _r1 - assert post_source._r1.state["counter"] == 1 + 3*3 + assert post_source._r1.state["counter"] == 1 + 3 * 3 # each transformer has a counter assert post_source._t1.state["counter"] == 1 assert post_source._t2.state["counter"] == 1 @@ -324,68 +379,114 @@ def test_run_multiple_sources() -> None: dataset_name = "mock_data_" + uniq_id() pipeline_name = "pipeline_dag_regular_" + uniq_id() - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_serialize(): - tasks = PipelineTasksGroup(pipeline_name, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=True) + tasks = PipelineTasksGroup( + pipeline_name, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=True + ) pipeline_dag_regular = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") - st_tasks = tasks.add_run(pipeline_dag_regular, mock_data_source_state(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) - nst_tasks = tasks.add_run(pipeline_dag_regular, mock_data_source(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) + st_tasks = tasks.add_run( + pipeline_dag_regular, + mock_data_source_state(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) + nst_tasks = tasks.add_run( + pipeline_dag_regular, + mock_data_source(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) # connect end of first run to a head of a second st_tasks[-1] >> nst_tasks[0] - dag_def: DAG = dag_serialize() dag_def.test() pipeline_dag_serial = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") + 
pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) pipeline_dag_serial.sync_destination() # we should have two schemas - assert set(pipeline_dag_serial.schema_names) == {'mock_data_source_state', 'mock_data_source'} - counters_st_tasks = load_table_counts(pipeline_dag_serial, *[t["name"] for t in pipeline_dag_serial.schemas['mock_data_source_state'].data_tables()]) - counters_nst_tasks = load_table_counts(pipeline_dag_serial, *[t["name"] for t in pipeline_dag_serial.schemas['mock_data_source'].data_tables()]) + assert set(pipeline_dag_serial.schema_names) == {"mock_data_source_state", "mock_data_source"} + counters_st_tasks = load_table_counts( + pipeline_dag_serial, + *[t["name"] for t in pipeline_dag_serial.schemas["mock_data_source_state"].data_tables()], + ) + counters_nst_tasks = load_table_counts( + pipeline_dag_serial, + *[t["name"] for t in pipeline_dag_serial.schemas["mock_data_source"].data_tables()], + ) # print(counters_st_tasks) # print(counters_nst_tasks) # this state is confirmed in other test - assert pipeline_dag_serial.state["sources"]["mock_data_source_state"] == {'counter': 11, 'end_counter': 2, 'resources': {'_r1': {'counter': 10}, '_t3': {'counter': 1}, '_t2': {'counter': 1}, '_t1': {'counter': 1}}} + assert pipeline_dag_serial.state["sources"]["mock_data_source_state"] == { + "counter": 11, + "end_counter": 2, + "resources": { + "_r1": {"counter": 10}, + "_t3": {"counter": 1}, + "_t2": {"counter": 1}, + "_t1": {"counter": 1}, + }, + } # next DAG does not connect subgraphs dataset_name = "mock_data_" + uniq_id() pipeline_name = "pipeline_dag_regular_" + uniq_id() - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_parallel(): - tasks = PipelineTasksGroup(pipeline_name, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=True) + tasks = PipelineTasksGroup( + pipeline_name, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=True + ) pipeline_dag_regular = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") - tasks.add_run(pipeline_dag_regular, mock_data_source_state(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) - tasks.add_run(pipeline_dag_regular, mock_data_source(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) + tasks.add_run( + pipeline_dag_regular, + mock_data_source_state(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) + tasks.add_run( + pipeline_dag_regular, + mock_data_source(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) # do not connect graph dag_def = dag_parallel() dag_def.test() pipeline_dag_parallel = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") + pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) pipeline_dag_parallel.sync_destination() # we should have two schemas - assert set(pipeline_dag_parallel.schema_names) == {'mock_data_source_state', 'mock_data_source'} - counters_st_tasks_par = load_table_counts(pipeline_dag_parallel, *[t["name"] for t in pipeline_dag_parallel.schemas['mock_data_source_state'].data_tables()]) - counters_nst_tasks_par = load_table_counts(pipeline_dag_parallel, *[t["name"] for t in 
pipeline_dag_parallel.schemas['mock_data_source'].data_tables()]) + assert set(pipeline_dag_parallel.schema_names) == {"mock_data_source_state", "mock_data_source"} + counters_st_tasks_par = load_table_counts( + pipeline_dag_parallel, + *[t["name"] for t in pipeline_dag_parallel.schemas["mock_data_source_state"].data_tables()], + ) + counters_nst_tasks_par = load_table_counts( + pipeline_dag_parallel, + *[t["name"] for t in pipeline_dag_parallel.schemas["mock_data_source"].data_tables()], + ) assert counters_st_tasks == counters_st_tasks_par assert counters_nst_tasks == counters_nst_tasks_par assert pipeline_dag_serial.state["sources"] == pipeline_dag_parallel.state["sources"] @@ -395,19 +496,31 @@ def dag_parallel(): dataset_name = "mock_data_" + uniq_id() pipeline_name = "pipeline_dag_regular_" + uniq_id() - @dag( - schedule=None, - start_date=DEFAULT_DATE, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, start_date=DEFAULT_DATE, catchup=False, default_args=default_args) def dag_mixed(): - tasks = PipelineTasksGroup(pipeline_name, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=True) + tasks = PipelineTasksGroup( + pipeline_name, local_data_folder=TEST_STORAGE_ROOT, wipe_local_data=True + ) pipeline_dag_regular = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") - pd_tasks = tasks.add_run(pipeline_dag_regular, mock_data_source_state(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) - hb_tasks = tasks.add_run(pipeline_dag_regular, mock_data_source(), decompose="serialize", trigger_rule="all_done", retries=0, provide_context=True) + pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) + pd_tasks = tasks.add_run( + pipeline_dag_regular, + mock_data_source_state(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) + hb_tasks = tasks.add_run( + pipeline_dag_regular, + mock_data_source(), + decompose="serialize", + trigger_rule="all_done", + retries=0, + provide_context=True, + ) # create almost randomly connected tasks across two runs for pd_t, hb_t in zip(pd_tasks, hb_tasks): pd_t >> hb_t @@ -416,12 +529,19 @@ def dag_mixed(): dag_def.test() pipeline_dag_mixed = dlt.pipeline( - pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb") + pipeline_name=pipeline_name, dataset_name=dataset_name, destination="duckdb" + ) pipeline_dag_mixed.sync_destination() # we should have two schemas - assert set(pipeline_dag_mixed.schema_names) == {'mock_data_source_state', 'mock_data_source'} - counters_st_tasks_par = load_table_counts(pipeline_dag_mixed, *[t["name"] for t in pipeline_dag_mixed.schemas['mock_data_source_state'].data_tables()]) - counters_nst_tasks_par = load_table_counts(pipeline_dag_mixed, *[t["name"] for t in pipeline_dag_mixed.schemas['mock_data_source'].data_tables()]) + assert set(pipeline_dag_mixed.schema_names) == {"mock_data_source_state", "mock_data_source"} + counters_st_tasks_par = load_table_counts( + pipeline_dag_mixed, + *[t["name"] for t in pipeline_dag_mixed.schemas["mock_data_source_state"].data_tables()], + ) + counters_nst_tasks_par = load_table_counts( + pipeline_dag_mixed, + *[t["name"] for t in pipeline_dag_mixed.schemas["mock_data_source"].data_tables()], + ) assert counters_st_tasks == counters_st_tasks_par assert counters_nst_tasks == counters_nst_tasks_par assert pipeline_dag_serial.state["sources"] == pipeline_dag_mixed.state["sources"] @@ -434,7 +554,7 @@ def 
get_task_run(dag_def: DAG, task_name: str, now: pendulum.DateTime) -> TaskIn state=DagRunState.RUNNING, execution_date=now, run_type=DagRunType.MANUAL, - data_interval=(now, now) + data_interval=(now, now), ) dag_def.run(start_date=now, run_at_least_once=True) task_def = dag_def.task_dict[task_name] diff --git a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py index e65c11967e..8c1992c506 100644 --- a/tests/helpers/airflow_tests/test_join_airflow_scheduler.py +++ b/tests/helpers/airflow_tests/test_join_airflow_scheduler.py @@ -18,27 +18,32 @@ CATCHUP_BEGIN = pendulum.datetime(2023, 1, 1, tz="Europe/Berlin") default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'email_on_failure': False, - 'email_on_retry': False, - 'retries': 0, + "owner": "airflow", + "depends_on_past": False, + "email_on_failure": False, + "email_on_retry": False, + "retries": 0, } + @dlt.resource() -def existing_incremental(updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): +def existing_incremental( + updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sources.incremental( + "updated_at", allow_external_schedulers=True + ) +): yield {"updated_at": CATCHUP_BEGIN, "state": updated_at.get_state()} def test_date_coercion() -> None: - @dag(schedule_interval='@daily', + @dag( + schedule_interval="@daily", start_date=CATCHUP_BEGIN, catchup=False, max_active_runs=1, - default_args=default_args + default_args=default_args, ) def dag_regular(): - @task def scheduled() -> None: context = get_current_context() @@ -50,49 +55,78 @@ def scheduled() -> None: assert state["updated_at"] == CATCHUP_BEGIN assert "Europe/Berlin" in str(state["updated_at"].tz) # must have UTC timezone - assert state["state"]["initial_value"] == CATCHUP_BEGIN == context["data_interval_start"] + assert ( + state["state"]["initial_value"] == CATCHUP_BEGIN == context["data_interval_start"] + ) assert state["state"]["initial_value"].tz == UTC assert state["state"]["last_value"] == CATCHUP_BEGIN == context["data_interval_start"] assert state["state"]["last_value"].tz == UTC # end date assert r.incremental._incremental.end_value == context["data_interval_end"] assert r.incremental._incremental.end_value.tz == UTC - assert (r.incremental._incremental.end_value - state["state"]["initial_value"]) == datetime.timedelta(hours=24) + assert ( + r.incremental._incremental.end_value - state["state"]["initial_value"] + ) == datetime.timedelta(hours=24) # datetime.datetime coercion must be pendulum anyway @dlt.resource() - def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime]("updated_at", allow_external_schedulers=True)): + def incremental_datetime( + updated_at=dlt.sources.incremental[datetime.datetime]( + "updated_at", allow_external_schedulers=True + ) + ): yield {"updated_at": CATCHUP_BEGIN, "state": updated_at.get_state()} r = incremental_datetime() state = list(r)[0] # must have UTC timezone - assert state["state"]["initial_value"] == CATCHUP_BEGIN == context["data_interval_start"] + assert ( + state["state"]["initial_value"] == CATCHUP_BEGIN == context["data_interval_start"] + ) assert state["state"]["initial_value"].tz == UTC # datetime.date coercion also works @dlt.resource() # type: ignore[no-redef] - def incremental_datetime(updated_at = dlt.sources.incremental[datetime.date]("updated_at", allow_external_schedulers=True)): - yield {"updated_at": 
ensure_pendulum_date(CATCHUP_BEGIN), "state": updated_at.get_state()} + def incremental_datetime( + updated_at=dlt.sources.incremental[datetime.date]( + "updated_at", allow_external_schedulers=True + ) + ): + yield { + "updated_at": ensure_pendulum_date(CATCHUP_BEGIN), + "state": updated_at.get_state(), + } r = incremental_datetime() state = list(r)[0] - assert state["state"]["initial_value"] == ensure_pendulum_date(context["data_interval_start"]) + assert state["state"]["initial_value"] == ensure_pendulum_date( + context["data_interval_start"] + ) assert isinstance(state["state"]["initial_value"], datetime.date) # coerce to int @dlt.resource() # type: ignore[no-redef] - def incremental_datetime(updated_at = dlt.sources.incremental[int]("updated_at", allow_external_schedulers=True)): + def incremental_datetime( + updated_at=dlt.sources.incremental[int]( + "updated_at", allow_external_schedulers=True + ) + ): yield {"updated_at": CATCHUP_BEGIN.int_timestamp, "state": updated_at.get_state()} r = incremental_datetime() state = list(r)[0] assert state["state"]["initial_value"] == context["data_interval_start"].int_timestamp - assert r.incremental._incremental.end_value == context["data_interval_end"].int_timestamp + assert ( + r.incremental._incremental.end_value == context["data_interval_end"].int_timestamp + ) # coerce to float @dlt.resource() # type: ignore[no-redef] - def incremental_datetime(updated_at = dlt.sources.incremental[float]("updated_at", allow_external_schedulers=True)): + def incremental_datetime( + updated_at=dlt.sources.incremental[float]( + "updated_at", allow_external_schedulers=True + ) + ): yield {"updated_at": CATCHUP_BEGIN.timestamp(), "state": updated_at.get_state()} r = incremental_datetime() @@ -102,14 +136,27 @@ def incremental_datetime(updated_at = dlt.sources.incremental[float]("updated_at # coerce to str @dlt.resource() # type: ignore[no-redef] - def incremental_datetime(updated_at = dlt.sources.incremental[str]("updated_at", allow_external_schedulers=True)): - yield {"updated_at": CATCHUP_BEGIN.in_tz("UTC").isoformat(), "state": updated_at.get_state()} + def incremental_datetime( + updated_at=dlt.sources.incremental[str]( + "updated_at", allow_external_schedulers=True + ) + ): + yield { + "updated_at": CATCHUP_BEGIN.in_tz("UTC").isoformat(), + "state": updated_at.get_state(), + } r = incremental_datetime() state = list(r)[0] # must have UTC timezone - assert state["state"]["initial_value"] == context["data_interval_start"].in_tz("UTC").isoformat() - assert r.incremental._incremental.end_value == context["data_interval_end"].in_tz("UTC").isoformat() + assert ( + state["state"]["initial_value"] + == context["data_interval_start"].in_tz("UTC").isoformat() + ) + assert ( + r.incremental._incremental.end_value + == context["data_interval_end"].in_tz("UTC").isoformat() + ) scheduled() @@ -122,11 +169,12 @@ def incremental_datetime(updated_at = dlt.sources.incremental[str]("updated_at", def test_no_next_execution_date() -> None: now = pendulum.now() - @dag(schedule=None, + @dag( + schedule=None, catchup=False, start_date=CATCHUP_BEGIN, default_args=default_args, - max_active_runs=1 + max_active_runs=1, ) def dag_no_schedule(): @task @@ -134,8 +182,15 @@ def unscheduled(): context = get_current_context() @dlt.resource() - def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime]("updated_at", allow_external_schedulers=True)): - yield {"updated_at": context["data_interval_start"], "state": updated_at.get_state()} + def incremental_datetime( + 
updated_at=dlt.sources.incremental[datetime.datetime]( + "updated_at", allow_external_schedulers=True + ) + ): + yield { + "updated_at": context["data_interval_start"], + "state": updated_at.get_state(), + } r = incremental_datetime() state = list(r)[0] @@ -151,8 +206,15 @@ def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime] # will be filtered out (now earlier than data_interval_start) @dlt.resource() # type: ignore[no-redef] - def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime]("updated_at", allow_external_schedulers=True)): - yield {"updated_at": now.subtract(hours=1, seconds=1), "state": updated_at.get_state()} + def incremental_datetime( + updated_at=dlt.sources.incremental[datetime.datetime]( + "updated_at", allow_external_schedulers=True + ) + ): + yield { + "updated_at": now.subtract(hours=1, seconds=1), + "state": updated_at.get_state(), + } r = incremental_datetime() assert len(list(r)) == 0 @@ -172,18 +234,27 @@ def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime] ti.run() assert ti.state == State.SUCCESS - @dag(schedule_interval='@daily', + @dag( + schedule_interval="@daily", start_date=CATCHUP_BEGIN, catchup=True, - default_args=default_args + default_args=default_args, ) def dag_daily_schedule(): @task def scheduled(): context = get_current_context() + @dlt.resource() - def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime]("updated_at", allow_external_schedulers=True)): - yield {"updated_at": context["data_interval_start"], "state": updated_at.get_state()} + def incremental_datetime( + updated_at=dlt.sources.incremental[datetime.datetime]( + "updated_at", allow_external_schedulers=True + ) + ): + yield { + "updated_at": context["data_interval_start"], + "state": updated_at.get_state(), + } r = incremental_datetime() state = list(r)[0] @@ -208,7 +279,7 @@ def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime] state=DagRunState.RUNNING, execution_date=now, run_type=DagRunType.MANUAL, - data_interval=(now, now) + data_interval=(now, now), ) dag_def.run(start_date=now, run_at_least_once=True) task_def = dag_def.task_dict["scheduled"] @@ -219,16 +290,20 @@ def incremental_datetime(updated_at = dlt.sources.incremental[datetime.datetime] def test_scheduler_pipeline_state() -> None: pipeline = dlt.pipeline( - pipeline_name="pipeline_dag_regular", dataset_name="mock_data_" + uniq_id(), destination="duckdb", credentials=":pipeline:") + pipeline_name="pipeline_dag_regular", + dataset_name="mock_data_" + uniq_id(), + destination="duckdb", + credentials=":pipeline:", + ) now = pendulum.now() - @dag(schedule_interval='@daily', + @dag( + schedule_interval="@daily", start_date=CATCHUP_BEGIN, catchup=False, - default_args=default_args + default_args=default_args, ) def dag_regular(): - @task def scheduled() -> None: r = existing_incremental() @@ -252,7 +327,7 @@ def scheduled() -> None: state=DagRunState.RUNNING, execution_date=now, run_type=DagRunType.MANUAL, - data_interval=(now, now) + data_interval=(now, now), ) dag_def.run(start_date=now, run_at_least_once=True) task_def = dag_def.task_dict["scheduled"] @@ -261,20 +336,13 @@ def scheduled() -> None: assert ti.state == State.SUCCESS assert "sources" not in pipeline.state - pipeline = pipeline.drop() dag_def.test(execution_date=CATCHUP_BEGIN) assert "sources" not in pipeline.state - @dag( - schedule=None, - start_date=CATCHUP_BEGIN, - catchup=False, - default_args=default_args - ) + @dag(schedule=None, 
start_date=CATCHUP_BEGIN, catchup=False, default_args=default_args) def dag_no_schedule(): - @task def unscheduled() -> None: r = existing_incremental() diff --git a/tests/helpers/airflow_tests/utils.py b/tests/helpers/airflow_tests/utils.py index 4fe7472eb0..50aab77505 100644 --- a/tests/helpers/airflow_tests/utils.py +++ b/tests/helpers/airflow_tests/utils.py @@ -11,7 +11,7 @@ from dlt.common.configuration.providers.toml import SECRETS_TOML_KEY -@pytest.fixture(scope='function', autouse=True) +@pytest.fixture(scope="function", autouse=True) def initialize_airflow_db(): setup_airflow() # backup context providers @@ -30,14 +30,14 @@ def initialize_airflow_db(): def setup_airflow() -> None: # Disable loading examples try: - conf.add_section('core') + conf.add_section("core") except DuplicateSectionError: pass - conf.set('core', 'load_examples', 'False') + conf.set("core", "load_examples", "False") # Prepare the arguments for the initdb function args = argparse.Namespace() # becomes database/sql_alchemy_conn in apache 2.7.0 - args.backend = conf.get(section='core', key='sql_alchemy_conn') + args.backend = conf.get(section="core", key="sql_alchemy_conn") # Run Airflow resetdb before running any tests args.yes = True diff --git a/tests/helpers/dbt_cloud_tests/test_dbt_cloud.py b/tests/helpers/dbt_cloud_tests/test_dbt_cloud.py index 92d0f722b9..600a11558b 100644 --- a/tests/helpers/dbt_cloud_tests/test_dbt_cloud.py +++ b/tests/helpers/dbt_cloud_tests/test_dbt_cloud.py @@ -2,7 +2,7 @@ from dlt.helpers.dbt_cloud import run_dbt_cloud_job, get_dbt_cloud_run_status -@pytest.mark.parametrize('wait_outcome', [False, True]) +@pytest.mark.parametrize("wait_outcome", [False, True]) def test_trigger_run(wait_outcome): # Trigger job run and wait for an outcome run_status = run_dbt_cloud_job(wait_for_outcome=wait_outcome) @@ -12,7 +12,7 @@ def test_trigger_run(wait_outcome): assert not run_status.get("is_error") -@pytest.mark.parametrize('wait_outcome', [False, True]) +@pytest.mark.parametrize("wait_outcome", [False, True]) def test_run_status(wait_outcome): # Trigger job run and wait for an outcome run_status = run_dbt_cloud_job(wait_for_outcome=False) diff --git a/tests/helpers/dbt_tests/local/test_dbt_utils.py b/tests/helpers/dbt_tests/local/test_dbt_utils.py index 133ecf1617..6c2d28ed23 100644 --- a/tests/helpers/dbt_tests/local/test_dbt_utils.py +++ b/tests/helpers/dbt_tests/local/test_dbt_utils.py @@ -8,7 +8,12 @@ from dlt.common.utils import uniq_id from dlt.destinations.impl.postgres.configuration import PostgresCredentials -from dlt.helpers.dbt.dbt_utils import DBTProcessingError, initialize_dbt_logging, run_dbt_command, is_incremental_schema_out_of_sync_error +from dlt.helpers.dbt.dbt_utils import ( + DBTProcessingError, + initialize_dbt_logging, + run_dbt_command, + is_incremental_schema_out_of_sync_error, +) from tests.utils import test_storage, preserve_environ from tests.helpers.dbt_tests.utils import clone_jaffle_repo, load_test_case @@ -16,8 +21,18 @@ def test_is_incremental_schema_out_of_sync_error() -> None: # in case of --fail-fast detect on a single run result - assert is_incremental_schema_out_of_sync_error(decode_obj(load_test_case("run_result_incremental_fail.pickle.hex"))) is True - assert is_incremental_schema_out_of_sync_error(decode_obj(load_test_case("run_execution_incremental_fail.pickle.hex"))) is True + assert ( + is_incremental_schema_out_of_sync_error( + decode_obj(load_test_case("run_result_incremental_fail.pickle.hex")) + ) + is True + ) + assert ( + 
is_incremental_schema_out_of_sync_error( + decode_obj(load_test_case("run_execution_incremental_fail.pickle.hex")) + ) + is True + ) assert is_incremental_schema_out_of_sync_error("AAA") is False @@ -27,24 +42,36 @@ def test_dbt_commands(test_storage: FileStorage) -> None: dbt_vars = {"dbt_schema": schema_name} # extract postgres creds from env, parse and emit - credentials = resolve_configuration(PostgresCredentials(), sections=("destination", "postgres")) + credentials = resolve_configuration(PostgresCredentials(), sections=("destination", "postgres")) add_config_to_env(credentials, ("dlt",)) repo_path = clone_jaffle_repo(test_storage) # copy profile - shutil.copy("./tests/helpers/dbt_tests/cases/profiles_invalid_credentials.yml", os.path.join(repo_path, "profiles.yml")) + shutil.copy( + "./tests/helpers/dbt_tests/cases/profiles_invalid_credentials.yml", + os.path.join(repo_path, "profiles.yml"), + ) # initialize logging global_args = initialize_dbt_logging("ERROR", False) # run deps, results are None assert run_dbt_command(repo_path, "deps", ".", global_args=global_args) is None # run list, results are list of strings - results = run_dbt_command(repo_path, "list", ".", global_args=global_args, package_vars=dbt_vars) + results = run_dbt_command( + repo_path, "list", ".", global_args=global_args, package_vars=dbt_vars + ) assert isinstance(results, list) assert len(results) == 28 assert "jaffle_shop.not_null_orders_amount" in results # run list for specific selector - results = run_dbt_command(repo_path, "list", ".", global_args=global_args, command_args=["-s", "jaffle_shop.not_null_orders_amount"], package_vars=dbt_vars) + results = run_dbt_command( + repo_path, + "list", + ".", + global_args=global_args, + command_args=["-s", "jaffle_shop.not_null_orders_amount"], + package_vars=dbt_vars, + ) assert len(results) == 1 assert results[0] == "jaffle_shop.not_null_orders_amount" # run debug, that will fail @@ -61,26 +88,46 @@ def test_dbt_commands(test_storage: FileStorage) -> None: # same for run with pytest.raises(DBTProcessingError) as dbt_err: - run_dbt_command(repo_path, "run", ".", global_args=global_args, package_vars=dbt_vars, command_args=["--fail-fast", "--full-refresh"]) + run_dbt_command( + repo_path, + "run", + ".", + global_args=global_args, + package_vars=dbt_vars, + command_args=["--fail-fast", "--full-refresh"], + ) # in that case test results are bool, not list of tests runs assert dbt_err.value.command == "run" # copy a correct profile - shutil.copy("./tests/helpers/dbt_tests/cases/profiles.yml", os.path.join(repo_path, "profiles.yml")) + shutil.copy( + "./tests/helpers/dbt_tests/cases/profiles.yml", os.path.join(repo_path, "profiles.yml") + ) - results = run_dbt_command(repo_path, "seed", ".", global_args=global_args, package_vars=dbt_vars) + results = run_dbt_command( + repo_path, "seed", ".", global_args=global_args, package_vars=dbt_vars + ) assert isinstance(results, list) assert len(results) == 3 assert results[0].model_name == "raw_customers" assert results[0].status == "success" - results = run_dbt_command(repo_path, "run", ".", global_args=global_args, package_vars=dbt_vars, command_args=["--fail-fast", "--full-refresh"]) + results = run_dbt_command( + repo_path, + "run", + ".", + global_args=global_args, + package_vars=dbt_vars, + command_args=["--fail-fast", "--full-refresh"], + ) assert isinstance(results, list) assert len(results) == 5 assert results[-1].model_name == "orders" assert results[-1].status == "success" - results = run_dbt_command(repo_path, 
"test", ".", global_args=global_args, package_vars=dbt_vars) + results = run_dbt_command( + repo_path, "test", ".", global_args=global_args, package_vars=dbt_vars + ) assert isinstance(results, list) assert len(results) == 20 assert results[-1].status == "pass" diff --git a/tests/helpers/dbt_tests/local/test_runner_destinations.py b/tests/helpers/dbt_tests/local/test_runner_destinations.py index 547fdb991c..c9e4b7c83b 100644 --- a/tests/helpers/dbt_tests/local/test_runner_destinations.py +++ b/tests/helpers/dbt_tests/local/test_runner_destinations.py @@ -11,10 +11,16 @@ from tests.utils import TEST_STORAGE_ROOT, clean_test_storage, preserve_environ from tests.common.utils import modify_and_commit_file, load_secret -from tests.helpers.dbt_tests.local.utils import setup_rasa_runner_client, setup_rasa_runner, DBTDestinationInfo +from tests.helpers.dbt_tests.local.utils import ( + setup_rasa_runner_client, + setup_rasa_runner, + DBTDestinationInfo, +) DESTINATION_DATASET_NAME = "test_" + uniq_id() -ALL_DBT_DESTINATIONS = [DBTDestinationInfo("bigquery", "CREATE TABLE", "MERGE")] # DBTDestinationInfo("redshift", "SELECT", "INSERT") +ALL_DBT_DESTINATIONS = [ + DBTDestinationInfo("bigquery", "CREATE TABLE", "MERGE") +] # DBTDestinationInfo("redshift", "SELECT", "INSERT") ALL_DBT_DESTINATIONS_NAMES = ["bigquery"] # "redshift", @@ -27,29 +33,36 @@ def destination_info(request: Any) -> Iterator[DBTDestinationInfo]: def test_setup_dbt_runner() -> None: - runner = setup_rasa_runner("redshift", "carbon_bot_3", override_values={ - "package_additional_vars": {"add_var_name": "add_var_value"}, - "runtime": { - "log_format": "JSON", - "log_level": "INFO" - } - }) + runner = setup_rasa_runner( + "redshift", + "carbon_bot_3", + override_values={ + "package_additional_vars": {"add_var_name": "add_var_value"}, + "runtime": {"log_format": "JSON", "log_level": "INFO"}, + }, + ) assert runner.package_path.endswith("rasa_semantic_schema") assert runner.config.package_profile_name == "redshift" assert runner.config.package_additional_vars == {"add_var_name": "add_var_value"} - assert runner._get_package_vars() == {"source_dataset_name": "carbon_bot_3", "add_var_name": "add_var_value"} + assert runner._get_package_vars() == { + "source_dataset_name": "carbon_bot_3", + "add_var_name": "add_var_value", + } assert runner.source_dataset_name == "carbon_bot_3" assert runner.cloned_package_name == "rasa_semantic_schema" assert runner.working_dir == TEST_STORAGE_ROOT def test_initialize_package_wrong_key() -> None: - runner = setup_rasa_runner("redshift", override_values={ - # private repo - "package_location": "git@github.com:dlt-hub/rasa_bot_experiments.git", - "package_repository_branch": None, - "package_repository_ssh_key": load_secret("DEPLOY_KEY") - }) + runner = setup_rasa_runner( + "redshift", + override_values={ + # private repo + "package_location": "git@github.com:dlt-hub/rasa_bot_experiments.git", + "package_repository_branch": None, + "package_repository_ssh_key": load_secret("DEPLOY_KEY"), + }, + ) with pytest.raises(GitCommandError) as gce: runner.run_all() @@ -60,12 +73,17 @@ def test_reinitialize_package() -> None: runner = setup_rasa_runner("redshift") runner.ensure_newest_package() # mod the package - readme_path, _ = modify_and_commit_file(runner.package_path, "README.md", content=runner.config.package_profiles_dir) + readme_path, _ = modify_and_commit_file( + runner.package_path, "README.md", content=runner.config.package_profiles_dir + ) assert os.path.isfile(readme_path) # this will wipe out old 
package and clone again runner.ensure_newest_package() # we have old file back - assert runner.repo_storage.load(f"{runner.cloned_package_name}/README.md") != runner.config.package_profiles_dir + assert ( + runner.repo_storage.load(f"{runner.cloned_package_name}/README.md") + != runner.config.package_profiles_dir + ) def test_dbt_test_no_raw_schema(destination_info: DBTDestinationInfo) -> None: @@ -76,7 +94,7 @@ def test_dbt_test_no_raw_schema(destination_info: DBTDestinationInfo) -> None: runner.run_all( destination_dataset_name=DESTINATION_DATASET_NAME, run_params=["--fail-fast", "--full-refresh"], - source_tests_selector="tag:prerequisites" + source_tests_selector="tag:prerequisites", ) assert isinstance(prq_ex.value.args[0], DBTProcessingError) @@ -89,16 +107,21 @@ def test_dbt_run_full_refresh(destination_info: DBTDestinationInfo) -> None: destination_dataset_name=DESTINATION_DATASET_NAME, run_params=["--fail-fast", "--full-refresh"], additional_vars={"user_id": "metadata__user_id"}, - source_tests_selector="tag:prerequisites" + source_tests_selector="tag:prerequisites", ) assert all(r.message.startswith(destination_info.replace_strategy) for r in run_results) is True assert find_run_result(run_results, "_loads") is not None # all models must be SELECT as we do full refresh - assert find_run_result(run_results, "_loads").message.startswith(destination_info.replace_strategy) + assert find_run_result(run_results, "_loads").message.startswith( + destination_info.replace_strategy + ) assert all(m.message.startswith(destination_info.replace_strategy) for m in run_results) is True # all tests should pass - runner.test(destination_dataset_name=DESTINATION_DATASET_NAME, additional_vars={"user_id": "metadata__user_id"}) + runner.test( + destination_dataset_name=DESTINATION_DATASET_NAME, + additional_vars={"user_id": "metadata__user_id"}, + ) def test_dbt_run_error_via_additional_vars(destination_info: DBTDestinationInfo) -> None: @@ -110,8 +133,11 @@ def test_dbt_run_error_via_additional_vars(destination_info: DBTDestinationInfo) runner.run_all( destination_dataset_name=DESTINATION_DATASET_NAME, run_params=["--fail-fast", "--full-refresh"], - additional_vars={"user_id": "metadata__user_id", "external_session_id": "metadata__sess_id"}, - source_tests_selector="tag:prerequisites" + additional_vars={ + "user_id": "metadata__user_id", + "external_session_id": "metadata__sess_id", + }, + source_tests_selector="tag:prerequisites", ) stg_interactions = find_run_result(dbt_err.value.run_results, "stg_interactions") assert "metadata__sess_id" in stg_interactions.message @@ -127,7 +153,7 @@ def test_dbt_incremental_schema_out_of_sync_error(destination_info: DBTDestinati run_params=["--fail-fast", "--model", "+interactions"], # remove all counter metrics additional_vars={"count_metrics": []}, - source_tests_selector="tag:prerequisites" + source_tests_selector="tag:prerequisites", ) # generate schema error on incremental load @@ -140,7 +166,9 @@ def test_dbt_incremental_schema_out_of_sync_error(destination_info: DBTDestinati ) # metrics: StrStr = get_metrics_from_prometheus([runner.model_exec_info])["dbtrunner_model_status_info"] # full refresh on interactions - assert find_run_result(results, "interactions").message.startswith(destination_info.replace_strategy) + assert find_run_result(results, "interactions").message.startswith( + destination_info.replace_strategy + ) # now incremental load should happen results = runner.run( diff --git a/tests/helpers/dbt_tests/local/utils.py 
b/tests/helpers/dbt_tests/local/utils.py index 2993753a0c..7097140a83 100644 --- a/tests/helpers/dbt_tests/local/utils.py +++ b/tests/helpers/dbt_tests/local/utils.py @@ -1,4 +1,3 @@ - import contextlib from typing import Iterator, NamedTuple @@ -23,10 +22,13 @@ class DBTDestinationInfo(NamedTuple): incremental_strategy: str -def setup_rasa_runner(profile_name: str, dataset_name: str = None, override_values: StrAny = None) -> DBTPackageRunner: - +def setup_rasa_runner( + profile_name: str, dataset_name: str = None, override_values: StrAny = None +) -> DBTPackageRunner: C = DBTRunnerConfiguration() - C.package_location = "https://github.com/scale-vector/rasa_semantic_schema.git" # "/home/rudolfix/src/dbt/rasa_semantic_schema" + C.package_location = ( # "/home/rudolfix/src/dbt/rasa_semantic_schema" + "https://github.com/scale-vector/rasa_semantic_schema.git" + ) C.package_repository_branch = "dlt-dbt-runner-ci-do-not-delete" # override values including the defaults above @@ -41,7 +43,7 @@ def setup_rasa_runner(profile_name: str, dataset_name: str = None, override_valu DestinationClientDwhConfiguration(dataset_name=dataset_name or FIXTURES_DATASET_NAME), TEST_STORAGE_ROOT, package_profile_name=profile_name, - config=C + config=C, ) # now C is resolved init_test_logging(C.runtime) @@ -49,7 +51,9 @@ def setup_rasa_runner(profile_name: str, dataset_name: str = None, override_valu @contextlib.contextmanager -def setup_rasa_runner_client(destination_name: str, destination_dataset_name: str) -> Iterator[None]: +def setup_rasa_runner_client( + destination_name: str, destination_dataset_name: str +) -> Iterator[None]: with cm_yield_client(destination_name, FIXTURES_DATASET_NAME) as client: # emit environ so credentials are passed to dbt profile add_config_to_env(client.config, ("DLT",)) diff --git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py index 1037908e59..a47828a9ea 100644 --- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py +++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py @@ -18,9 +18,20 @@ from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.helpers.dbt.configuration import DBTRunnerConfiguration from dlt.helpers.dbt.exceptions import PrerequisitesException, DBTProcessingError -from dlt.helpers.dbt import package_runner, create_venv, _create_dbt_deps, _default_profile_name, DEFAULT_DBT_VERSION - -from tests.helpers.dbt_tests.utils import JAFFLE_SHOP_REPO, assert_jaffle_completed, clone_jaffle_repo, find_run_result +from dlt.helpers.dbt import ( + package_runner, + create_venv, + _create_dbt_deps, + _default_profile_name, + DEFAULT_DBT_VERSION, +) + +from tests.helpers.dbt_tests.utils import ( + JAFFLE_SHOP_REPO, + assert_jaffle_completed, + clone_jaffle_repo, + find_run_result, +) from tests.utils import test_storage, preserve_environ from tests.load.utils import yield_client_with_storage, cm_yield_client_with_storage @@ -40,14 +51,14 @@ def client() -> Iterator[PostgresClient]: ("postgres", None), ("snowflake", "1.4.0"), ("snowflake", "1.5.2"), - ("snowflake", None) + ("snowflake", None), ] PACKAGE_IDS = [ - f"{destination}-venv-{version}" - if version else f"{destination}-local" + f"{destination}-venv-{version}" if version else f"{destination}-local" for destination, version in PACKAGE_PARAMS ] + @pytest.fixture(scope="module", params=PACKAGE_PARAMS, ids=PACKAGE_IDS) def dbt_package_f(request: Any) -> Iterator[Tuple[str, AnyFun]]: destination_name, version = request.param @@ 
-89,7 +100,10 @@ def test_dbt_configuration() -> None: # check names normalized C: DBTRunnerConfiguration = resolve_configuration( DBTRunnerConfiguration(), - explicit_value={"package_repository_ssh_key": "---NO NEWLINE---", "package_location": "/var/local"} + explicit_value={ + "package_repository_ssh_key": "---NO NEWLINE---", + "package_location": "/var/local", + }, ) assert C.package_repository_ssh_key == "---NO NEWLINE---\n" assert C.package_additional_vars is None @@ -98,7 +112,11 @@ def test_dbt_configuration() -> None: C = resolve_configuration( DBTRunnerConfiguration(), - explicit_value={"package_repository_ssh_key": "---WITH NEWLINE---\n", "package_location": "/var/local", "package_additional_vars": {"a": 1}} + explicit_value={ + "package_repository_ssh_key": "---WITH NEWLINE---\n", + "package_location": "/var/local", + "package_additional_vars": {"a": 1}, + }, ) assert C.package_repository_ssh_key == "---WITH NEWLINE---\n" assert C.package_additional_vars == {"a": 1} @@ -108,9 +126,9 @@ def test_dbt_run_exception_pickle() -> None: obj = decode_obj( encode_obj( DBTProcessingError("test", "A", "B"), # type: ignore[arg-type] - ignore_pickle_errors=False + ignore_pickle_errors=False, ), - ignore_pickle_errors=False + ignore_pickle_errors=False, ) assert obj.command == "test" assert obj.run_results == "A" @@ -119,12 +137,21 @@ def test_dbt_run_exception_pickle() -> None: def test_runner_setup(client: PostgresClient, test_storage: FileStorage) -> None: - add_vars = {"source_dataset_name": "overwritten", "destination_dataset_name": "destination", "schema_name": "this_Schema"} + add_vars = { + "source_dataset_name": "overwritten", + "destination_dataset_name": "destination", + "schema_name": "this_Schema", + } os.environ["DBT_PACKAGE_RUNNER__PACKAGE_ADDITIONAL_VARS"] = json.dumps(add_vars) os.environ["AUTO_FULL_REFRESH_WHEN_OUT_OF_SYNC"] = "False" os.environ["DBT_PACKAGE_RUNNER__RUNTIME__LOG_LEVEL"] = "CRITICAL" test_storage.create_folder("jaffle") - r = package_runner(Venv.restore_current(), client.config, test_storage.make_full_path("jaffle"), JAFFLE_SHOP_REPO) + r = package_runner( + Venv.restore_current(), + client.config, + test_storage.make_full_path("jaffle"), + JAFFLE_SHOP_REPO, + ) # runner settings assert r.credentials is client.config assert r.working_dir == test_storage.make_full_path("jaffle") @@ -140,55 +167,76 @@ def test_runner_setup(client: PostgresClient, test_storage: FileStorage) -> None assert r.config.runtime.log_level == "CRITICAL" assert r.config.auto_full_refresh_when_out_of_sync is False - assert r._get_package_vars() == {"source_dataset_name": client.config.dataset_name, "destination_dataset_name": "destination", "schema_name": "this_Schema"} - assert r._get_package_vars(destination_dataset_name="dest_test_123") == {"source_dataset_name": client.config.dataset_name, "destination_dataset_name": "dest_test_123", "schema_name": "this_Schema"} + assert r._get_package_vars() == { + "source_dataset_name": client.config.dataset_name, + "destination_dataset_name": "destination", + "schema_name": "this_Schema", + } + assert r._get_package_vars(destination_dataset_name="dest_test_123") == { + "source_dataset_name": client.config.dataset_name, + "destination_dataset_name": "dest_test_123", + "schema_name": "this_Schema", + } assert r._get_package_vars(additional_vars={"add": 1, "schema_name": "ovr"}) == { - "source_dataset_name": client.config.dataset_name, - "destination_dataset_name": "destination", "schema_name": "ovr", - "add": 1 - } + "source_dataset_name": 
client.config.dataset_name, + "destination_dataset_name": "destination", + "schema_name": "ovr", + "add": 1, + } -def test_runner_dbt_destinations(test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun]) -> None: +def test_runner_dbt_destinations( + test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun] +) -> None: destination_name, dbt_func = dbt_package_f with cm_yield_client_with_storage(destination_name) as client: - jaffle_base_dir = 'jaffle_' + destination_name + jaffle_base_dir = "jaffle_" + destination_name test_storage.create_folder(jaffle_base_dir) results = dbt_func( client.config, test_storage.make_full_path(jaffle_base_dir), JAFFLE_SHOP_REPO ).run_all(["--fail-fast", "--full-refresh"]) - assert_jaffle_completed(test_storage, results, destination_name, jaffle_dir=jaffle_base_dir + '/jaffle_shop') + assert_jaffle_completed( + test_storage, results, destination_name, jaffle_dir=jaffle_base_dir + "/jaffle_shop" + ) -def test_run_jaffle_from_folder_incremental(test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun]) -> None: +def test_run_jaffle_from_folder_incremental( + test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun] +) -> None: destination_name, dbt_func = dbt_package_f with cm_yield_client_with_storage(destination_name) as client: repo_path = clone_jaffle_repo(test_storage) # copy model with error into package to force run error in model - shutil.copy("./tests/helpers/dbt_tests/cases/jaffle_customers_incremental.sql", os.path.join(repo_path, "models", "customers.sql")) + shutil.copy( + "./tests/helpers/dbt_tests/cases/jaffle_customers_incremental.sql", + os.path.join(repo_path, "models", "customers.sql"), + ) results = dbt_func(client.config, None, repo_path).run_all(run_params=None) assert_jaffle_completed(test_storage, results, destination_name, jaffle_dir="jaffle_shop") results = dbt_func(client.config, None, repo_path).run_all() # out of 100 records 0 was inserted customers = find_run_result(results, "customers") - assert customers.message in JAFFLE_MESSAGES_INCREMENTAL[destination_name]['customers'] + assert customers.message in JAFFLE_MESSAGES_INCREMENTAL[destination_name]["customers"] # change the column name. that will force dbt to fail (on_schema_change='fail'). 
the runner should do a full refresh - shutil.copy("./tests/helpers/dbt_tests/cases/jaffle_customers_incremental_new_column.sql", os.path.join(repo_path, "models", "customers.sql")) + shutil.copy( + "./tests/helpers/dbt_tests/cases/jaffle_customers_incremental_new_column.sql", + os.path.join(repo_path, "models", "customers.sql"), + ) results = dbt_func(client.config, None, repo_path).run_all(run_params=None) assert_jaffle_completed(test_storage, results, destination_name, jaffle_dir="jaffle_shop") -def test_run_jaffle_fail_prerequisites(test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun]) -> None: +def test_run_jaffle_fail_prerequisites( + test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun] +) -> None: destination_name, dbt_func = dbt_package_f with cm_yield_client_with_storage(destination_name) as client: test_storage.create_folder("jaffle") # we run all the tests before tables are materialized with pytest.raises(PrerequisitesException) as pr_exc: dbt_func( - client.config, - test_storage.make_full_path("jaffle"), - JAFFLE_SHOP_REPO - ).run_all(["--fail-fast", "--full-refresh"], source_tests_selector="*") + client.config, test_storage.make_full_path("jaffle"), JAFFLE_SHOP_REPO + ).run_all(["--fail-fast", "--full-refresh"], source_tests_selector="*") proc_err = pr_exc.value.args[0] assert isinstance(proc_err, DBTProcessingError) customers = find_run_result(proc_err.run_results, "unique_customers_customer_id") @@ -197,23 +245,32 @@ def test_run_jaffle_fail_prerequisites(test_storage: FileStorage, dbt_package_f: assert all(r.status == "error" for r in proc_err.run_results) -def test_run_jaffle_invalid_run_args(test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun]) -> None: +def test_run_jaffle_invalid_run_args( + test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun] +) -> None: destination_name, dbt_func = dbt_package_f with cm_yield_client_with_storage(destination_name) as client: test_storage.create_folder("jaffle") # we run all the tests before tables are materialized with pytest.raises(DBTProcessingError) as pr_exc: - dbt_func(client.config, test_storage.make_full_path("jaffle"), JAFFLE_SHOP_REPO).run_all(["--wrong_flag"]) + dbt_func( + client.config, test_storage.make_full_path("jaffle"), JAFFLE_SHOP_REPO + ).run_all(["--wrong_flag"]) # dbt < 1.5 raises systemexit, dbt >= 1.5 just returns success False assert isinstance(pr_exc.value.dbt_results, SystemExit) or pr_exc.value.dbt_results is None -def test_run_jaffle_failed_run(test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun]) -> None: +def test_run_jaffle_failed_run( + test_storage: FileStorage, dbt_package_f: Tuple[str, AnyFun] +) -> None: destination_name, dbt_func = dbt_package_f with cm_yield_client_with_storage(destination_name) as client: repo_path = clone_jaffle_repo(test_storage) # copy model with error into package to force run error in model - shutil.copy("./tests/helpers/dbt_tests/cases/jaffle_customers_with_error.sql", os.path.join(repo_path, "models", "customers.sql")) + shutil.copy( + "./tests/helpers/dbt_tests/cases/jaffle_customers_with_error.sql", + os.path.join(repo_path, "models", "customers.sql"), + ) with pytest.raises(DBTProcessingError) as pr_exc: dbt_func(client.config, None, repo_path).run_all(run_params=None) assert len(pr_exc.value.run_results) == 5 @@ -222,11 +279,9 @@ def test_run_jaffle_failed_run(test_storage: FileStorage, dbt_package_f: Tuple[s JAFFLE_MESSAGES_INCREMENTAL: Dict[str, Any] = { - 'snowflake': { + "snowflake": { # Different message per version - 
'customers': ('SUCCESS 1', 'SUCCESS 100'), + "customers": ("SUCCESS 1", "SUCCESS 100"), }, - 'postgres': { - 'customers': ("INSERT 0 100", ) - } + "postgres": {"customers": ("INSERT 0 100",)}, } diff --git a/tests/helpers/dbt_tests/utils.py b/tests/helpers/dbt_tests/utils.py index 65e0eae2cb..59fa67476b 100644 --- a/tests/helpers/dbt_tests/utils.py +++ b/tests/helpers/dbt_tests/utils.py @@ -9,15 +9,15 @@ TEST_CASES_PATH = "./tests/helpers/dbt_tests/cases/" JAFFLE_RESULT_MESSAGES = { - 'postgres': { - 'stg_orders': 'CREATE VIEW', - 'customers': 'SELECT 100', + "postgres": { + "stg_orders": "CREATE VIEW", + "customers": "SELECT 100", }, # Snowflake only returns generic success messages - 'snowflake': { - 'stg_orders': 'SUCCESS 1', - 'customers': 'SUCCESS 1', - } + "snowflake": { + "stg_orders": "SUCCESS 1", + "customers": "SUCCESS 1", + }, } @@ -33,17 +33,24 @@ def find_run_result(results: Sequence[DBTNodeResult], model_name: str) -> DBTNod def clone_jaffle_repo(test_storage: FileStorage) -> str: repo_path = test_storage.make_full_path("jaffle_shop") # clone jaffle shop for dbt 1.0.0 - clone_repo(JAFFLE_SHOP_REPO, repo_path, with_git_command=None, branch="main").close() # core-v1.0.0 + clone_repo( + JAFFLE_SHOP_REPO, repo_path, with_git_command=None, branch="main" + ).close() # core-v1.0.0 return repo_path -def assert_jaffle_completed(test_storage: FileStorage, results: List[DBTNodeResult], destination_name: str, jaffle_dir: str = "jaffle/jaffle_shop") -> None: +def assert_jaffle_completed( + test_storage: FileStorage, + results: List[DBTNodeResult], + destination_name: str, + jaffle_dir: str = "jaffle/jaffle_shop", +) -> None: assert len(results) == 5 assert all(r.status == "success" for r in results) - stg_orders = find_run_result(results, 'stg_orders') - assert stg_orders.message == JAFFLE_RESULT_MESSAGES[destination_name]['stg_orders'] + stg_orders = find_run_result(results, "stg_orders") + assert stg_orders.message == JAFFLE_RESULT_MESSAGES[destination_name]["stg_orders"] customers = find_run_result(results, "customers") - assert customers.message == JAFFLE_RESULT_MESSAGES[destination_name]['customers'] + assert customers.message == JAFFLE_RESULT_MESSAGES[destination_name]["customers"] # `run_dbt` has injected credentials into environ. 
make sure that credentials were removed assert "CREDENTIALS__PASSWORD" not in os.environ # make sure jaffle_shop was cloned into right dir diff --git a/tests/helpers/providers/test_google_secrets_provider.py b/tests/helpers/providers/test_google_secrets_provider.py index 8e1f14f655..9d32648862 100644 --- a/tests/helpers/providers/test_google_secrets_provider.py +++ b/tests/helpers/providers/test_google_secrets_provider.py @@ -12,7 +12,7 @@ from dlt.common.configuration.resolve import resolve_configuration -DLT_SECRETS_TOML_CONTENT=""" +DLT_SECRETS_TOML_CONTENT = """ secret_value=2137 api.secret_key="ABCD" @@ -26,7 +26,9 @@ def test_regular_keys() -> None: logger.init_logging(RunConfiguration()) # copy bigquery credentials into providers credentials - c = resolve_configuration(GcpServiceAccountCredentials(), sections=(known_sections.DESTINATION, "bigquery")) + c = resolve_configuration( + GcpServiceAccountCredentials(), sections=(known_sections.DESTINATION, "bigquery") + ) secrets[f"{known_sections.PROVIDERS}.google_secrets.credentials"] = dict(c) # c = secrets.get("destination.credentials", GcpServiceAccountCredentials) # print(c) @@ -37,22 +39,46 @@ def test_regular_keys() -> None: # load secrets toml per pipeline provider.get_value("secret_key", AnyType, "pipeline", "api") - assert provider.get_value("secret_key", AnyType, "pipeline", "api") == ("ABCDE", "pipeline-api-secret_key") - assert provider.get_value("credentials", AnyType, "pipeline") == ({"project_id": "mock-credentials-pipeline"}, "pipeline-credentials") + assert provider.get_value("secret_key", AnyType, "pipeline", "api") == ( + "ABCDE", + "pipeline-api-secret_key", + ) + assert provider.get_value("credentials", AnyType, "pipeline") == ( + {"project_id": "mock-credentials-pipeline"}, + "pipeline-credentials", + ) # load source test_source which should also load "sources", "pipeline-sources", "sources-test_source" and "pipeline-sources-test_source" - assert provider.get_value("only_pipeline", AnyType, "pipeline", "sources", "test_source") == ("ONLY", "pipeline-sources-test_source-only_pipeline") + assert provider.get_value("only_pipeline", AnyType, "pipeline", "sources", "test_source") == ( + "ONLY", + "pipeline-sources-test_source-only_pipeline", + ) # we set sources.test_source.secret_prop_1="OVR_A" in pipeline-sources to override value in sources - assert provider.get_value("secret_prop_1", AnyType, None, "sources", "test_source") == ("OVR_A", "sources-test_source-secret_prop_1") + assert provider.get_value("secret_prop_1", AnyType, None, "sources", "test_source") == ( + "OVR_A", + "sources-test_source-secret_prop_1", + ) # get element unique to pipeline-sources - assert provider.get_value("only_pipeline_top", AnyType, "pipeline", "sources") == ("TOP", "pipeline-sources-only_pipeline_top") + assert provider.get_value("only_pipeline_top", AnyType, "pipeline", "sources") == ( + "TOP", + "pipeline-sources-only_pipeline_top", + ) # get element unique to sources - assert provider.get_value("all_sources_present", AnyType, None, "sources") == (True, "sources-all_sources_present") + assert provider.get_value("all_sources_present", AnyType, None, "sources") == ( + True, + "sources-all_sources_present", + ) # get element unique to sources-test_source - assert provider.get_value("secret_prop_2", AnyType, None, "sources", "test_source") == ("B", "sources-test_source-secret_prop_2") + assert provider.get_value("secret_prop_2", AnyType, None, "sources", "test_source") == ( + "B", + "sources-test_source-secret_prop_2", + ) # this 
destination will not be found - assert provider.get_value("url", AnyType, "pipeline", "destination", "filesystem") == (None, "pipeline-destination-filesystem-url") + assert provider.get_value("url", AnyType, "pipeline", "destination", "filesystem") == ( + None, + "pipeline-destination-filesystem-url", + ) # try a single secret value assert provider.get_value("secret", TSecretValue, "pipeline") == (None, "pipeline-secret") @@ -63,7 +89,10 @@ def test_regular_keys() -> None: assert provider.get_value("secret", str, "pipeline") == (None, "pipeline-secret") provider.only_secrets = False # non secrets allowed - assert provider.get_value("secret", str, "pipeline") == ("THIS IS SECRET VALUE", "pipeline-secret") + assert provider.get_value("secret", str, "pipeline") == ( + "THIS IS SECRET VALUE", + "pipeline-secret", + ) # request json # print(provider._toml.as_string()) @@ -73,12 +102,12 @@ def test_regular_keys() -> None: # def test_special_sections() -> None: # pass - # with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": "_secrets/pipelines-ci-secrets-65c0517a9b30.json"}): - # provider = _google_secrets_provider() - # print(provider.get_value("credentials", GcpServiceAccountCredentials, None, "destination", "bigquery")) - # print(provider._toml.as_string()) - # print(provider.get_value("subdomain", AnyType, None, "sources", "zendesk", "credentials")) - # print(provider._toml.as_string()) +# with custom_environ({"GOOGLE_APPLICATION_CREDENTIALS": "_secrets/pipelines-ci-secrets-65c0517a9b30.json"}): +# provider = _google_secrets_provider() +# print(provider.get_value("credentials", GcpServiceAccountCredentials, None, "destination", "bigquery")) +# print(provider._toml.as_string()) +# print(provider.get_value("subdomain", AnyType, None, "sources", "zendesk", "credentials")) +# print(provider._toml.as_string()) # def test_provider_insertion() -> None: @@ -88,4 +117,3 @@ def test_regular_keys() -> None: # }): # # - diff --git a/tests/helpers/streamlit_tests/test_streamlit_show_resources.py b/tests/helpers/streamlit_tests/test_streamlit_show_resources.py index fcf232ea76..b63fc3d472 100644 --- a/tests/helpers/streamlit_tests/test_streamlit_show_resources.py +++ b/tests/helpers/streamlit_tests/test_streamlit_show_resources.py @@ -57,9 +57,9 @@ def test_multiple_resources_pipeline(): ) load_info = pipeline.run([source1(10), source2(20)]) - source1_schema = load_info.pipeline.schemas.get("source1") # type: ignore[attr-defined] + source1_schema = load_info.pipeline.schemas.get("source1") # type: ignore[attr-defined] - assert load_info.pipeline.schema_names == ["source2", "source1"] # type: ignore[attr-defined] + assert load_info.pipeline.schema_names == ["source2", "source1"] # type: ignore[attr-defined] assert source1_schema.data_tables()[0]["name"] == "one" assert source1_schema.data_tables()[0]["columns"]["column_1"].get("primary_key") is True diff --git a/tests/libs/test_parquet_writer.py b/tests/libs/test_parquet_writer.py index 2f4bafa719..92d4950624 100644 --- a/tests/libs/test_parquet_writer.py +++ b/tests/libs/test_parquet_writer.py @@ -21,12 +21,19 @@ def get_writer( buffer_max_items: int = 10, file_max_items: int = 10, file_max_bytes: int = None, - _caps: DestinationCapabilitiesContext = None + _caps: DestinationCapabilitiesContext = None, ) -> BufferedDataWriter[ParquetDataWriter]: caps = _caps or DestinationCapabilitiesContext.generic_capabilities() caps.preferred_loader_file_format = _format file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") - return 
BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, _caps=caps, file_max_items=file_max_items, file_max_bytes=file_max_bytes) + return BufferedDataWriter( + _format, + file_template, + buffer_max_items=buffer_max_items, + _caps=caps, + file_max_items=file_max_items, + file_max_bytes=file_max_bytes, + ) def test_parquet_writer_schema_evolution_with_big_buffer() -> None: @@ -36,8 +43,13 @@ def test_parquet_writer_schema_evolution_with_big_buffer() -> None: c4 = new_column("col4", "text") with get_writer("parquet") as writer: - writer.write_data_item([{"col1": 1, "col2": 2, "col3": "3"}], {"col1": c1, "col2": c2, "col3": c3}) - writer.write_data_item([{"col1": 1, "col2": 2, "col3": "3", "col4": "4", "col5": {"hello": "marcin"}}], {"col1": c1, "col2": c2, "col3": c3, "col4": c4}) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": "3"}], {"col1": c1, "col2": c2, "col3": c3} + ) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": "3", "col4": "4", "col5": {"hello": "marcin"}}], + {"col1": c1, "col2": c2, "col3": c3, "col4": c4}, + ) with open(writer.closed_files[0], "rb") as f: table = pq.read_table(f) @@ -55,9 +67,14 @@ def test_parquet_writer_schema_evolution_with_small_buffer() -> None: with get_writer("parquet", buffer_max_items=4, file_max_items=50) as writer: for _ in range(0, 20): - writer.write_data_item([{"col1": 1, "col2": 2, "col3": "3"}], {"col1": c1, "col2": c2, "col3": c3}) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": "3"}], {"col1": c1, "col2": c2, "col3": c3} + ) for _ in range(0, 20): - writer.write_data_item([{"col1": 1, "col2": 2, "col3": "3", "col4": "4", "col5": {"hello": "marcin"}}], {"col1": c1, "col2": c2, "col3": c3, "col4": c4}) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": "3", "col4": "4", "col5": {"hello": "marcin"}}], + {"col1": c1, "col2": c2, "col3": c3, "col4": c4}, + ) assert len(writer.closed_files) == 2 @@ -76,20 +93,34 @@ def test_parquet_writer_json_serialization() -> None: c3 = new_column("col3", "complex") with get_writer("parquet") as writer: - writer.write_data_item([{"col1": 1, "col2": 2, "col3": {"hello":"dave"}}], {"col1": c1, "col2": c2, "col3": c3}) - writer.write_data_item([{"col1": 1, "col2": 2, "col3": {"hello":"marcin"}}], {"col1": c1, "col2": c2, "col3": c3}) - writer.write_data_item([{"col1": 1, "col2": 2, "col3": {}}], {"col1": c1, "col2": c2, "col3": c3}) - writer.write_data_item([{"col1": 1, "col2": 2, "col3": []}], {"col1": c1, "col2": c2, "col3": c3}) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": {"hello": "dave"}}], + {"col1": c1, "col2": c2, "col3": c3}, + ) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": {"hello": "marcin"}}], + {"col1": c1, "col2": c2, "col3": c3}, + ) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": {}}], {"col1": c1, "col2": c2, "col3": c3} + ) + writer.write_data_item( + [{"col1": 1, "col2": 2, "col3": []}], {"col1": c1, "col2": c2, "col3": c3} + ) with open(writer.closed_files[0], "rb") as f: table = pq.read_table(f) assert table.column("col1").to_pylist() == [1, 1, 1, 1] assert table.column("col2").to_pylist() == [2, 2, 2, 2] - assert table.column("col3").to_pylist() == ["""{"hello":"dave"}""","""{"hello":"marcin"}""","""{}""","""[]"""] + assert table.column("col3").to_pylist() == [ + """{"hello":"dave"}""", + """{"hello":"marcin"}""", + """{}""", + """[]""", + ] def test_parquet_writer_all_data_fields() -> None: - data = dict(TABLE_ROW_ALL_DATA_TYPES) # fix dates to use pendulum data["col4"] = 
ensure_pendulum_datetime(data["col4"]) # type: ignore[arg-type] @@ -158,15 +189,17 @@ def test_parquet_writer_size_file_rotation() -> None: def test_parquet_writer_config() -> None: - os.environ["NORMALIZE__DATA_WRITER__VERSION"] = "2.0" os.environ["NORMALIZE__DATA_WRITER__DATA_PAGE_SIZE"] = str(1024 * 512) os.environ["NORMALIZE__DATA_WRITER__TIMESTAMP_TIMEZONE"] = "America/New York" - with inject_section(ConfigSectionContext(pipeline_name=None, sections=("normalize", ))): + with inject_section(ConfigSectionContext(pipeline_name=None, sections=("normalize",))): with get_writer("parquet", file_max_bytes=2**8, buffer_max_items=2) as writer: for i in range(0, 5): - writer.write_data_item([{"col1": i, "col2": pendulum.now()}], {"col1": new_column("col1", "bigint"), "col2": new_column("col2", "timestamp")}) + writer.write_data_item( + [{"col1": i, "col2": pendulum.now()}], + {"col1": new_column("col1", "bigint"), "col2": new_column("col2", "timestamp")}, + ) # force the parquet writer to be created writer._flush_items() @@ -190,7 +223,11 @@ def test_parquet_writer_schema_from_caps() -> None: for _ in range(0, 5): writer.write_data_item( [{"col1": Decimal("2617.27"), "col2": pendulum.now(), "col3": Decimal(2**250)}], - {"col1": new_column("col1", "decimal"), "col2": new_column("col2", "timestamp"), "col3": new_column("col3", "wei")} + { + "col1": new_column("col1", "decimal"), + "col2": new_column("col2", "timestamp"), + "col3": new_column("col3", "wei"), + }, ) # force the parquet writer to be created writer._flush_items() diff --git a/tests/libs/test_pyarrow.py b/tests/libs/test_pyarrow.py index 6dbdae00cb..dffda35005 100644 --- a/tests/libs/test_pyarrow.py +++ b/tests/libs/test_pyarrow.py @@ -7,39 +7,39 @@ from tests.cases import TABLE_UPDATE_COLUMNS_SCHEMA - def test_py_arrow_to_table_schema_columns(): dlt_schema = deepcopy(TABLE_UPDATE_COLUMNS_SCHEMA) caps = DestinationCapabilitiesContext.generic_capabilities() # The arrow schema will add precision - dlt_schema['col4']['precision'] = caps.timestamp_precision - dlt_schema['col6']['precision'], dlt_schema['col6']['scale'] = caps.decimal_precision - dlt_schema['col11']['precision'] = caps.timestamp_precision - dlt_schema['col4_null']['precision'] = caps.timestamp_precision - dlt_schema['col6_null']['precision'], dlt_schema['col6_null']['scale'] = caps.decimal_precision - dlt_schema['col11_null']['precision'] = caps.timestamp_precision + dlt_schema["col4"]["precision"] = caps.timestamp_precision + dlt_schema["col6"]["precision"], dlt_schema["col6"]["scale"] = caps.decimal_precision + dlt_schema["col11"]["precision"] = caps.timestamp_precision + dlt_schema["col4_null"]["precision"] = caps.timestamp_precision + dlt_schema["col6_null"]["precision"], dlt_schema["col6_null"]["scale"] = caps.decimal_precision + dlt_schema["col11_null"]["precision"] = caps.timestamp_precision # Ignoring wei as we can't distinguish from decimal - dlt_schema['col8']['precision'], dlt_schema['col8']['scale'] = (76, 0) - dlt_schema['col8']['data_type'] = 'decimal' - dlt_schema['col8_null']['precision'], dlt_schema['col8_null']['scale'] = (76, 0) - dlt_schema['col8_null']['data_type'] = 'decimal' + dlt_schema["col8"]["precision"], dlt_schema["col8"]["scale"] = (76, 0) + dlt_schema["col8"]["data_type"] = "decimal" + dlt_schema["col8_null"]["precision"], dlt_schema["col8_null"]["scale"] = (76, 0) + dlt_schema["col8_null"]["data_type"] = "decimal" # No json type - dlt_schema['col9']['data_type'] = 'text' - del dlt_schema['col9']['variant'] - 
dlt_schema['col9_null']['data_type'] = 'text' - del dlt_schema['col9_null']['variant'] + dlt_schema["col9"]["data_type"] = "text" + del dlt_schema["col9"]["variant"] + dlt_schema["col9_null"]["data_type"] = "text" + del dlt_schema["col9_null"]["variant"] # arrow string fields don't have precision - del dlt_schema['col5_precision']['precision'] - + del dlt_schema["col5_precision"]["precision"] # Convert to arrow schema arrow_schema = pa.schema( [ pa.field( - column["name"], get_py_arrow_datatype(column, caps, "UTC"), nullable=column["nullable"] + column["name"], + get_py_arrow_datatype(column, caps, "UTC"), + nullable=column["nullable"], ) for column in dlt_schema.values() ] diff --git a/tests/libs/test_pydantic.py b/tests/libs/test_pydantic.py index 5606bd25b2..1021a3037d 100644 --- a/tests/libs/test_pydantic.py +++ b/tests/libs/test_pydantic.py @@ -1,13 +1,32 @@ from copy import copy import pytest -from typing import ClassVar, Sequence, Mapping, Dict, MutableMapping, MutableSequence, Union, Optional, List, Dict, Any +from typing import ( + ClassVar, + Sequence, + Mapping, + Dict, + MutableMapping, + MutableSequence, + Union, + Optional, + List, + Dict, + Any, +) from enum import Enum from datetime import datetime, date, time # noqa: I251 from dlt.common import Decimal from dlt.common import json -from dlt.common.libs.pydantic import DltConfig, pydantic_to_table_schema_columns, apply_schema_contract_to_model, validate_item, validate_items, create_list_model +from dlt.common.libs.pydantic import ( + DltConfig, + pydantic_to_table_schema_columns, + apply_schema_contract_to_model, + validate_item, + validate_items, + create_list_model, +) from pydantic import BaseModel, Json, AnyHttpUrl, ConfigDict, ValidationError from dlt.common.schema.exceptions import DataValidationError @@ -73,8 +92,12 @@ class ModelWithConfig(Model): TEST_MODEL_INSTANCE = Model( - bigint_field=1, text_field="text", timestamp_field=datetime.now(), - date_field=date.today(), decimal_field=Decimal(1.1), double_field=1.1, + bigint_field=1, + text_field="text", + timestamp_field=datetime.now(), + date_field=date.today(), + decimal_field=Decimal(1.1), + double_field=1.1, time_field=time(1, 2, 3, 12345), nested_field=NestedModel(nested_field="nested"), list_field=["a", "b", "c"], @@ -93,7 +116,7 @@ class ModelWithConfig(Model): ) -@pytest.mark.parametrize('instance', [True, False]) +@pytest.mark.parametrize("instance", [True, False]) def test_pydantic_model_to_columns(instance: bool) -> None: if instance: model = TEST_MODEL_INSTANCE @@ -110,22 +133,22 @@ def test_pydantic_model_to_columns(instance: bool) -> None: assert result["double_field"]["data_type"] == "double" assert result["time_field"]["data_type"] == "time" assert result["nested_field"]["data_type"] == "complex" - assert result['list_field']['data_type'] == 'complex' - assert result['union_field']['data_type'] == 'bigint' - assert result['optional_field']['data_type'] == 'double' - assert result['optional_field']['nullable'] is True - assert result['blank_dict_field']['data_type'] == 'complex' - assert result['parametrized_dict_field']['data_type'] == 'complex' - assert result['str_enum_field']['data_type'] == 'text' - assert result['int_enum_field']['data_type'] == 'bigint' - assert result['mixed_enum_int_field']['data_type'] == 'text' - assert result['mixed_enum_str_field']['data_type'] == 'text' - assert result['json_field']['data_type'] == 'complex' - assert result['url_field']['data_type'] == 'text' + assert result["list_field"]["data_type"] == "complex" + 
assert result["union_field"]["data_type"] == "bigint" + assert result["optional_field"]["data_type"] == "double" + assert result["optional_field"]["nullable"] is True + assert result["blank_dict_field"]["data_type"] == "complex" + assert result["parametrized_dict_field"]["data_type"] == "complex" + assert result["str_enum_field"]["data_type"] == "text" + assert result["int_enum_field"]["data_type"] == "bigint" + assert result["mixed_enum_int_field"]["data_type"] == "text" + assert result["mixed_enum_str_field"]["data_type"] == "text" + assert result["json_field"]["data_type"] == "complex" + assert result["url_field"]["data_type"] == "text" # Any type fields are excluded from schema - assert 'any_field' not in result - assert 'json_any_field' not in result + assert "any_field" not in result + assert "json_any_field" not in result def test_pydantic_model_skip_complex_types() -> None: @@ -168,17 +191,17 @@ def test_model_for_column_mode() -> None: # assert "frozen" in model_freeze.model_config with pytest.raises(ValidationError) as py_ex: model_freeze.parse_obj(instance_extra) - assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + assert py_ex.value.errors()[0]["loc"] == ("extra_prop",) model_freeze = apply_schema_contract_to_model(Model, "freeze") # type: ignore[arg-type] with pytest.raises(ValidationError) as py_ex: model_freeze.parse_obj(instance_extra) - assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + assert py_ex.value.errors()[0]["loc"] == ("extra_prop",) # discard row - same as freeze model_freeze = apply_schema_contract_to_model(ModelWithConfig, "discard_row") with pytest.raises(ValidationError) as py_ex: model_freeze.parse_obj(instance_extra) - assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + assert py_ex.value.errors()[0]["loc"] == ("extra_prop",) # discard value - ignore extra fields model_discard = apply_schema_contract_to_model(ModelWithConfig, "discard_value") @@ -232,13 +255,12 @@ class User(BaseModel): model_freeze = apply_schema_contract_to_model(User, "evolve", "freeze") from typing import get_type_hints + print(get_type_hints(model_freeze)) print(get_type_hints(model_freeze.model_fields["address"].annotation)) - def test_item_list_validation() -> None: - class ItemModel(BaseModel): b: bool opt: Optional[int] = None @@ -252,7 +274,8 @@ class ItemModel(BaseModel): "items", discard_list_model, [{"b": True}, {"b": 2, "opt": "not int", "extra": 1.2}, {"b": 3}, {"b": False}], - "discard_row", "discard_row" + "discard_row", + "discard_row", ) # {"b": 2, "opt": "not int", "extra": 1.2} - note that this will generate 3 errors for the same item # and is crucial in our tests when discarding rows @@ -260,7 +283,13 @@ class ItemModel(BaseModel): assert items[0].b is True assert items[1].b is False # violate extra field - items = validate_items("items", discard_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "discard_row") + items = validate_items( + "items", + discard_list_model, + [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], + "discard_row", + "discard_row", + ) assert len(items) == 1 assert items[0].b is True @@ -269,20 +298,32 @@ class ItemModel(BaseModel): freeze_list_model = create_list_model(freeze_model) # violate data type with pytest.raises(DataValidationError) as val_ex: - validate_items("items", freeze_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "freeze", "freeze") + validate_items( + "items", + freeze_list_model, + [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], + "freeze", + 
"freeze", + ) assert val_ex.value.schema_name is None assert val_ex.value.table_name == "items" - assert val_ex.value.column_name == str(("items", 1 , 'b')) # pydantic location + assert val_ex.value.column_name == str(("items", 1, "b")) # pydantic location assert val_ex.value.contract_entity == "data_type" assert val_ex.value.contract_mode == "freeze" assert val_ex.value.table_schema is freeze_list_model assert val_ex.value.data_item == {"b": 2} # extra type with pytest.raises(DataValidationError) as val_ex: - validate_items("items", freeze_list_model, [{"b": True}, {"a": 2, "b": False}, {"b": 3}, {"b": False}], "freeze", "freeze") + validate_items( + "items", + freeze_list_model, + [{"b": True}, {"a": 2, "b": False}, {"b": 3}, {"b": False}], + "freeze", + "freeze", + ) assert val_ex.value.schema_name is None assert val_ex.value.table_name == "items" - assert val_ex.value.column_name == str(("items", 1 , 'a')) # pydantic location + assert val_ex.value.column_name == str(("items", 1, "a")) # pydantic location assert val_ex.value.contract_entity == "columns" assert val_ex.value.contract_mode == "freeze" assert val_ex.value.table_schema is freeze_list_model @@ -292,7 +333,13 @@ class ItemModel(BaseModel): discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze") discard_list_model = create_list_model(discard_value_model) # violate extra field - items = validate_items("items", discard_list_model, [{"b": True}, {"b": False, "a": False}], "discard_value", "freeze") + items = validate_items( + "items", + discard_list_model, + [{"b": True}, {"b": False, "a": False}], + "discard_value", + "freeze", + ) assert len(items) == 2 # "a" extra got remove assert items[1].dict() == {"b": False, "opt": None} @@ -304,12 +351,24 @@ class ItemModel(BaseModel): evolve_model = apply_schema_contract_to_model(ItemModel, "evolve", "evolve") evolve_list_model = create_list_model(evolve_model) # for data types a lenient model will be created that accepts any type - items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "evolve", "evolve") + items = validate_items( + "items", + evolve_list_model, + [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], + "evolve", + "evolve", + ) assert len(items) == 4 assert items[0].b is True assert items[1].b == 2 # extra fields allowed - items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "evolve", "evolve") + items = validate_items( + "items", + evolve_list_model, + [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], + "evolve", + "evolve", + ) assert len(items) == 4 assert items[3].b is False assert items[3].a is False # type: ignore[attr-defined] @@ -318,29 +377,43 @@ class ItemModel(BaseModel): mixed_model = apply_schema_contract_to_model(ItemModel, "discard_row", "evolve") mixed_list_model = create_list_model(mixed_model) # for data types a lenient model will be created that accepts any type - items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "discard_row", "evolve") + items = validate_items( + "items", + mixed_list_model, + [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], + "discard_row", + "evolve", + ) assert len(items) == 4 assert items[0].b is True assert items[1].b == 2 # extra fields forbidden - full rows discarded - items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "evolve") + items = validate_items( + 
"items", + mixed_list_model, + [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], + "discard_row", + "evolve", + ) assert len(items) == 3 def test_item_validation() -> None: - class ItemModel(BaseModel): b: bool dlt_config: ClassVar[DltConfig] = {"skip_complex_types": False} - - # non validating items removed from the list (both extra and declared) discard_model = apply_schema_contract_to_model(ItemModel, "discard_row", "discard_row") # violate data type assert validate_item("items", discard_model, {"b": 2}, "discard_row", "discard_row") is None # violate extra field - assert validate_item("items", discard_model, {"b": False, "a": False}, "discard_row", "discard_row") is None + assert ( + validate_item( + "items", discard_model, {"b": False, "a": False}, "discard_row", "discard_row" + ) + is None + ) # freeze on non validating items (both extra and declared) freeze_model = apply_schema_contract_to_model(ItemModel, "freeze", "freeze") @@ -349,7 +422,7 @@ class ItemModel(BaseModel): validate_item("items", freeze_model, {"b": 2}, "freeze", "freeze") assert val_ex.value.schema_name is None assert val_ex.value.table_name == "items" - assert val_ex.value.column_name == str(('b',)) # pydantic location + assert val_ex.value.column_name == str(("b",)) # pydantic location assert val_ex.value.contract_entity == "data_type" assert val_ex.value.contract_mode == "freeze" assert val_ex.value.table_schema is freeze_model @@ -359,7 +432,7 @@ class ItemModel(BaseModel): validate_item("items", freeze_model, {"a": 2, "b": False}, "freeze", "freeze") assert val_ex.value.schema_name is None assert val_ex.value.table_name == "items" - assert val_ex.value.column_name == str(('a',)) # pydantic location + assert val_ex.value.column_name == str(("a",)) # pydantic location assert val_ex.value.contract_entity == "columns" assert val_ex.value.contract_mode == "freeze" assert val_ex.value.table_schema is freeze_model @@ -368,7 +441,9 @@ class ItemModel(BaseModel): # discard values discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze") # violate extra field - item = validate_item("items", discard_value_model, {"b": False, "a": False}, "discard_value", "freeze") + item = validate_item( + "items", discard_value_model, {"b": False, "a": False}, "discard_value", "freeze" + ) # "a" extra got removed assert item.dict() == {"b": False} @@ -388,4 +463,7 @@ class ItemModel(BaseModel): item = validate_item("items", mixed_model, {"b": 3}, "discard_row", "evolve") assert item.b == 3 # extra fields forbidden - full rows discarded - assert validate_item("items", mixed_model, {"b": False, "a": False}, "discard_row", "evolve") is None + assert ( + validate_item("items", mixed_model, {"b": False, "a": False}, "discard_row", "evolve") + is None + ) diff --git a/tests/load/athena_iceberg/test_athena_iceberg.py b/tests/load/athena_iceberg/test_athena_iceberg.py index 72772b0e2d..0b18f22639 100644 --- a/tests/load/athena_iceberg/test_athena_iceberg.py +++ b/tests/load/athena_iceberg/test_athena_iceberg.py @@ -1,4 +1,3 @@ - import pytest import os import datetime # noqa: I251 @@ -7,7 +6,7 @@ import dlt from dlt.common import pendulum from dlt.common.utils import uniq_id -from tests.load.pipeline.utils import load_table_counts +from tests.load.pipeline.utils import load_table_counts from tests.cases import table_update_and_row, assert_all_data_types_row from tests.pipeline.utils import assert_load_info @@ -25,21 +24,20 @@ def test_iceberg() -> None: We write two tables, one with the iceberg flag, 
one without. We expect the iceberg table and its subtables to accept update commands and the other table to reject them. """ - os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = "s3://dlt-ci-test-bucket" + os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "s3://dlt-ci-test-bucket" - pipeline = dlt.pipeline(pipeline_name="aaaaathena-iceberg", destination="athena", staging="filesystem", full_refresh=True) + pipeline = dlt.pipeline( + pipeline_name="aaaaathena-iceberg", + destination="athena", + staging="filesystem", + full_refresh=True, + ) def items() -> Iterator[Any]: yield { "id": 1, "name": "item", - "sub_items": [{ - "id": 101, - "name": "sub item 101" - },{ - "id": 101, - "name": "sub item 102" - }] + "sub_items": [{"id": 101, "name": "sub item 101"}, {"id": 101, "name": "sub item 102"}], } @dlt.resource(name="items_normal", write_disposition="append") @@ -53,7 +51,9 @@ def items_iceberg(): print(pipeline.run([items_normal, items_iceberg])) # see if we have athena tables with items - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values() ]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values()] + ) assert table_counts["items_normal"] == 1 assert table_counts["items_normal__sub_items"] == 2 assert table_counts["_dlt_loads"] == 1 @@ -75,4 +75,3 @@ def items_iceberg(): # modifying iceberg table will succeed client.execute_sql("UPDATE items_iceberg SET name='new name'") client.execute_sql("UPDATE items_iceberg__sub_items SET name='super new name'") - diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index abbaf8d414..5bbc7b79be 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -8,7 +8,12 @@ from dlt.common.arithmetics import numeric_default_context from dlt.common.configuration.exceptions import ConfigFieldMissingException from dlt.common.configuration.resolve import resolve_configuration -from dlt.common.configuration.specs import GcpServiceAccountCredentials, GcpServiceAccountCredentialsWithoutDefaults, GcpOAuthCredentials, GcpOAuthCredentialsWithoutDefaults +from dlt.common.configuration.specs import ( + GcpServiceAccountCredentials, + GcpServiceAccountCredentialsWithoutDefaults, + GcpOAuthCredentials, + GcpOAuthCredentialsWithoutDefaults, +) from dlt.common.configuration.specs import gcp_credentials from dlt.common.configuration.specs.exceptions import InvalidGoogleNativeCredentialsType from dlt.common.storages import FileStorage @@ -20,7 +25,13 @@ from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, preserve_environ from tests.common.utils import json_case_path as common_json_case_path from tests.common.configuration.utils import environment -from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage, cm_yield_client_with_storage +from tests.load.utils import ( + expect_load_file, + prepare_table, + yield_client_with_storage, + cm_yield_client_with_storage, +) + @pytest.fixture(scope="module") def client() -> Iterator[BigQueryClient]: @@ -42,7 +53,7 @@ def test_service_credentials_with_default(environment: Any) -> None: # resolve will miss values and try to find default credentials on the machine with pytest.raises(ConfigFieldMissingException) as py_ex: resolve_configuration(gcpc) - assert py_ex.value.fields == ['project_id', 'private_key', 'client_email'] + assert py_ex.value.fields == ["project_id", 
"private_key", "client_email"] # prepare real service.json services_str, dest_path = prepare_service_json() @@ -106,7 +117,7 @@ def test_oauth_credentials_with_default(environment: Any) -> None: # resolve will miss values and try to find default credentials on the machine with pytest.raises(ConfigFieldMissingException) as py_ex: resolve_configuration(gcoauth) - assert py_ex.value.fields == ['client_id', 'client_secret', 'refresh_token', 'project_id'] + assert py_ex.value.fields == ["client_id", "client_secret", "refresh_token", "project_id"] # prepare real service.json oauth_str, _ = prepare_oauth_json() @@ -180,7 +191,9 @@ def test_get_oauth_access_token() -> None: def test_bigquery_configuration() -> None: - config = resolve_configuration(BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery")) + config = resolve_configuration( + BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + ) assert config.location == "US" assert config.get_location() == "US" assert config.http_timeout == 15.0 @@ -190,16 +203,22 @@ def test_bigquery_configuration() -> None: # credentials location is deprecated os.environ["CREDENTIALS__LOCATION"] = "EU" - config = resolve_configuration(BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery")) + config = resolve_configuration( + BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + ) assert config.location == "US" assert config.credentials.location == "EU" # but if it is set, we propagate it to the config assert config.get_location() == "EU" os.environ["LOCATION"] = "ATLANTIS" - config = resolve_configuration(BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery")) + config = resolve_configuration( + BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + ) assert config.get_location() == "ATLANTIS" os.environ["DESTINATION__FILE_UPLOAD_TIMEOUT"] = "20000" - config = resolve_configuration(BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery")) + config = resolve_configuration( + BigQueryClientConfiguration(dataset_name="dataset"), sections=("destination", "bigquery") + ) assert config.file_upload_timeout == 20000.0 # default fingerprint is empty @@ -230,30 +249,40 @@ def test_bigquery_job_errors(client: BigQueryClient, file_storage: FileStorage) load_json = { "_dlt_id": uniq_id(), "_dlt_root_id": uniq_id(), - "sender_id":'90238094809sajlkjxoiewjhduuiuehd', - "timestamp": str(pendulum.now()) + "sender_id": "90238094809sajlkjxoiewjhduuiuehd", + "timestamp": str(pendulum.now()), } job = expect_load_file(client, file_storage, json.dumps(load_json), user_table_name) # start a job from the same file. 
it should fallback to retrieve job silently - r_job = client.start_file_load(client.schema.get_table(user_table_name), file_storage.make_full_path(job.file_name()), uniq_id()) + r_job = client.start_file_load( + client.schema.get_table(user_table_name), + file_storage.make_full_path(job.file_name()), + uniq_id(), + ) assert r_job.state() == "completed" -@pytest.mark.parametrize('location', ["US", "EU"]) +@pytest.mark.parametrize("location", ["US", "EU"]) def test_bigquery_location(location: str, file_storage: FileStorage) -> None: - with cm_yield_client_with_storage("bigquery", default_config_values={"credentials": {"location": location}}) as client: + with cm_yield_client_with_storage( + "bigquery", default_config_values={"credentials": {"location": location}} + ) as client: user_table_name = prepare_table(client) load_json = { "_dlt_id": uniq_id(), "_dlt_root_id": uniq_id(), - "sender_id": '90238094809sajlkjxoiewjhduuiuehd', - "timestamp": str(pendulum.now()) + "sender_id": "90238094809sajlkjxoiewjhduuiuehd", + "timestamp": str(pendulum.now()), } job = expect_load_file(client, file_storage, json.dumps(load_json), user_table_name) # start a job from the same file. it should fallback to retrieve job silently - client.start_file_load(client.schema.get_table(user_table_name), file_storage.make_full_path(job.file_name()), uniq_id()) + client.start_file_load( + client.schema.get_table(user_table_name), + file_storage.make_full_path(job.file_name()), + uniq_id(), + ) canonical_name = client.sql_client.make_qualified_table_name(user_table_name, escape=False) t = client.sql_client.native_connection.get_table(canonical_name) assert t.location == location @@ -265,58 +294,84 @@ def test_loading_errors(client: BigQueryClient, file_storage: FileStorage) -> No load_json: Dict[str, Any] = { "_dlt_id": uniq_id(), "_dlt_root_id": uniq_id(), - "sender_id":'90238094809sajlkjxoiewjhduuiuehd', - "timestamp": str(pendulum.now()) + "sender_id": "90238094809sajlkjxoiewjhduuiuehd", + "timestamp": str(pendulum.now()), } insert_json = copy(load_json) insert_json["_unk_"] = None - job = expect_load_file(client, file_storage, json.dumps(insert_json), user_table_name, status="failed") + job = expect_load_file( + client, file_storage, json.dumps(insert_json), user_table_name, status="failed" + ) assert "No such field: _unk_" in job.exception() # insert null value insert_json = copy(load_json) insert_json["timestamp"] = None - job = expect_load_file(client, file_storage, json.dumps(insert_json), user_table_name, status="failed") + job = expect_load_file( + client, file_storage, json.dumps(insert_json), user_table_name, status="failed" + ) assert "Only optional fields can be set to NULL. 
Field: timestamp;" in job.exception() # insert wrong type insert_json = copy(load_json) insert_json["timestamp"] = "AA" - job = expect_load_file(client, file_storage, json.dumps(insert_json), user_table_name, status="failed") + job = expect_load_file( + client, file_storage, json.dumps(insert_json), user_table_name, status="failed" + ) assert "Couldn't convert value to timestamp:" in job.exception() # numeric overflow on bigint insert_json = copy(load_json) # 2**64//2 - 1 is a maximum bigint value - insert_json["metadata__rasa_x_id"] = 2**64//2 - job = expect_load_file(client, file_storage, json.dumps(insert_json), user_table_name, status="failed") + insert_json["metadata__rasa_x_id"] = 2**64 // 2 + job = expect_load_file( + client, file_storage, json.dumps(insert_json), user_table_name, status="failed" + ) assert "Could not convert value" in job.exception() # numeric overflow on NUMERIC insert_json = copy(load_json) # default decimal is (38, 9) (128 bit), use local context to generate decimals with 38 precision with numeric_default_context(): - below_limit = Decimal(10**29) - Decimal('0.001') + below_limit = Decimal(10**29) - Decimal("0.001") above_limit = Decimal(10**29) # this will pass insert_json["parse_data__intent__id"] = below_limit - job = expect_load_file(client, file_storage, json.dumps(insert_json), user_table_name, status="completed") + job = expect_load_file( + client, file_storage, json.dumps(insert_json), user_table_name, status="completed" + ) # this will fail insert_json["parse_data__intent__id"] = above_limit - job = expect_load_file(client, file_storage, json.dumps(insert_json), user_table_name, status="failed") - assert "Invalid NUMERIC value: 100000000000000000000000000000 Field: parse_data__intent__id;" in job.exception() + job = expect_load_file( + client, file_storage, json.dumps(insert_json), user_table_name, status="failed" + ) + assert ( + "Invalid NUMERIC value: 100000000000000000000000000000 Field: parse_data__intent__id;" + in job.exception() + ) # max bigquery decimal is (76, 76) (256 bit) = 5.7896044618658097711785492504343953926634992332820282019728792003956564819967E+38 insert_json = copy(load_json) - insert_json["parse_data__metadata__rasa_x_id"] = Decimal("5.7896044618658097711785492504343953926634992332820282019728792003956564819968E+38") - job = expect_load_file(client, file_storage, json.dumps(insert_json), user_table_name, status="failed") - assert "Invalid BIGNUMERIC value: 578960446186580977117854925043439539266.34992332820282019728792003956564819968 Field: parse_data__metadata__rasa_x_id;" in job.exception() + insert_json["parse_data__metadata__rasa_x_id"] = Decimal( + "5.7896044618658097711785492504343953926634992332820282019728792003956564819968E+38" + ) + job = expect_load_file( + client, file_storage, json.dumps(insert_json), user_table_name, status="failed" + ) + assert ( + "Invalid BIGNUMERIC value:" + " 578960446186580977117854925043439539266.34992332820282019728792003956564819968 Field:" + " parse_data__metadata__rasa_x_id;" + in job.exception() + ) def prepare_oauth_json() -> Tuple[str, str]: # prepare real service.json storage = FileStorage("_secrets", makedirs=True) - with open(common_json_case_path("oauth_client_secret_929384042504"), mode="r", encoding="utf-8") as f: + with open( + common_json_case_path("oauth_client_secret_929384042504"), mode="r", encoding="utf-8" + ) as f: oauth_str = f.read() dest_path = storage.save("oauth_client_secret_929384042504.json", oauth_str) return oauth_str, dest_path diff --git 
a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index 0d8ab1c8c2..d622f9205c 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -14,6 +14,7 @@ from tests.load.utils import TABLE_UPDATE + @pytest.fixture def schema() -> Schema: return Schema("event") @@ -37,7 +38,7 @@ def gcp_client(schema: Schema) -> BigQueryClient: creds.project_id = "test_project_id" return BigQueryClient( schema, - BigQueryClientConfiguration(dataset_name="test_" + uniq_id(), credentials=creds) # type: ignore[arg-type] + BigQueryClientConfiguration(dataset_name="test_" + uniq_id(), credentials=creds), # type: ignore[arg-type] ) diff --git a/tests/load/conftest.py b/tests/load/conftest.py index 23c7a2b8c4..1d40e912e6 100644 --- a/tests/load/conftest.py +++ b/tests/load/conftest.py @@ -6,23 +6,21 @@ from tests.utils import preserve_environ -@pytest.fixture(scope='function', params=DEFAULT_BUCKETS) +@pytest.fixture(scope="function", params=DEFAULT_BUCKETS) def default_buckets_env(request) -> Iterator[str]: - """Parametrized fixture to configure filesystem destination bucket in env for each test bucket - """ - os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = request.param + """Parametrized fixture to configure filesystem destination bucket in env for each test bucket""" + os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = request.param yield request.param - -@pytest.fixture(scope='function', params=ALL_BUCKETS) +@pytest.fixture(scope="function", params=ALL_BUCKETS) def all_buckets_env(request) -> Iterator[str]: if isinstance(request.param, dict): - bucket_url = request.param['bucket_url'] + bucket_url = request.param["bucket_url"] # R2 bucket needs to override all credentials - for key, value in request.param['credentials'].items(): - os.environ[f'DESTINATION__FILESYSTEM__CREDENTIALS__{key.upper()}'] = value + for key, value in request.param["credentials"].items(): + os.environ[f"DESTINATION__FILESYSTEM__CREDENTIALS__{key.upper()}"] = value else: bucket_url = request.param - os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = bucket_url + os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = bucket_url yield bucket_url diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index ddfc681a84..ef151833e4 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -6,12 +6,18 @@ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.utils import get_resolved_traces -from dlt.destinations.impl.duckdb.configuration import DUCK_DB_NAME, DuckDbClientConfiguration, DuckDbCredentials, DEFAULT_DUCK_DB_NAME +from dlt.destinations.impl.duckdb.configuration import ( + DUCK_DB_NAME, + DuckDbClientConfiguration, + DuckDbCredentials, + DEFAULT_DUCK_DB_NAME, +) from dlt.destinations import duckdb from tests.load.pipeline.utils import drop_pipeline, assert_table from tests.utils import patch_home_dir, autouse_test_storage, preserve_environ, TEST_STORAGE_ROOT + @pytest.fixture(autouse=True) def delete_default_duckdb_credentials() -> Iterator[None]: # remove the default duckdb config @@ -68,7 +74,9 @@ def test_duckdb_database_path() -> None: os.unlink(db_path) # test special :pipeline: path to create in pipeline folder - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=":pipeline:")) + c = resolve_configuration( + 
DuckDbClientConfiguration(dataset_name="test_dataset", credentials=":pipeline:") + ) db_path = os.path.abspath(os.path.join(p.working_dir, DEFAULT_DUCK_DB_NAME)) assert c.credentials._conn_str().lower() == db_path.lower() # connect @@ -80,7 +88,11 @@ def test_duckdb_database_path() -> None: # provide relative path db_path = "_storage/test_quack.duckdb" - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials="duckdb:///_storage/test_quack.duckdb")) + c = resolve_configuration( + DuckDbClientConfiguration( + dataset_name="test_dataset", credentials="duckdb:///_storage/test_quack.duckdb" + ) + ) assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) @@ -89,7 +101,9 @@ def test_duckdb_database_path() -> None: # provide absolute path db_path = os.path.abspath("_storage/abs_test_quack.duckdb") - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=f"duckdb:///{db_path}")) + c = resolve_configuration( + DuckDbClientConfiguration(dataset_name="test_dataset", credentials=f"duckdb:///{db_path}") + ) assert os.path.isabs(c.credentials.database) assert c.credentials._conn_str().lower() == db_path.lower() conn = c.credentials.borrow_conn(read_only=False) @@ -99,7 +113,9 @@ def test_duckdb_database_path() -> None: # set just path as credentials db_path = "_storage/path_test_quack.duckdb" - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path)) + c = resolve_configuration( + DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path) + ) assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) @@ -107,7 +123,9 @@ def test_duckdb_database_path() -> None: p = p.drop() db_path = os.path.abspath("_storage/abs_path_test_quack.duckdb") - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path)) + c = resolve_configuration( + DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path) + ) assert os.path.isabs(c.credentials.database) assert c.credentials._conn_str().lower() == db_path.lower() conn = c.credentials.borrow_conn(read_only=False) @@ -119,7 +137,9 @@ def test_duckdb_database_path() -> None: import duckdb with pytest.raises(duckdb.IOException): - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=TEST_STORAGE_ROOT)) + c = resolve_configuration( + DuckDbClientConfiguration(dataset_name="test_dataset", credentials=TEST_STORAGE_ROOT) + ) conn = c.credentials.borrow_conn(read_only=False) @@ -204,7 +224,9 @@ def test_external_duckdb_database() -> None: # pass explicit in memory database conn = duckdb.connect(":memory:") - c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=conn)) + c = resolve_configuration( + DuckDbClientConfiguration(dataset_name="test_dataset", credentials=conn) + ) assert c.credentials._conn_borrows == 0 assert c.credentials._conn is conn int_conn = c.credentials.borrow_conn(read_only=False) @@ -216,6 +238,7 @@ def test_external_duckdb_database() -> None: assert hasattr(c.credentials, "_conn") conn.close() + def test_default_duckdb_dataset_name() -> None: # Check if dataset_name does not collide with pipeline_name data = ["a", "b", "c"] diff --git a/tests/load/duckdb/test_duckdb_table_builder.py 
b/tests/load/duckdb/test_duckdb_table_builder.py index a5870763fc..0e6f799047 100644 --- a/tests/load/duckdb/test_duckdb_table_builder.py +++ b/tests/load/duckdb/test_duckdb_table_builder.py @@ -82,7 +82,7 @@ def test_create_table_with_hints(client: DuckDbClient) -> None: mod_update[0]["sort"] = True mod_update[1]["unique"] = True mod_update[4]["foreign_key"] = True - sql = ';'.join(client._get_table_update_sql("event_test_table", mod_update, False)) + sql = ";".join(client._get_table_update_sql("event_test_table", mod_update, False)) assert '"col1" BIGINT NOT NULL' in sql assert '"col2" DOUBLE NOT NULL' in sql assert '"col5" VARCHAR ' in sql @@ -92,7 +92,10 @@ def test_create_table_with_hints(client: DuckDbClient) -> None: assert '"col4" TIMESTAMP WITH TIME ZONE NOT NULL' in sql # same thing with indexes - client = DuckDbClient(client.schema, DuckDbClientConfiguration(dataset_name="test_" + uniq_id(), create_indexes=True)) + client = DuckDbClient( + client.schema, + DuckDbClientConfiguration(dataset_name="test_" + uniq_id(), create_indexes=True), + ) sql = client._get_table_update_sql("event_test_table", mod_update, False)[0] sqlfluff.parse(sql) assert '"col2" DOUBLE UNIQUE NOT NULL' in sql diff --git a/tests/load/duckdb/test_motherduck_client.py b/tests/load/duckdb/test_motherduck_client.py index 582847bfa2..d57cf58f53 100644 --- a/tests/load/duckdb/test_motherduck_client.py +++ b/tests/load/duckdb/test_motherduck_client.py @@ -3,12 +3,16 @@ from dlt.common.configuration.resolve import resolve_configuration -from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration +from dlt.destinations.impl.motherduck.configuration import ( + MotherDuckCredentials, + MotherDuckClientConfiguration, +) from tests.utils import patch_home_dir, preserve_environ, skip_if_not_active skip_if_not_active("motherduck") + def test_motherduck_database() -> None: # set HOME env otherwise some internal components in ducdkb (HTTPS) do not initialize os.environ["HOME"] = "/tmp" @@ -20,7 +24,9 @@ def test_motherduck_database() -> None: cred.parse_native_representation("md:///?token=TOKEN") assert cred.password == "TOKEN" - config = resolve_configuration(MotherDuckClientConfiguration(dataset_name="test"), sections=("destination", "motherduck")) + config = resolve_configuration( + MotherDuckClientConfiguration(dataset_name="test"), sections=("destination", "motherduck") + ) # connect con = config.credentials.borrow_conn(read_only=False) con.sql("SHOW DATABASES") diff --git a/tests/load/filesystem/test_aws_credentials.py b/tests/load/filesystem/test_aws_credentials.py index d34bc7ed24..7a0d42eb6d 100644 --- a/tests/load/filesystem/test_aws_credentials.py +++ b/tests/load/filesystem/test_aws_credentials.py @@ -10,8 +10,8 @@ from tests.load.utils import ALL_FILESYSTEM_DRIVERS from tests.utils import preserve_environ, autouse_test_storage -if 's3' not in ALL_FILESYSTEM_DRIVERS: - pytest.skip('s3 filesystem driver not configured', allow_module_level=True) +if "s3" not in ALL_FILESYSTEM_DRIVERS: + pytest.skip("s3 filesystem driver not configured", allow_module_level=True) def test_aws_credentials_resolved_from_default(environment: Dict[str, str]) -> None: @@ -19,9 +19,9 @@ def test_aws_credentials_resolved_from_default(environment: Dict[str, str]) -> N config = resolve_configuration(AwsCredentials()) - assert config.aws_access_key_id == 'fake_access_key' - assert config.aws_secret_access_key == 'fake_secret_key' - assert config.aws_session_token == 'fake_session_token' + 
assert config.aws_access_key_id == "fake_access_key" + assert config.aws_secret_access_key == "fake_secret_key" + assert config.aws_session_token == "fake_session_token" # we do not set the profile assert config.profile_name is None @@ -43,7 +43,7 @@ def test_aws_credentials_from_botocore(environment: Dict[str, str]) -> None: import botocore.session session = botocore.session.get_session() - region_name = 'eu-central-1' # session.get_config_variable('region') + region_name = "eu-central-1" # session.get_config_variable('region') c = AwsCredentials(session) assert c.profile_name is None @@ -60,9 +60,7 @@ def test_aws_credentials_from_botocore(environment: Dict[str, str]) -> None: "token": "fake_session_token", "profile": None, "endpoint_url": None, - "client_kwargs": { - "region_name": region_name - } + "client_kwargs": {"region_name": region_name}, } c = AwsCredentials() @@ -112,18 +110,18 @@ def test_aws_credentials_for_profile(environment: Dict[str, str]) -> None: c.profile_name = "dlt-ci-user" try: c = resolve_configuration(c) - assert digest128(c.aws_access_key_id) == 'S3r3CtEf074HjqVeHKj/' + assert digest128(c.aws_access_key_id) == "S3r3CtEf074HjqVeHKj/" except botocore.exceptions.ProfileNotFound: pytest.skip("This test requires dlt-ci-user aws profile to be present") def test_aws_credentials_with_endpoint_url(environment: Dict[str, str]) -> None: set_aws_credentials_env(environment) - environment['CREDENTIALS__ENDPOINT_URL'] = 'https://123.r2.cloudflarestorage.com' + environment["CREDENTIALS__ENDPOINT_URL"] = "https://123.r2.cloudflarestorage.com" config = resolve_configuration(AwsCredentials()) - assert config.endpoint_url == 'https://123.r2.cloudflarestorage.com' + assert config.endpoint_url == "https://123.r2.cloudflarestorage.com" assert config.to_s3fs_credentials() == { "key": "fake_access_key", @@ -131,14 +129,12 @@ def test_aws_credentials_with_endpoint_url(environment: Dict[str, str]) -> None: "token": "fake_session_token", "profile": None, "endpoint_url": "https://123.r2.cloudflarestorage.com", - "client_kwargs": { - "region_name": 'eu-central-1' - } + "client_kwargs": {"region_name": "eu-central-1"}, } def set_aws_credentials_env(environment: Dict[str, str]) -> None: - environment['AWS_ACCESS_KEY_ID'] = 'fake_access_key' - environment['AWS_SECRET_ACCESS_KEY'] = 'fake_secret_key' - environment['AWS_SESSION_TOKEN'] = 'fake_session_token' - environment["AWS_DEFAULT_REGION"] = environment['REGION_NAME'] = 'eu-central-1' + environment["AWS_ACCESS_KEY_ID"] = "fake_access_key" + environment["AWS_SECRET_ACCESS_KEY"] = "fake_secret_key" + environment["AWS_SESSION_TOKEN"] = "fake_session_token" + environment["AWS_DEFAULT_REGION"] = environment["REGION_NAME"] = "eu-central-1" diff --git a/tests/load/filesystem/test_azure_credentials.py b/tests/load/filesystem/test_azure_credentials.py index b9cf10a05a..093cd6dd19 100644 --- a/tests/load/filesystem/test_azure_credentials.py +++ b/tests/load/filesystem/test_azure_credentials.py @@ -11,54 +11,57 @@ from tests.common.configuration.utils import environment from tests.utils import preserve_environ, autouse_test_storage -if 'az' not in ALL_FILESYSTEM_DRIVERS: - pytest.skip('az filesystem driver not configured', allow_module_level=True) +if "az" not in ALL_FILESYSTEM_DRIVERS: + pytest.skip("az filesystem driver not configured", allow_module_level=True) def test_azure_credentials_from_account_key(environment: Dict[str, str]) -> None: - environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'] = 'fake_account_name' - 
environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY'] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" config = resolve_configuration(AzureCredentials()) # Verify sas token is generated with correct permissions and expiry time sas_params = parse_qs(config.azure_storage_sas_token) - permissions = set(sas_params['sp'][0]) - assert permissions == {'r', 'w', 'd', 'l', 'a', 'c'} + permissions = set(sas_params["sp"][0]) + assert permissions == {"r", "w", "d", "l", "a", "c"} - exp = ensure_pendulum_datetime(sas_params['se'][0]) + exp = ensure_pendulum_datetime(sas_params["se"][0]) assert exp > pendulum.now().add(hours=23) def test_create_azure_sas_token_with_permissions(environment: Dict[str, str]) -> None: - environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'] = 'fake_account_name' - environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY'] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" - environment['CREDENTIALS__AZURE_SAS_TOKEN_PERMISSIONS'] = "rl" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_KEY"] = "QWERTYUIOPASDFGHJKLZXCVBNM1234567890" + environment["CREDENTIALS__AZURE_SAS_TOKEN_PERMISSIONS"] = "rl" config = resolve_configuration(AzureCredentials()) sas_params = parse_qs(config.azure_storage_sas_token) - permissions = set(sas_params['sp'][0]) - assert permissions == {'r', 'l'} - + permissions = set(sas_params["sp"][0]) + assert permissions == {"r", "l"} def test_azure_credentials_from_sas_token(environment: Dict[str, str]) -> None: - environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'] = 'fake_account_name' - environment['CREDENTIALS__AZURE_STORAGE_SAS_TOKEN'] = "sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890" + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" + environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] = ( + "sp=rwdlacx&se=2021-01-01T00:00:00Z&sv=2019-12-12&sr=c&sig=1234567890" + ) config = resolve_configuration(AzureCredentials()) - assert config.azure_storage_sas_token == environment['CREDENTIALS__AZURE_STORAGE_SAS_TOKEN'] - assert config.azure_storage_account_name == environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'] + assert config.azure_storage_sas_token == environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"] + assert ( + config.azure_storage_account_name == environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] + ) assert config.azure_storage_account_key is None assert config.to_adlfs_credentials() == { - 'account_name': environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'], - 'account_key': None, - 'sas_token': environment['CREDENTIALS__AZURE_STORAGE_SAS_TOKEN'], + "account_name": environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"], + "account_key": None, + "sas_token": environment["CREDENTIALS__AZURE_STORAGE_SAS_TOKEN"], } @@ -68,22 +71,24 @@ def test_azure_credentials_missing_account_name(environment: Dict[str, str]) -> ex = excinfo.value - assert 'azure_storage_account_name' in ex.fields + assert "azure_storage_account_name" in ex.fields def test_azure_credentials_from_default(environment: Dict[str, str]) -> None: - environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'] = 'fake_account_name' + environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] = "fake_account_name" config = resolve_configuration(AzureCredentials()) - assert config.azure_storage_account_name == 
environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'] + assert ( + config.azure_storage_account_name == environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"] + ) assert config.azure_storage_account_key is None assert config.azure_storage_sas_token is None # fsspec args should have anon=True when using system credentials assert config.to_adlfs_credentials() == { - 'account_name': environment['CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME'], - 'account_key': None, - 'sas_token': None, - 'anon': False + "account_name": environment["CREDENTIALS__AZURE_STORAGE_ACCOUNT_NAME"], + "account_key": None, + "sas_token": None, + "anon": False, } diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index 0055f37716..b1fc2b2d09 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -6,12 +6,16 @@ from dlt.common.utils import digest128, uniq_id from dlt.common.storages import LoadStorage, FileStorage -from dlt.destinations.impl.filesystem.filesystem import LoadFilesystemJob, FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem.filesystem import ( + LoadFilesystemJob, + FilesystemDestinationClientConfiguration, +) from tests.load.filesystem.utils import perform_load from tests.utils import clean_test_storage, init_test_logging from tests.utils import preserve_environ, autouse_test_storage + @pytest.fixture(autouse=True) def storage() -> FileStorage: return clean_test_storage(init_normalize=True, init_loader=True) @@ -24,34 +28,38 @@ def logger_autouse() -> None: NORMALIZED_FILES = [ "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl", - "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl" + "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl", ] ALL_LAYOUTS = ( None, - "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", # new default layout with schema - "{schema_name}.{table_name}.{load_id}.{file_id}.{ext}", # classic layout - "{table_name}88{load_id}-u-{file_id}.{ext}" # default layout with strange separators + "{schema_name}/{table_name}/{load_id}.{file_id}.{ext}", # new default layout with schema + "{schema_name}.{table_name}.{load_id}.{file_id}.{ext}", # classic layout + "{table_name}88{load_id}-u-{file_id}.{ext}", # default layout with strange separators ) def test_filesystem_destination_configuration() -> None: assert FilesystemDestinationClientConfiguration().fingerprint() == "" - assert FilesystemDestinationClientConfiguration(bucket_url="s3://cool").fingerprint() == digest128("s3://cool") + assert FilesystemDestinationClientConfiguration( + bucket_url="s3://cool" + ).fingerprint() == digest128("s3://cool") -@pytest.mark.parametrize('write_disposition', ('replace', 'append', 'merge')) -@pytest.mark.parametrize('layout', ALL_LAYOUTS) +@pytest.mark.parametrize("write_disposition", ("replace", "append", "merge")) +@pytest.mark.parametrize("layout", ALL_LAYOUTS) def test_successful_load(write_disposition: str, layout: str, default_buckets_env: str) -> None: """Test load is successful with an empty destination dataset""" if layout: - os.environ['DESTINATION__FILESYSTEM__LAYOUT'] = layout + os.environ["DESTINATION__FILESYSTEM__LAYOUT"] = layout else: os.environ.pop("DESTINATION__FILESYSTEM__LAYOUT", None) - dataset_name = 'test_' + uniq_id() + dataset_name = "test_" + uniq_id() - with perform_load(dataset_name, NORMALIZED_FILES, write_disposition=write_disposition) as load_info: + with perform_load( + dataset_name, 
NORMALIZED_FILES, write_disposition=write_disposition + ) as load_info: client, jobs, _, load_id = load_info layout = client.config.layout dataset_path = posixpath.join(client.fs_path, client.config.dataset_name) @@ -62,77 +70,99 @@ def test_successful_load(write_disposition: str, layout: str, default_buckets_en # Sanity check, there are jobs assert jobs for job in jobs: - assert job.state() == 'completed' + assert job.state() == "completed" job_info = LoadStorage.parse_job_file_name(job.file_name()) destination_path = posixpath.join( dataset_path, - layout.format(schema_name=client.schema.name, table_name=job_info.table_name, load_id=load_id, file_id=job_info.file_id, ext=job_info.file_format) + layout.format( + schema_name=client.schema.name, + table_name=job_info.table_name, + load_id=load_id, + file_id=job_info.file_id, + ext=job_info.file_format, + ), ) # File is created with correct filename and path assert client.fs_client.isfile(destination_path) -@pytest.mark.parametrize('layout', ALL_LAYOUTS) +@pytest.mark.parametrize("layout", ALL_LAYOUTS) def test_replace_write_disposition(layout: str, default_buckets_env: str) -> None: if layout: - os.environ['DESTINATION__FILESYSTEM__LAYOUT'] = layout + os.environ["DESTINATION__FILESYSTEM__LAYOUT"] = layout else: os.environ.pop("DESTINATION__FILESYSTEM__LAYOUT", None) - dataset_name = 'test_' + uniq_id() + dataset_name = "test_" + uniq_id() # NOTE: context manager will delete the dataset at the end so keep it open until the end - with perform_load(dataset_name, NORMALIZED_FILES, write_disposition='replace') as load_info: + with perform_load(dataset_name, NORMALIZED_FILES, write_disposition="replace") as load_info: client, _, root_path, load_id1 = load_info layout = client.config.layout # this path will be kept after replace job_2_load_1_path = posixpath.join( root_path, - LoadFilesystemJob.make_destination_filename(layout, NORMALIZED_FILES[1], client.schema.name, load_id1) + LoadFilesystemJob.make_destination_filename( + layout, NORMALIZED_FILES[1], client.schema.name, load_id1 + ), ) - with perform_load(dataset_name, [NORMALIZED_FILES[0]], write_disposition='replace') as load_info: + with perform_load( + dataset_name, [NORMALIZED_FILES[0]], write_disposition="replace" + ) as load_info: client, _, root_path, load_id2 = load_info # this one we expect to be replaced with job_1_load_2_path = posixpath.join( root_path, - LoadFilesystemJob.make_destination_filename(layout, NORMALIZED_FILES[0], client.schema.name, load_id2) + LoadFilesystemJob.make_destination_filename( + layout, NORMALIZED_FILES[0], client.schema.name, load_id2 + ), ) # First file from load1 remains, second file is replaced by load2 # assert that only these two files are in the destination folder paths = [] - for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): + for basedir, _dirs, files in client.fs_client.walk( + client.dataset_path, detail=False, refresh=True + ): for f in files: paths.append(posixpath.join(basedir, f)) ls = set(paths) assert ls == {job_2_load_1_path, job_1_load_2_path} -@pytest.mark.parametrize('layout', ALL_LAYOUTS) +@pytest.mark.parametrize("layout", ALL_LAYOUTS) def test_append_write_disposition(layout: str, default_buckets_env: str) -> None: """Run load twice with append write_disposition and assert that there are two copies of each file in destination""" if layout: - os.environ['DESTINATION__FILESYSTEM__LAYOUT'] = layout + os.environ["DESTINATION__FILESYSTEM__LAYOUT"] = layout else: 
os.environ.pop("DESTINATION__FILESYSTEM__LAYOUT", None) - dataset_name = 'test_' + uniq_id() + dataset_name = "test_" + uniq_id() # NOTE: context manager will delete the dataset at the end so keep it open until the end - with perform_load(dataset_name, NORMALIZED_FILES, write_disposition='append') as load_info: + with perform_load(dataset_name, NORMALIZED_FILES, write_disposition="append") as load_info: client, jobs1, root_path, load_id1 = load_info - with perform_load(dataset_name, NORMALIZED_FILES, write_disposition='append') as load_info: + with perform_load(dataset_name, NORMALIZED_FILES, write_disposition="append") as load_info: client, jobs2, root_path, load_id2 = load_info layout = client.config.layout expected_files = [ - LoadFilesystemJob.make_destination_filename(layout, job.file_name(), client.schema.name, load_id1) for job in jobs1 + LoadFilesystemJob.make_destination_filename( + layout, job.file_name(), client.schema.name, load_id1 + ) + for job in jobs1 ] + [ - LoadFilesystemJob.make_destination_filename(layout, job.file_name(), client.schema.name, load_id2) for job in jobs2 + LoadFilesystemJob.make_destination_filename( + layout, job.file_name(), client.schema.name, load_id2 + ) + for job in jobs2 ] expected_files = sorted([posixpath.join(root_path, fn) for fn in expected_files]) paths = [] - for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): + for basedir, _dirs, files in client.fs_client.walk( + client.dataset_path, detail=False, refresh=True + ): for f in files: paths.append(posixpath.join(basedir, f)) assert list(sorted(paths)) == expected_files diff --git a/tests/load/filesystem/test_filesystem_common.py b/tests/load/filesystem/test_filesystem_common.py index caf43ca47c..92cce62160 100644 --- a/tests/load/filesystem/test_filesystem_common.py +++ b/tests/load/filesystem/test_filesystem_common.py @@ -25,13 +25,16 @@ def test_filesystem_configuration() -> None: config = FilesystemConfiguration(bucket_url="az://root") assert config.protocol == "az" # print(config.resolve_credentials_type()) - assert config.resolve_credentials_type() == Union[AzureCredentialsWithoutDefaults, AzureCredentials] + assert ( + config.resolve_credentials_type() + == Union[AzureCredentialsWithoutDefaults, AzureCredentials] + ) # make sure that only bucket_url and credentials are there - assert dict(config) == {'bucket_url': 'az://root', 'credentials': None} + assert dict(config) == {"bucket_url": "az://root", "credentials": None} def test_filesystem_instance(all_buckets_env: str) -> None: - bucket_url = os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] + bucket_url = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] config = get_config() assert bucket_url.startswith(config.protocol) filesystem, url = fsspec_from_config(config) @@ -54,7 +57,7 @@ def test_filesystem_instance(all_buckets_env: str) -> None: @pytest.mark.parametrize("load_content", (True, False)) def test_filesystem_dict(default_buckets_env: str, load_content: bool) -> None: - bucket_url = os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] + bucket_url = os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] config = get_config() if config.protocol in ["memory", "file"]: pytest.skip(f"{config.protocol} not supported in this test") @@ -62,7 +65,9 @@ def test_filesystem_dict(default_buckets_env: str, load_content: bool) -> None: filesystem, _ = fsspec_from_config(config) # use glob to get data try: - all_file_items = list(glob_files(filesystem, posixpath.join(bucket_url, glob_folder, 
"samples"))) + all_file_items = list( + glob_files(filesystem, posixpath.join(bucket_url, glob_folder, "samples")) + ) assert_sample_files(all_file_items, filesystem, config, load_content) except NotImplementedError as ex: pytest.skip("Skipping due to " + str(ex)) @@ -74,17 +79,18 @@ def test_filesystem_instance_from_s3_endpoint(environment: Dict[str, str]) -> No E.g. when using an S3 compatible service such as Cloudflare R2 """ from s3fs import S3FileSystem - environment['DESTINATION__FILESYSTEM__BUCKET_URL'] = 's3://dummy-bucket' - environment['CREDENTIALS__ENDPOINT_URL'] = 'https://fake-s3-endpoint.example.com' - environment['CREDENTIALS__AWS_ACCESS_KEY_ID'] = 'fake-access-key' - environment['CREDENTIALS__AWS_SECRET_ACCESS_KEY'] = 'fake-secret-key' + + environment["DESTINATION__FILESYSTEM__BUCKET_URL"] = "s3://dummy-bucket" + environment["CREDENTIALS__ENDPOINT_URL"] = "https://fake-s3-endpoint.example.com" + environment["CREDENTIALS__AWS_ACCESS_KEY_ID"] = "fake-access-key" + environment["CREDENTIALS__AWS_SECRET_ACCESS_KEY"] = "fake-secret-key" config = get_config() filesystem, bucket_name = fsspec_from_config(config) assert isinstance(filesystem, S3FileSystem) - assert filesystem.endpoint_url == 'https://fake-s3-endpoint.example.com' - assert bucket_name == 'dummy-bucket' - assert filesystem.key == 'fake-access-key' - assert filesystem.secret == 'fake-secret-key' + assert filesystem.endpoint_url == "https://fake-s3-endpoint.example.com" + assert bucket_name == "dummy-bucket" + assert filesystem.key == "fake-access-key" + assert filesystem.secret == "fake-secret-key" diff --git a/tests/load/filesystem/utils.py b/tests/load/filesystem/utils.py index 8186e82c3b..d03e43bed5 100644 --- a/tests/load/filesystem/utils.py +++ b/tests/load/filesystem/utils.py @@ -16,16 +16,13 @@ def setup_loader(dataset_name: str) -> Load: destination: TDestination = filesystem() # type: ignore[assignment] config = filesystem.spec(dataset_name=dataset_name) # setup loader - with Container().injectable_context(ConfigSectionContext(sections=('filesystem',))): - return Load( - destination, - initial_client_config=config - ) + with Container().injectable_context(ConfigSectionContext(sections=("filesystem",))): + return Load(destination, initial_client_config=config) @contextmanager def perform_load( - dataset_name: str, cases: Sequence[str], write_disposition: str='append' + dataset_name: str, cases: Sequence[str], write_disposition: str = "append" ) -> Iterator[Tuple[FilesystemClient, List[LoadJob], str, str]]: load = setup_loader(dataset_name) load_id, schema = prepare_load_package(load.load_storage, cases, write_disposition) @@ -33,9 +30,9 @@ def perform_load( # for the replace disposition in the loader we truncate the tables, so do this here truncate_tables = [] - if write_disposition == 'replace': + if write_disposition == "replace": for item in cases: - parts = item.split('.') + parts = item.split(".") truncate_tables.append(parts[0]) client.initialize_storage(truncate_tables=truncate_tables) diff --git a/tests/load/mssql/test_mssql_credentials.py b/tests/load/mssql/test_mssql_credentials.py index 5428246247..540beaac28 100644 --- a/tests/load/mssql/test_mssql_credentials.py +++ b/tests/load/mssql/test_mssql_credentials.py @@ -3,22 +3,23 @@ from dlt.destinations.impl.mssql.configuration import MsSqlCredentials - def test_to_odbc_dsn() -> None: creds = resolve_configuration( - MsSqlCredentials("mssql://test_user:test_password@sql.example.com:12345/test_db?FOO=a&BAR=b") + MsSqlCredentials( + 
"mssql://test_user:test_password@sql.example.com:12345/test_db?FOO=a&BAR=b" + ) ) dsn = creds.to_odbc_dsn() - result = {k: v for k, v in (param.split('=') for param in dsn.split(";"))} + result = {k: v for k, v in (param.split("=") for param in dsn.split(";"))} assert result == { - 'DRIVER': 'ODBC Driver 18 for SQL Server', - 'SERVER': 'sql.example.com,12345', - 'DATABASE': 'test_db', - 'UID': 'test_user', - 'PWD': 'test_password', - 'FOO': 'a', - 'BAR': 'b' + "DRIVER": "ODBC Driver 18 for SQL Server", + "SERVER": "sql.example.com,12345", + "DATABASE": "test_db", + "UID": "test_user", + "PWD": "test_password", + "FOO": "a", + "BAR": "b", } diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index 114d94a20f..f7e0ce53ff 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -12,6 +12,7 @@ from tests.load.utils import TABLE_UPDATE + @pytest.fixture def schema() -> Schema: return Schema("event") @@ -20,7 +21,10 @@ def schema() -> Schema: @pytest.fixture def client(schema: Schema) -> MsSqlClient: # return client without opening connection - return MsSqlClient(schema, MsSqlClientConfiguration(dataset_name="test_" + uniq_id(), credentials=MsSqlCredentials())) + return MsSqlClient( + schema, + MsSqlClientConfiguration(dataset_name="test_" + uniq_id(), credentials=MsSqlCredentials()), + ) def test_create_table(client: MsSqlClient) -> None: diff --git a/tests/load/pipeline/conftest.py b/tests/load/pipeline/conftest.py index 76dc74a555..34227a8041 100644 --- a/tests/load/pipeline/conftest.py +++ b/tests/load/pipeline/conftest.py @@ -1,3 +1,8 @@ -from tests.utils import patch_home_dir, preserve_environ, autouse_test_storage, duckdb_pipeline_location +from tests.utils import ( + patch_home_dir, + preserve_environ, + autouse_test_storage, + duckdb_pipeline_location, +) from tests.pipeline.utils import drop_dataset_from_env from tests.load.pipeline.utils import drop_pipeline diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index 9a72536329..3a113bfd6f 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -18,13 +18,27 @@ from tests.cases import arrow_table_all_data_types, TArrowFormat -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_staging_configs=True, all_staging_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, default_staging_configs=True, all_staging_configs=True + ), + ids=lambda x: x.name, +) @pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) -def test_load_item(item_type: Literal["pandas", "table", "record_batch"], destination_config: DestinationTestConfiguration) -> None: - os.environ['NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID'] = "True" - os.environ['NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_ID'] = "True" - include_time = destination_config.destination not in ("athena", "redshift") # athena/redshift can't load TIME columns from parquet - item, records = arrow_table_all_data_types(item_type, include_json=False, include_time=include_time) +def test_load_item( + item_type: Literal["pandas", "table", "record_batch"], + destination_config: DestinationTestConfiguration, +) -> None: + os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID"] = "True" + os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_ID"] = "True" + 
include_time = destination_config.destination not in ( + "athena", + "redshift", + ) # athena/redshift can't load TIME columns from parquet + item, records = arrow_table_all_data_types( + item_type, include_json=False, include_time=include_time + ) pipeline = destination_config.setup_pipeline("arrow_" + uniq_id()) @@ -54,7 +68,6 @@ def some_data(): if isinstance(row[i], memoryview): row[i] = row[i].tobytes() - if destination_config.destination == "redshift": # Binary columns are hex formatted in results for record in records: @@ -71,7 +84,9 @@ def some_data(): for row in expected: for i in range(len(row)): if isinstance(row[i], datetime): - row[i] = reduce_pendulum_datetime_precision(row[i], pipeline.destination.capabilities().timestamp_precision) + row[i] = reduce_pendulum_datetime_precision( + row[i], pipeline.destination.capabilities().timestamp_precision + ) load_id = load_info.loads_ids[0] @@ -88,9 +103,20 @@ def some_data(): @pytest.mark.no_load # Skips drop_pipeline fixture since we don't do any loading -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_staging_configs=True, all_staging_configs=True, default_vector_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_sql_configs=True, + default_staging_configs=True, + all_staging_configs=True, + default_vector_configs=True, + ), + ids=lambda x: x.name, +) @pytest.mark.parametrize("item_type", ["table", "pandas", "record_batch"]) -def test_parquet_column_names_are_normalized(item_type: TArrowFormat, destination_config: DestinationTestConfiguration) -> None: +def test_parquet_column_names_are_normalized( + item_type: TArrowFormat, destination_config: DestinationTestConfiguration +) -> None: """Test normalizing of parquet columns in all destinations""" # Create df with column names with inconsistent naming conventions df = pd.DataFrame( @@ -102,7 +128,7 @@ def test_parquet_column_names_are_normalized(item_type: TArrowFormat, destinatio "e-MAIL", " pHone Number", "ADDRESS", - "CreatedAt" + "CreatedAt", ], ) @@ -122,19 +148,22 @@ def some_data(): # Find the extracted file norm_storage = pipeline._get_normalize_storage() - extract_files = [fn for fn in norm_storage.list_files_to_normalize_sorted() if fn.endswith(".parquet")] + extract_files = [ + fn for fn in norm_storage.list_files_to_normalize_sorted() if fn.endswith(".parquet") + ] assert len(extract_files) == 1 # Normalized column names according to schema naming convention - expected_column_names = [pipeline.default_schema.naming.normalize_path(col) for col in df.columns] + expected_column_names = [ + pipeline.default_schema.naming.normalize_path(col) for col in df.columns + ] new_table_name = pipeline.default_schema.naming.normalize_table_identifier("some_data") schema_columns = pipeline.default_schema.get_table_columns(new_table_name) # Schema columns are normalized - assert [c['name'] for c in schema_columns.values()] == expected_column_names - + assert [c["name"] for c in schema_columns.values()] == expected_column_names - with norm_storage.storage.open_file(extract_files[0], 'rb') as f: + with norm_storage.storage.open_file(extract_files[0], "rb") as f: result_tbl = pa.parquet.read_table(f) # Parquet schema is written with normalized column names diff --git a/tests/load/pipeline/test_athena.py b/tests/load/pipeline/test_athena.py index dd5baae73b..3da081d881 100644 --- a/tests/load/pipeline/test_athena.py +++ b/tests/load/pipeline/test_athena.py @@ -5,16 +5,19 
@@ import dlt from dlt.common import pendulum from dlt.common.utils import uniq_id -from tests.load.pipeline.utils import load_table_counts +from tests.load.pipeline.utils import load_table_counts from tests.cases import table_update_and_row, assert_all_data_types_row from tests.pipeline.utils import assert_load_info from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["athena"]), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["athena"]), + ids=lambda x: x.name, +) def test_athena_destinations(destination_config: DestinationTestConfiguration) -> None: - pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) @dlt.resource(name="items", write_disposition="append") @@ -22,19 +25,15 @@ def items(): yield { "id": 1, "name": "item", - "sub_items": [{ - "id": 101, - "name": "sub item 101" - },{ - "id": 101, - "name": "sub item 102" - }] + "sub_items": [{"id": 101, "name": "sub item 101"}, {"id": 101, "name": "sub item 102"}], } pipeline.run(items) # see if we have athena tables with items - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values() ]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values()] + ) assert table_counts["items"] == 1 assert table_counts["items__sub_items"] == 2 assert table_counts["_dlt_loads"] == 1 @@ -46,25 +45,37 @@ def items2(): "id": 1, "name": "item", "new_field": "hello", - "sub_items": [{ - "id": 101, - "name": "sub item 101", - "other_new_field": "hello 101", - },{ - "id": 101, - "name": "sub item 102", - "other_new_field": "hello 102", - }] + "sub_items": [ + { + "id": 101, + "name": "sub item 101", + "other_new_field": "hello 101", + }, + { + "id": 101, + "name": "sub item 102", + "other_new_field": "hello 102", + }, + ], } + pipeline.run(items2) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema._schema_tables.values()] + ) assert table_counts["items"] == 2 assert table_counts["items__sub_items"] == 4 assert table_counts["_dlt_loads"] == 2 -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["athena"]), ids=lambda x: x.name) -def test_athena_all_datatypes_and_timestamps(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["athena"]), + ids=lambda x: x.name, +) +def test_athena_all_datatypes_and_timestamps( + destination_config: DestinationTestConfiguration, +) -> None: pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) # TIME is not supported @@ -74,7 +85,7 @@ def test_athena_all_datatypes_and_timestamps(destination_config: DestinationTest @dlt.resource(table_name="data_types", write_disposition="append", columns=column_schemas) def my_resource() -> Iterator[Any]: nonlocal data_types - yield [data_types]*10 + yield [data_types] * 10 @dlt.source(max_table_nesting=0) def my_source() -> Any: @@ -89,42 +100,69 @@ def my_source() -> Any: db_row = list(db_rows[0]) # content must equal assert_all_data_types_row( - db_row[:-2], 
parse_complex_strings=True, timestamp_precision=sql_client.capabilities.timestamp_precision, schema=column_schemas + db_row[:-2], + parse_complex_strings=True, + timestamp_precision=sql_client.capabilities.timestamp_precision, + schema=column_schemas, ) # now let's query the data with timestamps and dates. # https://docs.aws.amazon.com/athena/latest/ug/engine-versions-reference-0003.html#engine-versions-reference-0003-timestamp-changes # use string representation TIMESTAMP(2) - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col4 = TIMESTAMP '2022-05-23 13:26:45.176'") + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col4 = TIMESTAMP '2022-05-23 13:26:45.176'" + ) assert len(db_rows) == 10 # no rows - TIMESTAMP(6) not supported - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col4 = TIMESTAMP '2022-05-23 13:26:45.176145'") + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col4 = TIMESTAMP '2022-05-23 13:26:45.176145'" + ) assert len(db_rows) == 0 # use pendulum # that will pass - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col4 = %s", pendulum.datetime(2022, 5, 23, 13, 26, 45, 176000)) + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col4 = %s", + pendulum.datetime(2022, 5, 23, 13, 26, 45, 176000), + ) assert len(db_rows) == 10 # that will return empty list - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col4 = %s", pendulum.datetime(2022, 5, 23, 13, 26, 45, 176145)) + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col4 = %s", + pendulum.datetime(2022, 5, 23, 13, 26, 45, 176145), + ) assert len(db_rows) == 0 # use datetime - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col4 = %s", datetime.datetime(2022, 5, 23, 13, 26, 45, 176000)) + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col4 = %s", + datetime.datetime(2022, 5, 23, 13, 26, 45, 176000), + ) assert len(db_rows) == 10 - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col4 = %s", datetime.datetime(2022, 5, 23, 13, 26, 45, 176145)) + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col4 = %s", + datetime.datetime(2022, 5, 23, 13, 26, 45, 176145), + ) assert len(db_rows) == 0 # check date db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col10 = DATE '2023-02-27'") assert len(db_rows) == 10 - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col10 = %s", pendulum.date(2023, 2, 27)) + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col10 = %s", pendulum.date(2023, 2, 27) + ) assert len(db_rows) == 10 - db_rows = sql_client.execute_sql("SELECT * FROM data_types WHERE col10 = %s", datetime.date(2023, 2, 27)) + db_rows = sql_client.execute_sql( + "SELECT * FROM data_types WHERE col10 = %s", datetime.date(2023, 2, 27) + ) assert len(db_rows) == 10 -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["athena"]), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["athena"]), + ids=lambda x: x.name, +) def test_athena_blocks_time_column(destination_config: DestinationTestConfiguration) -> None: pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) @@ -134,7 +172,7 @@ def test_athena_blocks_time_column(destination_config: DestinationTestConfigurat @dlt.resource(table_name="data_types", 
write_disposition="append", columns=column_schemas) def my_resource() -> Iterator[Any]: nonlocal data_types - yield [data_types]*10 + yield [data_types] * 10 @dlt.source(max_table_nesting=0) def my_source() -> Any: @@ -144,4 +182,7 @@ def my_source() -> Any: assert info.has_failed_jobs - assert "Athena cannot load TIME columns from parquet tables" in info.load_packages[0].jobs['failed_jobs'][0].failed_message + assert ( + "Athena cannot load TIME columns from parquet tables" + in info.load_packages[0].jobs["failed_jobs"][0].failed_message + ) diff --git a/tests/load/pipeline/test_dbt_helper.py b/tests/load/pipeline/test_dbt_helper.py index 37c1f0c607..11f59d5276 100644 --- a/tests/load/pipeline/test_dbt_helper.py +++ b/tests/load/pipeline/test_dbt_helper.py @@ -27,10 +27,16 @@ def dbt_venv() -> Iterator[Venv]: yield venv -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_run_jaffle_package(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def test_run_jaffle_package( + destination_config: DestinationTestConfiguration, dbt_venv: Venv +) -> None: if destination_config.destination == "athena": - pytest.skip("dbt-athena requires database to be created and we don't do it in case of Jaffle") + pytest.skip( + "dbt-athena requires database to be created and we don't do it in case of Jaffle" + ) pipeline = destination_config.setup_pipeline("jaffle_jaffle", full_refresh=True) # get runner, pass the env from fixture dbt = dlt.dbt.package(pipeline, "https://github.com/dbt-labs/jaffle_shop.git", venv=dbt_venv) @@ -55,16 +61,21 @@ def test_run_jaffle_package(destination_config: DestinationTestConfiguration, db assert len(orders) == 99 -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: from docs.examples.chess.chess import chess + if not destination_config.supports_dbt: pytest.skip("dbt is not supported for this destination configuration") # provide chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" - pipeline = destination_config.setup_pipeline("chess_games", dataset_name="chess_dbt_test", full_refresh=True) + pipeline = destination_config.setup_pipeline( + "chess_games", dataset_name="chess_dbt_test", full_refresh=True + ) assert pipeline.default_schema_name is None # get the runner for the "dbt_transform" package transforms = dlt.dbt.package(pipeline, "docs/examples/chess/dbt_transform", venv=dbt_venv) @@ -81,29 +92,44 @@ def test_run_chess_dbt(destination_config: DestinationTestConfiguration, dbt_ven transforms.run_all(source_tests_selector="source:*") # run all the tests transforms.test() - load_ids = select_data(pipeline, "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status") + load_ids = select_data( + pipeline, "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status" + ) assert len(load_ids) == 2 - view_player_games = select_data(pipeline, "SELECT * FROM view_player_games ORDER BY username, uuid") + view_player_games = select_data( + pipeline, "SELECT * FROM view_player_games ORDER BY username, uuid" + ) assert len(view_player_games) > 0 
# run again transforms.run() # no new load ids - no new data in view table - new_load_ids = select_data(pipeline, "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status") - new_view_player_games = select_data(pipeline, "SELECT * FROM view_player_games ORDER BY username, uuid") + new_load_ids = select_data( + pipeline, "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status" + ) + new_view_player_games = select_data( + pipeline, "SELECT * FROM view_player_games ORDER BY username, uuid" + ) assert load_ids == new_load_ids assert view_player_games == new_view_player_games -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_run_chess_dbt_to_other_dataset(destination_config: DestinationTestConfiguration, dbt_venv: Venv) -> None: +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def test_run_chess_dbt_to_other_dataset( + destination_config: DestinationTestConfiguration, dbt_venv: Venv +) -> None: from docs.examples.chess.chess import chess + if not destination_config.supports_dbt: pytest.skip("dbt is not supported for this destination configuration") # provide chess url via environ os.environ["CHESS_URL"] = "https://api.chess.com/pub/" - pipeline = destination_config.setup_pipeline("chess_games", dataset_name="chess_dbt_test", full_refresh=True) + pipeline = destination_config.setup_pipeline( + "chess_games", dataset_name="chess_dbt_test", full_refresh=True + ) # load each schema in separate dataset pipeline.config.use_single_dataset = False # assert pipeline.default_schema_name is None @@ -126,12 +152,18 @@ def test_run_chess_dbt_to_other_dataset(destination_config: DestinationTestConfi # run tests on destination dataset where transformations actually are transforms.test(destination_dataset_name=info.dataset_name + "_" + test_suffix) # get load ids from the source dataset - load_ids = select_data(pipeline, "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status") + load_ids = select_data( + pipeline, "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status" + ) assert len(load_ids) == 1 # status is 0, no more entries assert load_ids[0][2] == 0 # get from destination dataset - load_ids = select_data(pipeline, "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status", schema_name=test_suffix) + load_ids = select_data( + pipeline, + "SELECT load_id, schema_name, status FROM _dlt_loads ORDER BY status", + schema_name=test_suffix, + ) # TODO: the package is not finished, both results should be here assert len(load_ids) == 1 # status is 1, no more entries diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 4354460374..cd18454d7c 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -10,7 +10,11 @@ from dlt.common.utils import uniq_id from dlt.pipeline import helpers, state_sync, Pipeline from dlt.load import Load -from dlt.pipeline.exceptions import PipelineHasPendingDataException, PipelineNeverRan, PipelineStepFailed +from dlt.pipeline.exceptions import ( + PipelineHasPendingDataException, + PipelineNeverRan, + PipelineStepFailed, +) from dlt.destinations.job_client_impl import SqlJobClientBase from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @@ -20,40 +24,46 @@ def _attach(pipeline: Pipeline) -> Pipeline: return dlt.attach(pipeline.pipeline_name, pipeline.pipelines_dir) 
-@dlt.source(section='droppable', name='droppable') +@dlt.source(section="droppable", name="droppable") def droppable_source() -> List[DltResource]: @dlt.resource - def droppable_a(a: dlt.sources.incremental[int]=dlt.sources.incremental('a', 0)) -> Iterator[Dict[str, Any]]: + def droppable_a( + a: dlt.sources.incremental[int] = dlt.sources.incremental("a", 0) + ) -> Iterator[Dict[str, Any]]: yield dict(a=1, b=2, c=3) yield dict(a=4, b=23, c=24) - @dlt.resource - def droppable_b(asd: dlt.sources.incremental[int]=dlt.sources.incremental('asd', 0)) -> Iterator[Dict[str, Any]]: + def droppable_b( + asd: dlt.sources.incremental[int] = dlt.sources.incremental("asd", 0) + ) -> Iterator[Dict[str, Any]]: # Child table yield dict(asd=2323, qe=555, items=[dict(m=1, n=2), dict(m=3, n=4)]) - @dlt.resource - def droppable_c(qe: dlt.sources.incremental[int] = dlt.sources.incremental('qe')) -> Iterator[Dict[str, Any]]: + def droppable_c( + qe: dlt.sources.incremental[int] = dlt.sources.incremental("qe"), + ) -> Iterator[Dict[str, Any]]: # Grandchild table - yield dict(asdasd=2424, qe=111, items=[ - dict(k=2, r=2, labels=[dict(name='abc'), dict(name='www')]) - ]) + yield dict( + asdasd=2424, qe=111, items=[dict(k=2, r=2, labels=[dict(name="abc"), dict(name="www")])] + ) @dlt.resource - def droppable_d(o: dlt.sources.incremental[int] = dlt.sources.incremental('o')) -> Iterator[List[Dict[str, Any]]]: - dlt.state()['data_from_d'] = {'foo1': {'bar': 1}, 'foo2': {'bar': 2}} + def droppable_d( + o: dlt.sources.incremental[int] = dlt.sources.incremental("o"), + ) -> Iterator[List[Dict[str, Any]]]: + dlt.state()["data_from_d"] = {"foo1": {"bar": 1}, "foo2": {"bar": 2}} yield [dict(o=55), dict(o=22)] return [droppable_a(), droppable_b(), droppable_c(), droppable_d()] RESOURCE_TABLES = dict( - droppable_a=['droppable_a'], - droppable_b=['droppable_b', 'droppable_b__items'], - droppable_c=['droppable_c', 'droppable_c__items', 'droppable_c__items__labels'], - droppable_d=['droppable_d'] + droppable_a=["droppable_a"], + droppable_b=["droppable_b", "droppable_b__items"], + droppable_c=["droppable_c", "droppable_c__items", "droppable_c__items__labels"], + droppable_d=["droppable_d"], ) @@ -61,12 +71,13 @@ def assert_dropped_resources(pipeline: Pipeline, resources: List[str]) -> None: assert_dropped_resource_tables(pipeline, resources) assert_dropped_resource_states(pipeline, resources) + def assert_dropped_resource_tables(pipeline: Pipeline, resources: List[str]) -> None: # Verify only requested resource tables are removed from pipeline schema all_tables = set(chain.from_iterable(RESOURCE_TABLES.values())) dropped_tables = set(chain.from_iterable(RESOURCE_TABLES[r] for r in resources)) expected_tables = all_tables - dropped_tables - result_tables = set(t['name'] for t in pipeline.default_schema.data_tables()) + result_tables = set(t["name"] for t in pipeline.default_schema.data_tables()) assert result_tables == expected_tables # Verify requested tables are dropped from destination @@ -86,8 +97,8 @@ def assert_dropped_resource_states(pipeline: Pipeline, resources: List[str]) -> # Verify only requested resource keys are removed from state all_resources = set(RESOURCE_TABLES.keys()) expected_keys = all_resources - set(resources) - sources_state = pipeline.state['sources'] - result_keys = set(sources_state['droppable']['resources'].keys()) + sources_state = pipeline.state["sources"] + result_keys = set(sources_state["droppable"]["resources"].keys()) assert result_keys == expected_keys @@ -97,136 +108,158 @@ def 
assert_destination_state_loaded(pipeline: Pipeline) -> None: with pipeline.destination_client() as client: # type: ignore[assignment] destination_state = state_sync.load_state_from_destination(pipeline.pipeline_name, client) pipeline_state = dict(pipeline.state) - del pipeline_state['_local'] + del pipeline_state["_local"] assert pipeline_state == destination_state -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_drop_command_resources_and_state(destination_config: DestinationTestConfiguration) -> None: """Test the drop command with resource and state path options and verify correct data is deleted from destination and locally""" source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) attached = _attach(pipeline) - helpers.drop(attached, resources=['droppable_c', 'droppable_d'], state_paths='data_from_d.*.bar') + helpers.drop( + attached, resources=["droppable_c", "droppable_d"], state_paths="data_from_d.*.bar" + ) attached = _attach(pipeline) - assert_dropped_resources(attached, ['droppable_c', 'droppable_d']) + assert_dropped_resources(attached, ["droppable_c", "droppable_d"]) # Verify extra json paths are removed from state - sources_state = pipeline.state['sources'] - assert sources_state['droppable']['data_from_d'] == {'foo1': {}, 'foo2': {}} + sources_state = pipeline.state["sources"] + assert sources_state["droppable"]["data_from_d"] == {"foo1": {}, "foo2": {}} assert_destination_state_loaded(pipeline) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_drop_command_only_state(destination_config: DestinationTestConfiguration) -> None: """Test the drop command with resource and state path options and verify correct data is deleted from destination and locally""" source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) attached = _attach(pipeline) - helpers.drop(attached, state_paths='data_from_d.*.bar') + helpers.drop(attached, state_paths="data_from_d.*.bar") attached = _attach(pipeline) assert_dropped_resources(attached, []) # Verify extra json paths are removed from state - sources_state = pipeline.state['sources'] - assert sources_state['droppable']['data_from_d'] == {'foo1': {}, 'foo2': {}} + sources_state = pipeline.state["sources"] + assert sources_state["droppable"]["data_from_d"] == {"foo1": {}, "foo2": {}} assert_destination_state_loaded(pipeline) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_drop_destination_tables_fails(destination_config: DestinationTestConfiguration) -> None: """Fail on drop tables. 
Command runs again.""" source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) attached = _attach(pipeline) - with mock.patch.object(helpers.DropCommand, '_drop_destination_tables', side_effect=RuntimeError("Something went wrong")): + with mock.patch.object( + helpers.DropCommand, + "_drop_destination_tables", + side_effect=RuntimeError("Something went wrong"), + ): with pytest.raises(RuntimeError): - helpers.drop(attached, resources=('droppable_a', 'droppable_b')) + helpers.drop(attached, resources=("droppable_a", "droppable_b")) attached = _attach(pipeline) - helpers.drop(attached, resources=('droppable_a', 'droppable_b')) + helpers.drop(attached, resources=("droppable_a", "droppable_b")) - assert_dropped_resources(attached, ['droppable_a', 'droppable_b']) + assert_dropped_resources(attached, ["droppable_a", "droppable_b"]) assert_destination_state_loaded(attached) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration) -> None: """Fail directly after drop tables. Command runs again ignoring destination tables missing.""" source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) attached = _attach(pipeline) - with mock.patch.object(helpers.DropCommand, '_drop_state_keys', side_effect=RuntimeError("Something went wrong")): + with mock.patch.object( + helpers.DropCommand, "_drop_state_keys", side_effect=RuntimeError("Something went wrong") + ): with pytest.raises(RuntimeError): - helpers.drop(attached, resources=('droppable_a', 'droppable_b')) + helpers.drop(attached, resources=("droppable_a", "droppable_b")) attached = _attach(pipeline) - helpers.drop(attached, resources=('droppable_a', 'droppable_b')) + helpers.drop(attached, resources=("droppable_a", "droppable_b")) - assert_dropped_resources(attached, ['droppable_a', 'droppable_b']) + assert_dropped_resources(attached, ["droppable_a", "droppable_b"]) assert_destination_state_loaded(attached) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_load_step_fails(destination_config: DestinationTestConfiguration) -> None: """Test idempotence. pipeline.load() fails. 
Command can be run again successfully""" source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) attached = _attach(pipeline) - with mock.patch.object(Load, 'run', side_effect=RuntimeError("Something went wrong")): + with mock.patch.object(Load, "run", side_effect=RuntimeError("Something went wrong")): with pytest.raises(PipelineStepFailed) as e: - helpers.drop(attached, resources=('droppable_a', 'droppable_b')) + helpers.drop(attached, resources=("droppable_a", "droppable_b")) assert isinstance(e.value.exception, RuntimeError) attached = _attach(pipeline) - helpers.drop(attached, resources=('droppable_a', 'droppable_b')) + helpers.drop(attached, resources=("droppable_a", "droppable_b")) - assert_dropped_resources(attached, ['droppable_a', 'droppable_b']) + assert_dropped_resources(attached, ["droppable_a", "droppable_b"]) assert_destination_state_loaded(attached) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_resource_regex(destination_config: DestinationTestConfiguration) -> None: source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) attached = _attach(pipeline) - helpers.drop(attached, resources=['re:.+_b', 're:.+_a']) + helpers.drop(attached, resources=["re:.+_b", "re:.+_a"]) attached = _attach(pipeline) - assert_dropped_resources(attached, ['droppable_a', 'droppable_b']) + assert_dropped_resources(attached, ["droppable_a", "droppable_b"]) assert_destination_state_loaded(attached) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_drop_nothing(destination_config: DestinationTestConfiguration) -> None: """No resources, no state keys. Nothing is changed.""" source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) attached = _attach(pipeline) @@ -238,13 +271,17 @@ def test_drop_nothing(destination_config: DestinationTestConfiguration) -> None: assert previous_state == attached.state -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_drop_all_flag(destination_config: DestinationTestConfiguration) -> None: """Using drop_all flag. 
Destination dataset and all local state is deleted""" source = droppable_source() - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(source) - dlt_tables = [t['name'] for t in pipeline.default_schema.dlt_tables()] # Original _dlt tables to check for + dlt_tables = [ + t["name"] for t in pipeline.default_schema.dlt_tables() + ] # Original _dlt tables to check for attached = _attach(pipeline) @@ -261,15 +298,17 @@ def test_drop_all_flag(destination_config: DestinationTestConfiguration) -> None assert exists -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_run_pipeline_after_partial_drop(destination_config: DestinationTestConfiguration) -> None: """Pipeline can be run again after dropping some resources""" - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(droppable_source()) attached = _attach(pipeline) - helpers.drop(attached, resources='droppable_a') + helpers.drop(attached, resources="droppable_a") attached = _attach(pipeline) @@ -278,30 +317,32 @@ def test_run_pipeline_after_partial_drop(destination_config: DestinationTestConf attached.load(raise_on_failed_jobs=True) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_drop_state_only(destination_config: DestinationTestConfiguration) -> None: """Pipeline can be run again after dropping some resources""" - pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) + pipeline = destination_config.setup_pipeline("drop_test_" + uniq_id(), full_refresh=True) pipeline.run(droppable_source()) attached = _attach(pipeline) - helpers.drop(attached, resources=('droppable_a', 'droppable_b'), state_only=True) + helpers.drop(attached, resources=("droppable_a", "droppable_b"), state_only=True) attached = _attach(pipeline) assert_dropped_resource_tables(attached, []) # No tables dropped - assert_dropped_resource_states(attached, ['droppable_a', 'droppable_b']) + assert_dropped_resource_states(attached, ["droppable_a", "droppable_b"]) assert_destination_state_loaded(attached) def test_drop_first_run_and_pending_packages() -> None: """Attempts to drop before pipeline runs and when partial loads happen""" - pipeline = dlt.pipeline('drop_test_' + uniq_id(), destination="dummy") + pipeline = dlt.pipeline("drop_test_" + uniq_id(), destination="dummy") with pytest.raises(PipelineNeverRan): helpers.drop(pipeline, "droppable_a") os.environ["COMPLETED_PROB"] = "1.0" pipeline.run(droppable_source().with_resources("droppable_a")) pipeline.extract(droppable_source().with_resources("droppable_b")) with pytest.raises(PipelineHasPendingDataException): - helpers.drop(pipeline, "droppable_a") \ No newline at end of file + helpers.drop(pipeline, "droppable_a") diff --git a/tests/load/pipeline/test_duckdb.py b/tests/load/pipeline/test_duckdb.py index c71ac37a81..6064392976 100644 --- a/tests/load/pipeline/test_duckdb.py +++ b/tests/load/pipeline/test_duckdb.py @@ 
-6,10 +6,18 @@ from dlt.pipeline.exceptions import PipelineStepFailed from tests.pipeline.utils import airtable_emojis -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration, load_table_counts - - -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb"]), ids=lambda x: x.name) +from tests.load.pipeline.utils import ( + destinations_configs, + DestinationTestConfiguration, + load_table_counts, +) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb"]), + ids=lambda x: x.name, +) def test_duck_case_names(destination_config: DestinationTestConfiguration) -> None: # we want to have nice tables # dlt.config["schema.naming"] = "duck_case" @@ -18,14 +26,16 @@ def test_duck_case_names(destination_config: DestinationTestConfiguration) -> No # create tables and columns with emojis and other special characters pipeline.run(airtable_emojis().with_resources("📆 Schedule", "🦚Peacock", "🦚WidePeacock")) pipeline.run([{"🐾Feet": 2, "1+1": "two", "\nhey": "value"}], table_name="🦚Peacocks🦚") - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts == { "📆 Schedule": 3, "🦚Peacock": 1, - '🦚Peacock__peacock': 3, - '🦚Peacocks🦚': 1, - '🦚WidePeacock': 1, - '🦚WidePeacock__peacock': 3 + "🦚Peacock__peacock": 3, + "🦚Peacocks🦚": 1, + "🦚WidePeacock": 1, + "🦚WidePeacock__peacock": 3, } # this will fail - duckdb preserves case but is case insensitive when comparing identifiers @@ -38,5 +48,3 @@ def test_duck_case_names(destination_config: DestinationTestConfiguration) -> No with client.execute_query("DESCRIBE 🦚peacocks🦚;") as q: tables = q.df() assert tables["column_name"].tolist() == ["🐾Feet", "1+1", "hey", "_dlt_load_id", "_dlt_id"] - - diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index dce65bc8d7..8fc4adc0c3 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -12,12 +12,16 @@ skip_if_not_active("filesystem") -def assert_file_matches(layout: str, job: LoadJobInfo, load_id: str, client: FilesystemClient) -> None: +def assert_file_matches( + layout: str, job: LoadJobInfo, load_id: str, client: FilesystemClient +) -> None: """Verify file contents of load job are identical to the corresponding file in destination""" local_path = Path(job.file_path) filename = local_path.name - destination_fn = LoadFilesystemJob.make_destination_filename(layout, filename, client.schema.name, load_id) + destination_fn = LoadFilesystemJob.make_destination_filename( + layout, filename, client.schema.name, load_id + ) destination_path = posixpath.join(client.dataset_path, destination_fn) assert local_path.read_bytes() == client.fs_client.read_bytes(destination_path) @@ -29,11 +33,15 @@ def test_pipeline_merge_write_disposition(default_buckets_env: str) -> None: """ import pyarrow.parquet as pq # Module is evaluated by other tests - pipeline = dlt.pipeline(pipeline_name='test_' + uniq_id(), destination="filesystem", dataset_name='test_' + uniq_id()) + pipeline = dlt.pipeline( + pipeline_name="test_" + uniq_id(), + destination="filesystem", + dataset_name="test_" + uniq_id(), + ) - @dlt.resource(primary_key='id') + @dlt.resource(primary_key="id") def some_data(): - yield [{'id': 1}, 
{'id': 2}, {'id': 3}] + yield [{"id": 1}, {"id": 2}, {"id": 3}] @dlt.resource def other_data(): @@ -43,8 +51,8 @@ def other_data(): def some_source(): return [some_data(), other_data()] - info1 = pipeline.run(some_source(), write_disposition='merge') - info2 = pipeline.run(some_source(), write_disposition='merge') + info1 = pipeline.run(some_source(), write_disposition="merge") + info2 = pipeline.run(some_source(), write_disposition="merge") client: FilesystemClient = pipeline.destination_client() # type: ignore[assignment] layout = client.config.layout @@ -71,10 +79,9 @@ def some_source(): # Verify file contents assert info2.load_packages for pkg in info2.load_packages: - assert pkg.jobs['completed_jobs'] - for job in pkg.jobs['completed_jobs']: - assert_file_matches(layout, job, pkg.load_id, client) - + assert pkg.jobs["completed_jobs"] + for job in pkg.jobs["completed_jobs"]: + assert_file_matches(layout, job, pkg.load_id, client) complete_fn = f"{client.schema.name}.{LOADS_TABLE_NAME}.%s" @@ -83,7 +90,7 @@ def some_source(): assert client.fs_client.isfile(posixpath.join(client.dataset_path, complete_fn % load_id2)) # Force replace - pipeline.run(some_source(), write_disposition='replace') + pipeline.run(some_source(), write_disposition="replace") append_files = client.fs_client.ls(append_glob, detail=False, refresh=True) replace_files = client.fs_client.ls(replace_glob, detail=False, refresh=True) assert len(append_files) == 1 @@ -91,16 +98,19 @@ def some_source(): def test_pipeline_parquet_filesystem_destination() -> None: - import pyarrow.parquet as pq # Module is evaluated by other tests # store locally - os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = "file://_storage" - pipeline = dlt.pipeline(pipeline_name='parquet_test_' + uniq_id(), destination="filesystem", dataset_name='parquet_test_' + uniq_id()) - - @dlt.resource(primary_key='id') + os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = "file://_storage" + pipeline = dlt.pipeline( + pipeline_name="parquet_test_" + uniq_id(), + destination="filesystem", + dataset_name="parquet_test_" + uniq_id(), + ) + + @dlt.resource(primary_key="id") def some_data(): - yield [{'id': 1}, {'id': 2}, {'id': 3}] + yield [{"id": 1}, {"id": 2}, {"id": 3}] @dlt.resource def other_data(): @@ -119,8 +129,8 @@ def some_source(): assert len(package_info.jobs["completed_jobs"]) == 3 client: FilesystemClient = pipeline.destination_client() # type: ignore[assignment] - some_data_glob = posixpath.join(client.dataset_path, 'some_data/*') - other_data_glob = posixpath.join(client.dataset_path, 'other_data/*') + some_data_glob = posixpath.join(client.dataset_path, "some_data/*") + other_data_glob = posixpath.join(client.dataset_path, "other_data/*") some_data_files = client.fs_client.glob(some_data_glob) other_data_files = client.fs_client.glob(other_data_glob) diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index 4e8d1f9049..0714ac333d 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -25,14 +25,20 @@ # ACTIVE_DESTINATIONS += ["motherduck"] -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_merge_on_keys_in_schema(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("eth_2", full_refresh=True) 
with open("tests/common/cases/schemas/eth/ethereum_schema_v5.yml", "r", encoding="utf-8") as f: schema = dlt.Schema.from_dict(yaml.safe_load(f)) - with open("tests/normalize/cases/ethereum.blocks.9c1d9b504ea240a482b007788d5cd61c_2.json", "r", encoding="utf-8") as f: + with open( + "tests/normalize/cases/ethereum.blocks.9c1d9b504ea240a482b007788d5cd61c_2.json", + "r", + encoding="utf-8", + ) as f: data = json.load(f) # take only the first block. the first block does not have uncles so this table should not be created and merged @@ -42,7 +48,10 @@ def test_merge_on_keys_in_schema(destination_config: DestinationTestConfiguratio # we load a single block assert eth_1_counts["blocks"] == 1 # check root key propagation - assert p.default_schema.tables["blocks__transactions"]["columns"]["_dlt_root_id"]["root_key"] is True + assert ( + p.default_schema.tables["blocks__transactions"]["columns"]["_dlt_root_id"]["root_key"] + is True + ) # now we load the whole dataset. blocks should be created which adds columns to blocks # if the table would be created before the whole load would fail because new columns have hints info = p.run(data, table_name="blocks", write_disposition="merge", schema=schema) @@ -59,11 +68,15 @@ def test_merge_on_keys_in_schema(destination_config: DestinationTestConfiguratio assert eth_2_counts == eth_3_counts -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_merge_on_ad_hoc_primary_key(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("github_1", full_refresh=True) - with open("tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8") as f: + with open( + "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8" + ) as f: data = json.load(f) # note: NodeId will be normalized to "node_id" which exists in the schema info = p.run(data[:17], table_name="issues", write_disposition="merge", primary_key="NodeId") @@ -89,17 +102,27 @@ def test_merge_on_ad_hoc_primary_key(destination_config: DestinationTestConfigur @dlt.source(root_key=True) def github(): - - @dlt.resource(table_name="issues", write_disposition="merge", primary_key="id", merge_key=("node_id", "url")) + @dlt.resource( + table_name="issues", + write_disposition="merge", + primary_key="id", + merge_key=("node_id", "url"), + ) def load_issues(): - with open("tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8") as f: + with open( + "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8" + ) as f: yield from json.load(f) return load_issues -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_merge_source_compound_keys_and_changes(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def test_merge_source_compound_keys_and_changes( + destination_config: DestinationTestConfiguration, +) -> None: p = destination_config.setup_pipeline("github_3", full_refresh=True) info = p.run(github()) @@ -108,9 +131,18 @@ def test_merge_source_compound_keys_and_changes(destination_config: DestinationT # 100 issues total assert github_1_counts["issues"] == 100 # check keys created - assert 
p.default_schema.tables["issues"]["columns"]["node_id"].items() > {"merge_key": True, "data_type": "text", "nullable": False}.items() - assert p.default_schema.tables["issues"]["columns"]["url"].items() > {"merge_key": True, "data_type": "text", "nullable": False}.items() - assert p.default_schema.tables["issues"]["columns"]["id"].items() > {"primary_key": True, "data_type": "bigint", "nullable": False}.items() + assert ( + p.default_schema.tables["issues"]["columns"]["node_id"].items() + > {"merge_key": True, "data_type": "text", "nullable": False}.items() + ) + assert ( + p.default_schema.tables["issues"]["columns"]["url"].items() + > {"merge_key": True, "data_type": "text", "nullable": False}.items() + ) + assert ( + p.default_schema.tables["issues"]["columns"]["id"].items() + > {"primary_key": True, "data_type": "bigint", "nullable": False}.items() + ) # append load_issues resource info = p.run(github().load_issues, write_disposition="append") @@ -118,10 +150,10 @@ def test_merge_source_compound_keys_and_changes(destination_config: DestinationT assert p.default_schema.tables["issues"]["write_disposition"] == "append" # the counts of all tables must be double github_2_counts = load_table_counts(p, *[t["name"] for t in p.default_schema.data_tables()]) - assert {k:v*2 for k, v in github_1_counts.items()} == github_2_counts + assert {k: v * 2 for k, v in github_1_counts.items()} == github_2_counts # now replace all resources - info = p.run(github(), write_disposition="replace" ) + info = p.run(github(), write_disposition="replace") assert_load_info(info) assert p.default_schema.tables["issues"]["write_disposition"] == "replace" # assert p.default_schema.tables["issues__labels"]["write_disposition"] == "replace" @@ -130,7 +162,9 @@ def test_merge_source_compound_keys_and_changes(destination_config: DestinationT assert github_1_counts == github_3_counts -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_merge_no_child_tables(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("github_3", full_refresh=True) github_data = github() @@ -161,7 +195,9 @@ def test_merge_no_child_tables(destination_config: DestinationTestConfiguration) assert github_2_counts["issues"] == 100 if destination_config.supports_merge else 115 -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_merge_no_merge_keys(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("github_3", full_refresh=True) github_data = github() @@ -187,19 +223,24 @@ def test_merge_no_merge_keys(destination_config: DestinationTestConfiguration) - assert github_1_counts["issues"] == 10 if destination_config.supports_merge else 100 - 45 -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_merge_keys_non_existing_columns(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("github_3", full_refresh=True) github_data = github() # set keys 
names that do not exist in the data - github_data.load_issues.apply_hints(merge_key=("mA1", "Ma2"), primary_key=("123-x", )) + github_data.load_issues.apply_hints(merge_key=("mA1", "Ma2"), primary_key=("123-x",)) # skip first 45 rows github_data.load_issues.add_filter(skip_first(45)) info = p.run(github_data) assert_load_info(info) github_1_counts = load_table_counts(p, *[t["name"] for t in p.default_schema.data_tables()]) assert github_1_counts["issues"] == 100 - 45 - assert p.default_schema.tables["issues"]["columns"]["m_a1"].items() > {"merge_key": True, "nullable": False}.items() + assert ( + p.default_schema.tables["issues"]["columns"]["m_a1"].items() + > {"merge_key": True, "nullable": False}.items() + ) # for non merge destinations we just check that the run passes if not destination_config.supports_merge: @@ -207,7 +248,7 @@ def test_merge_keys_non_existing_columns(destination_config: DestinationTestConf # all the keys are invalid so the merge falls back to replace github_data = github() - github_data.load_issues.apply_hints(merge_key=("mA1", "Ma2"), primary_key=("123-x", )) + github_data.load_issues.apply_hints(merge_key=("mA1", "Ma2"), primary_key=("123-x",)) github_data.load_issues.add_filter(take_first(1)) info = p.run(github_data) assert_load_info(info) @@ -219,7 +260,11 @@ def test_merge_keys_non_existing_columns(destination_config: DestinationTestConf assert "m_a1" not in table_schema # unbound columns were not created -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["duckdb", "snowflake", "bigquery"]), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["duckdb", "snowflake", "bigquery"]), + ids=lambda x: x.name, +) def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("github_3", full_refresh=True) github_data = github() @@ -227,7 +272,9 @@ def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) github_data.max_table_nesting = 2 github_data_copy = github() github_data_copy.max_table_nesting = 2 - info = p.run([github_data, github_data_copy], loader_file_format="parquet", write_disposition="merge") + info = p.run( + [github_data, github_data_copy], loader_file_format="parquet", write_disposition="merge" + ) assert_load_info(info) # make sure it was parquet or sql transforms files = p.get_load_package_info(p.list_completed_load_packages()[0]).jobs["completed_jobs"] @@ -250,22 +297,34 @@ def test_pipeline_load_parquet(destination_config: DestinationTestConfiguration) assert github_1_counts["issues"] == 100 - -@dlt.transformer(name="github_repo_events", primary_key="id", write_disposition="merge", table_name=lambda i: i['type']) -def github_repo_events(page: List[StrAny], last_created_at = dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z")): - """A transformer taking a stream of github events and dispatching them to tables named by event type. Deduplicates be 'id'. Loads incrementally by 'created_at' """ +@dlt.transformer( + name="github_repo_events", + primary_key="id", + write_disposition="merge", + table_name=lambda i: i["type"], +) +def github_repo_events( + page: List[StrAny], + last_created_at=dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z"), +): + """A transformer taking a stream of github events and dispatching them to tables named by event type. Deduplicates be 'id'. 
Loads incrementally by 'created_at'""" yield page @dlt.transformer(name="github_repo_events", primary_key="id", write_disposition="merge") -def github_repo_events_table_meta(page: List[StrAny], last_created_at = dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z")): - """A transformer taking a stream of github events and dispatching them to tables using table meta. Deduplicates be 'id'. Loads incrementally by 'created_at' """ - yield from [dlt.mark.with_table_name(p, p['type']) for p in page] +def github_repo_events_table_meta( + page: List[StrAny], + last_created_at=dlt.sources.incremental("created_at", "1970-01-01T00:00:00Z"), +): + """A transformer taking a stream of github events and dispatching them to tables using table meta. Deduplicates be 'id'. Loads incrementally by 'created_at'""" + yield from [dlt.mark.with_table_name(p, p["type"]) for p in page] @dlt.resource def _get_shuffled_events(shuffle: bool = dlt.secrets.value): - with open("tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8") as f: + with open( + "tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8" + ) as f: issues = json.load(f) # random order if shuffle: @@ -273,17 +332,22 @@ def _get_shuffled_events(shuffle: bool = dlt.secrets.value): yield issues - -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -@pytest.mark.parametrize("github_resource",[github_repo_events, github_repo_events_table_meta]) -def test_merge_with_dispatch_and_incremental(destination_config: DestinationTestConfiguration, github_resource: DltResource) -> None: - newest_issues = list(sorted(_get_shuffled_events(True), key = lambda x: x["created_at"], reverse=True)) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +@pytest.mark.parametrize("github_resource", [github_repo_events, github_repo_events_table_meta]) +def test_merge_with_dispatch_and_incremental( + destination_config: DestinationTestConfiguration, github_resource: DltResource +) -> None: + newest_issues = list( + sorted(_get_shuffled_events(True), key=lambda x: x["created_at"], reverse=True) + ) newest_issue = newest_issues[0] @dlt.resource def _new_event(node_id): new_i = copy(newest_issue) - new_i["id"] = str(random.randint(0, 2^32)) + new_i["id"] = str(random.randint(0, 2 ^ 32)) new_i["created_at"] = pendulum.now().isoformat() new_i["node_id"] = node_id # yield pages @@ -301,21 +365,33 @@ def _updated_event(node_id): with Container().injectable_context(StateInjectableContext(state={})): assert len(list(_get_shuffled_events(True) | github_resource)) == 100 incremental_state = github_resource.state - assert incremental_state["incremental"]["created_at"]["last_value"] == newest_issue["created_at"] - assert incremental_state["incremental"]["created_at"]["unique_hashes"] == [digest128(f'"{newest_issue["id"]}"')] + assert ( + incremental_state["incremental"]["created_at"]["last_value"] + == newest_issue["created_at"] + ) + assert incremental_state["incremental"]["created_at"]["unique_hashes"] == [ + digest128(f'"{newest_issue["id"]}"') + ] # subsequent load will skip all elements assert len(list(_get_shuffled_events(True) | github_resource)) == 0 # add one more issue assert len(list(_new_event("new_node") | github_resource)) == 1 - assert incremental_state["incremental"]["created_at"]["last_value"] > newest_issue["created_at"] - assert 
incremental_state["incremental"]["created_at"]["unique_hashes"] != [digest128(str(newest_issue["id"]))] + assert ( + incremental_state["incremental"]["created_at"]["last_value"] + > newest_issue["created_at"] + ) + assert incremental_state["incremental"]["created_at"]["unique_hashes"] != [ + digest128(str(newest_issue["id"])) + ] # load to destination p = destination_config.setup_pipeline("github_3", full_refresh=True) info = p.run(_get_shuffled_events(True) | github_resource) assert_load_info(info) # get top tables - counts = load_table_counts(p, *[t["name"] for t in p.default_schema.data_tables() if t.get("parent") is None]) + counts = load_table_counts( + p, *[t["name"] for t in p.default_schema.data_tables() if t.get("parent") is None] + ) # total number of events in all top tables == 100 assert sum(counts.values()) == 100 # this should skip all events due to incremental load @@ -326,10 +402,12 @@ def _updated_event(node_id): # load one more event with a new id info = p.run(_new_event("new_node") | github_resource) assert_load_info(info) - counts = load_table_counts(p, *[t["name"] for t in p.default_schema.data_tables() if t.get("parent") is None]) + counts = load_table_counts( + p, *[t["name"] for t in p.default_schema.data_tables() if t.get("parent") is None] + ) assert sum(counts.values()) == 101 # all the columns have primary keys and merge disposition derived from resource - for table in p.default_schema.data_tables(): + for table in p.default_schema.data_tables(): if table.get("parent") is None: assert table["write_disposition"] == "merge" assert table["columns"]["id"]["primary_key"] is True @@ -338,7 +416,9 @@ def _updated_event(node_id): info = p.run(_updated_event("new_node_X") | github_resource) assert_load_info(info) # still 101 - counts = load_table_counts(p, *[t["name"] for t in p.default_schema.data_tables() if t.get("parent") is None]) + counts = load_table_counts( + p, *[t["name"] for t in p.default_schema.data_tables() if t.get("parent") is None] + ) assert sum(counts.values()) == 101 if destination_config.supports_merge else 102 # for non merge destinations we just check that the run passes if not destination_config.supports_merge: @@ -350,13 +430,18 @@ def _updated_event(node_id): assert len(list(q.fetchall())) == 1 -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_deduplicate_single_load(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("abstract", full_refresh=True) @dlt.resource(write_disposition="merge", primary_key="id") def duplicates(): - yield [{"id": 1, "name": "row1", "child": [1, 2, 3]}, {"id": 1, "name": "row2", "child": [4, 5, 6]}] + yield [ + {"id": 1, "name": "row1", "child": [1, 2, 3]}, + {"id": 1, "name": "row2", "child": [4, 5, 6]}, + ] info = p.run(duplicates()) assert_load_info(info) @@ -366,7 +451,6 @@ def duplicates(): qual_name = p.sql_client().make_qualified_table_name("duplicates") select_data(p, f"SELECT * FROM {qual_name}")[0] - @dlt.resource(write_disposition="merge", primary_key=("id", "subkey")) def duplicates_no_child(): yield [{"id": 1, "subkey": "AX", "name": "row1"}, {"id": 1, "subkey": "AX", "name": "row2"}] @@ -377,13 +461,18 @@ def duplicates_no_child(): assert counts["duplicates_no_child"] == 1 if destination_config.supports_merge else 2 -@pytest.mark.parametrize("destination_config", 
destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_no_deduplicate_only_merge_key(destination_config: DestinationTestConfiguration) -> None: p = destination_config.setup_pipeline("abstract", full_refresh=True) @dlt.resource(write_disposition="merge", merge_key="id") def duplicates(): - yield [{"id": 1, "name": "row1", "child": [1, 2, 3]}, {"id": 1, "name": "row2", "child": [4, 5, 6]}] + yield [ + {"id": 1, "name": "row1", "child": [1, 2, 3]}, + {"id": 1, "name": "row2", "child": [4, 5, 6]}, + ] info = p.run(duplicates()) assert_load_info(info) @@ -391,7 +480,6 @@ def duplicates(): assert counts["duplicates"] == 2 assert counts["duplicates__child"] == 6 - @dlt.resource(write_disposition="merge", merge_key=("id", "subkey")) def duplicates_no_child(): yield [{"id": 1, "subkey": "AX", "name": "row1"}, {"id": 1, "subkey": "AX", "name": "row2"}] diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 004aac0285..9eb570ee6a 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -15,20 +15,41 @@ from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceNameMissing from dlt.extract import DltSource -from dlt.pipeline.exceptions import CannotRestorePipelineException, PipelineConfigMissing, PipelineStepFailed +from dlt.pipeline.exceptions import ( + CannotRestorePipelineException, + PipelineConfigMissing, + PipelineStepFailed, +) from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.exceptions import DestinationHasFailedJobs from tests.utils import TEST_STORAGE_ROOT, preserve_environ from tests.pipeline.utils import assert_load_info -from tests.load.utils import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA, assert_all_data_types_row, delete_dataset -from tests.load.pipeline.utils import drop_active_pipeline_data, assert_query_data, assert_table, load_table_counts, select_data +from tests.load.utils import ( + TABLE_ROW_ALL_DATA_TYPES, + TABLE_UPDATE_COLUMNS_SCHEMA, + assert_all_data_types_row, + delete_dataset, +) +from tests.load.pipeline.utils import ( + drop_active_pipeline_data, + assert_query_data, + assert_table, + load_table_counts, + select_data, +) from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), ids=lambda x: x.name) -@pytest.mark.parametrize('use_single_dataset', [True, False]) -def test_default_pipeline_names(use_single_dataset: bool, destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("use_single_dataset", [True, False]) +def test_default_pipeline_names( + use_single_dataset: bool, destination_config: DestinationTestConfiguration +) -> None: destination_config.setup() p = dlt.pipeline() p.config.use_single_dataset = use_single_dataset @@ -67,8 +88,12 @@ def data_fun() -> Iterator[Any]: with p.managed_state(): p._set_destinations( Destination.from_reference(destination_config.destination), - Destination.from_reference(destination_config.staging) if destination_config.staging else None - ) + ( + 
Destination.from_reference(destination_config.staging) + if destination_config.staging + else None + ), + ) # does not reset the dataset name assert p.dataset_name in possible_dataset_names # never do that in production code @@ -92,13 +117,23 @@ def data_fun() -> Iterator[Any]: assert_table(p, "data_fun", data, schema_name="names", info=info) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) def test_default_schema_name(destination_config: DestinationTestConfiguration) -> None: destination_config.setup() dataset_name = "dataset_" + uniq_id() data = ["a", "b", "c"] - p = dlt.pipeline("test_default_schema_name", TEST_STORAGE_ROOT, destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name) + p = dlt.pipeline( + "test_default_schema_name", + TEST_STORAGE_ROOT, + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name=dataset_name, + ) p.extract(data, table_name="test", schema=Schema("default")) p.normalize() info = p.load() @@ -111,9 +146,12 @@ def test_default_schema_name(destination_config: DestinationTestConfiguration) - assert_table(p, "test", data, info=info) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) def test_attach_pipeline(destination_config: DestinationTestConfiguration) -> None: - # load data and then restore the pipeline and see if data is still there data = ["a", "b", "c"] @@ -123,7 +161,12 @@ def _data(): yield d destination_config.setup() - info = dlt.run(_data(), destination=destination_config.destination, staging=destination_config.staging, dataset_name="specific" + uniq_id()) + info = dlt.run( + _data(), + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name="specific" + uniq_id(), + ) with pytest.raises(CannotRestorePipelineException): dlt.attach("unknown") @@ -144,9 +187,12 @@ def _data(): assert_table(p, "data_table", data, info=info) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_skip_sync_schema_for_tables_without_columns(destination_config: DestinationTestConfiguration) -> None: - +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def test_skip_sync_schema_for_tables_without_columns( + destination_config: DestinationTestConfiguration, +) -> None: # load data and then restore the pipeline and see if data is still there data = ["a", "b", "c"] @@ -173,7 +219,11 @@ def _data(): assert not exists -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) def test_run_full_refresh(destination_config: DestinationTestConfiguration) -> None: data = ["a", ["a", "b", "c"], ["a", "b", "c"]] 
destination_config.setup() @@ -186,7 +236,12 @@ def _data(): return dlt.resource(d(), name="lists", write_disposition="replace") p = dlt.pipeline(full_refresh=True) - info = p.run(_data(), destination=destination_config.destination, staging=destination_config.staging, dataset_name="iteration" + uniq_id()) + info = p.run( + _data(), + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name="iteration" + uniq_id(), + ) assert info.dataset_name == p.dataset_name assert info.dataset_name.endswith(p._pipeline_instance_id) # print(p.default_schema.to_pretty_yaml()) @@ -203,23 +258,18 @@ def _data(): assert_table(p, "lists__value", sorted(data_list)) - -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_evolve_schema(destination_config: DestinationTestConfiguration) -> None: dataset_name = "d" + uniq_id() row = { "id": "level0", - "f": [{ - "id": "level1", - "l": ["a", "b", "c"], - "v": 120, - "o": [{"a": 1}, {"a": 2}] - }] + "f": [{"id": "level1", "l": ["a", "b", "c"], "v": 120, "o": [{"a": 1}, {"a": 2}]}], } @dlt.source(name="parallel") def source(top_elements: int): - @dlt.defer def get_item(no: int) -> TDataItem: # the test will not last 10 seconds but 2 (there are 5 working threads by default) @@ -228,23 +278,38 @@ def get_item(no: int) -> TDataItem: data["id"] = "level" + str(no) return data - @dlt.resource(columns={"id": {"name": "id", "nullable": False, "data_type": "text", "unique": True, "sort": True}}) + @dlt.resource( + columns={ + "id": { + "name": "id", + "nullable": False, + "data_type": "text", + "unique": True, + "sort": True, + } + } + ) def simple_rows(): for no in range(top_elements): # yield deferred items resolved in threads yield get_item(no) - @dlt.resource(table_name="simple_rows", columns={"new_column": {"nullable": True, "data_type": "decimal"}}) + @dlt.resource( + table_name="simple_rows", + columns={"new_column": {"nullable": True, "data_type": "decimal"}}, + ) def extended_rows(): for no in range(top_elements): # yield deferred items resolved in threads - yield get_item(no+100) + yield get_item(no + 100) return simple_rows(), extended_rows(), dlt.resource(["a", "b", "c"], name="simple") import_schema_path = os.path.join(TEST_STORAGE_ROOT, "schemas", "import") export_schema_path = os.path.join(TEST_STORAGE_ROOT, "schemas", "export") - p = destination_config.setup_pipeline("my_pipeline", import_schema_path=import_schema_path, export_schema_path=export_schema_path) + p = destination_config.setup_pipeline( + "my_pipeline", import_schema_path=import_schema_path, export_schema_path=export_schema_path + ) p.extract(source(10).with_resources("simple_rows")) # print(p.default_schema.to_pretty_yaml()) @@ -285,21 +350,35 @@ def extended_rows(): # TODO: test export and import schema # test data - id_data = sorted(["level" + str(n) for n in range(10)] + ["level" + str(n) for n in range(100, 110)]) + id_data = sorted( + ["level" + str(n) for n in range(10)] + ["level" + str(n) for n in range(100, 110)] + ) with p.sql_client() as client: simple_rows_table = client.make_qualified_table_name("simple_rows") dlt_loads_table = client.make_qualified_table_name("_dlt_loads") assert_query_data(p, f"SELECT * FROM {simple_rows_table} ORDER BY id", id_data) - assert_query_data(p, f"SELECT schema_version_hash FROM {dlt_loads_table} ORDER BY inserted_at", 
version_history) - - -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), ids=lambda x: x.name) -@pytest.mark.parametrize('disable_compression', [True, False]) -def test_pipeline_data_writer_compression(disable_compression: bool, destination_config: DestinationTestConfiguration) -> None: + assert_query_data( + p, + f"SELECT schema_version_hash FROM {dlt_loads_table} ORDER BY inserted_at", + version_history, + ) + + +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, all_buckets_filesystem_configs=True), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("disable_compression", [True, False]) +def test_pipeline_data_writer_compression( + disable_compression: bool, destination_config: DestinationTestConfiguration +) -> None: # Ensure pipeline works without compression data = ["a", "b", "c"] - dataset_name = "compression_data_"+ uniq_id() - dlt.config["data_writer"] = {"disable_compression": disable_compression} # not sure how else to set this + dataset_name = "compression_data_" + uniq_id() + dlt.config["data_writer"] = { + "disable_compression": disable_compression + } # not sure how else to set this p = destination_config.setup_pipeline("compression_test", dataset_name=dataset_name) p.extract(dlt.resource(data, name="data")) s = p._get_normalize_storage() @@ -313,27 +392,24 @@ def test_pipeline_data_writer_compression(disable_compression: bool, destination assert_table(p, "data", data, info=info) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_source_max_nesting(destination_config: DestinationTestConfiguration) -> None: destination_config.setup() - complex_part = { - "l": [1, 2, 3], - "c": { - "a": 1, - "b": 12.3 - } - } + complex_part = {"l": [1, 2, 3], "c": {"a": 1, "b": 12.3}} @dlt.source(name="complex", max_table_nesting=0) def complex_data(): - return dlt.resource([ - { - "idx": 1, - "cn": complex_part - } - ], name="complex_cn") - info = dlt.run(complex_data(), destination=destination_config.destination, staging=destination_config.staging, dataset_name="ds_" + uniq_id()) + return dlt.resource([{"idx": 1, "cn": complex_part}], name="complex_cn") + + info = dlt.run( + complex_data(), + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name="ds_" + uniq_id(), + ) print(info) with dlt.pipeline().sql_client() as client: complex_cn_table = client.make_qualified_table_name("complex_cn") @@ -345,7 +421,9 @@ def complex_data(): assert cn_val == complex_part -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_dataset_name_change(destination_config: DestinationTestConfiguration) -> None: destination_config.setup() # standard name @@ -385,11 +463,18 @@ def test_dataset_name_change(destination_config: DestinationTestConfiguration) - # do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["postgres"]), ids=lambda x: x.name) -def test_pipeline_explicit_destination_credentials(destination_config: DestinationTestConfiguration) -> 
None: - +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres"]), + ids=lambda x: x.name, +) +def test_pipeline_explicit_destination_credentials( + destination_config: DestinationTestConfiguration, +) -> None: # explicit credentials resolved - p = dlt.pipeline(destination="postgres", credentials="postgresql://loader:loader@localhost:5432/dlt_data") + p = dlt.pipeline( + destination="postgres", credentials="postgresql://loader:loader@localhost:5432/dlt_data" + ) c = p._get_destination_clients(Schema("s"), p._get_destination_client_initial_config())[0] assert c.config.credentials.host == "localhost" # type: ignore[attr-defined] @@ -398,7 +483,9 @@ def test_pipeline_explicit_destination_credentials(destination_config: Destinati os.environ.pop("DESTINATION__POSTGRES__CREDENTIALS", None) # explicit credentials resolved ignoring the config providers os.environ["DESTINATION__POSTGRES__CREDENTIALS__HOST"] = "HOST" - p = dlt.pipeline(destination="postgres", credentials="postgresql://loader:loader@localhost:5432/dlt_data") + p = dlt.pipeline( + destination="postgres", credentials="postgresql://loader:loader@localhost:5432/dlt_data" + ) c = p._get_destination_clients(Schema("s"), p._get_destination_client_initial_config())[0] assert c.config.credentials.host == "localhost" # type: ignore[attr-defined] @@ -420,14 +507,18 @@ def test_pipeline_explicit_destination_credentials(destination_config: Destinati # do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["postgres"]), ids=lambda x: x.name) -def test_pipeline_with_sources_sharing_schema(destination_config: DestinationTestConfiguration) -> None: - +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres"]), + ids=lambda x: x.name, +) +def test_pipeline_with_sources_sharing_schema( + destination_config: DestinationTestConfiguration, +) -> None: schema = Schema("shared") @dlt.source(schema=schema, max_table_nesting=1) def source_1(): - @dlt.resource(primary_key="user_id") def gen1(): dlt.current.source_state()["source_1"] = True @@ -442,7 +533,6 @@ def conflict(): @dlt.source(schema=schema, max_table_nesting=2) def source_2(): - @dlt.resource(primary_key="id") def gen1(): dlt.current.source_state()["source_2"] = True @@ -485,9 +575,15 @@ def conflict(): p.load() table_names = [t["name"] for t in default_schema.data_tables()] counts = load_table_counts(p, *table_names) - assert counts == {'gen1': 2, 'gen2': 3, 'conflict': 1} + assert counts == {"gen1": 2, "gen2": 3, "conflict": 1} # both sources share the same state - assert p.state["sources"] == {'shared': {'source_1': True, 'resources': {'gen1': {'source_1': True, 'source_2': True}}, 'source_2': True}} + assert p.state["sources"] == { + "shared": { + "source_1": True, + "resources": {"gen1": {"source_1": True, "source_2": True}}, + "source_2": True, + } + } drop_active_pipeline_data() # same pipeline but enable conflict @@ -498,13 +594,16 @@ def conflict(): # do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["postgres"]), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["postgres"]), + ids=lambda x: x.name, +) def test_many_pipelines_single_dataset(destination_config: 
DestinationTestConfiguration) -> None: schema = Schema("shared") @dlt.source(schema=schema, max_table_nesting=1) def source_1(): - @dlt.resource(primary_key="user_id") def gen1(): dlt.current.source_state()["source_1"] = True @@ -515,7 +614,6 @@ def gen1(): @dlt.source(schema=schema, max_table_nesting=2) def source_2(): - @dlt.resource(primary_key="id") def gen1(): dlt.current.source_state()["source_2"] = True @@ -528,44 +626,68 @@ def gen2(): return gen2, gen1 # load source_1 to common dataset - p = dlt.pipeline(pipeline_name="source_1_pipeline", destination="duckdb", dataset_name="shared_dataset") + p = dlt.pipeline( + pipeline_name="source_1_pipeline", destination="duckdb", dataset_name="shared_dataset" + ) p.run(source_1(), credentials="duckdb:///_storage/test_quack.duckdb") counts = load_table_counts(p, *p.default_schema.tables.keys()) - assert counts.items() >= {'gen1': 1, '_dlt_pipeline_state': 1, "_dlt_loads": 1}.items() + assert counts.items() >= {"gen1": 1, "_dlt_pipeline_state": 1, "_dlt_loads": 1}.items() p._wipe_working_folder() p.deactivate() - p = dlt.pipeline(pipeline_name="source_2_pipeline", destination="duckdb", dataset_name="shared_dataset") + p = dlt.pipeline( + pipeline_name="source_2_pipeline", destination="duckdb", dataset_name="shared_dataset" + ) p.run(source_2(), credentials="duckdb:///_storage/test_quack.duckdb") # table_names = [t["name"] for t in p.default_schema.data_tables()] counts = load_table_counts(p, *p.default_schema.tables.keys()) # gen1: one record comes from source_1, 1 record from source_2 - assert counts.items() >= {'gen1': 2, '_dlt_pipeline_state': 2, "_dlt_loads": 2}.items() + assert counts.items() >= {"gen1": 2, "_dlt_pipeline_state": 2, "_dlt_loads": 2}.items() # assert counts == {'gen1': 2, 'gen2': 3} p._wipe_working_folder() p.deactivate() # restore from destination, check state - p = dlt.pipeline(pipeline_name="source_1_pipeline", destination="duckdb", dataset_name="shared_dataset", credentials="duckdb:///_storage/test_quack.duckdb") + p = dlt.pipeline( + pipeline_name="source_1_pipeline", + destination="duckdb", + dataset_name="shared_dataset", + credentials="duckdb:///_storage/test_quack.duckdb", + ) p.sync_destination() # we have our separate state - assert p.state["sources"]["shared"] == {'source_1': True, 'resources': {'gen1': {'source_1': True}}} + assert p.state["sources"]["shared"] == { + "source_1": True, + "resources": {"gen1": {"source_1": True}}, + } # but the schema was common so we have the earliest one assert "gen2" in p.default_schema.tables p._wipe_working_folder() p.deactivate() - p = dlt.pipeline(pipeline_name="source_2_pipeline", destination="duckdb", dataset_name="shared_dataset", credentials="duckdb:///_storage/test_quack.duckdb") + p = dlt.pipeline( + pipeline_name="source_2_pipeline", + destination="duckdb", + dataset_name="shared_dataset", + credentials="duckdb:///_storage/test_quack.duckdb", + ) p.sync_destination() # we have our separate state - assert p.state["sources"]["shared"] == {'source_2': True, 'resources': {'gen1': {'source_2': True}}} + assert p.state["sources"]["shared"] == { + "source_2": True, + "resources": {"gen1": {"source_2": True}}, + } # do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["snowflake"]), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["snowflake"]), + ids=lambda x: x.name, +) def 
test_snowflake_custom_stage(destination_config: DestinationTestConfiguration) -> None: """Using custom stage name instead of the table stage""" - os.environ['DESTINATION__SNOWFLAKE__STAGE_NAME'] = 'my_non_existing_stage' + os.environ["DESTINATION__SNOWFLAKE__STAGE_NAME"] = "my_non_existing_stage" pipeline, data = simple_nested_pipeline(destination_config, f"custom_stage_{uniq_id()}", False) info = pipeline.run(data()) with pytest.raises(DestinationHasFailedJobs) as f_jobs: @@ -577,8 +699,8 @@ def test_snowflake_custom_stage(destination_config: DestinationTestConfiguration # NOTE: this stage must be created in DLT_DATA database for this test to pass! # CREATE STAGE MY_CUSTOM_LOCAL_STAGE; # GRANT READ, WRITE ON STAGE DLT_DATA.PUBLIC.MY_CUSTOM_LOCAL_STAGE TO ROLE DLT_LOADER_ROLE; - stage_name = 'PUBLIC.MY_CUSTOM_LOCAL_STAGE' - os.environ['DESTINATION__SNOWFLAKE__STAGE_NAME'] = stage_name + stage_name = "PUBLIC.MY_CUSTOM_LOCAL_STAGE" + os.environ["DESTINATION__SNOWFLAKE__STAGE_NAME"] = stage_name pipeline, data = simple_nested_pipeline(destination_config, f"custom_stage_{uniq_id()}", False) info = pipeline.run(data()) assert_load_info(info) @@ -591,16 +713,22 @@ def test_snowflake_custom_stage(destination_config: DestinationTestConfiguration assert len(staged_files) == 3 # check data of one table to ensure copy was done successfully tbl_name = client.make_qualified_table_name("lists") - assert_query_data(pipeline, f"SELECT value FROM {tbl_name}", ['a', None, None]) + assert_query_data(pipeline, f"SELECT value FROM {tbl_name}", ["a", None, None]) # do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, subset=["snowflake"]), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, subset=["snowflake"]), + ids=lambda x: x.name, +) def test_snowflake_delete_file_after_copy(destination_config: DestinationTestConfiguration) -> None: """Using keep_staged_files = false option to remove staged files after copy""" - os.environ['DESTINATION__SNOWFLAKE__KEEP_STAGED_FILES'] = 'FALSE' + os.environ["DESTINATION__SNOWFLAKE__KEEP_STAGED_FILES"] = "FALSE" - pipeline, data = simple_nested_pipeline(destination_config, f"delete_staged_files_{uniq_id()}", False) + pipeline, data = simple_nested_pipeline( + destination_config, f"delete_staged_files_{uniq_id()}", False + ) info = pipeline.run(data()) assert_load_info(info) @@ -609,26 +737,32 @@ def test_snowflake_delete_file_after_copy(destination_config: DestinationTestCon with pipeline.sql_client() as client: # no files are left in table stage - stage_name = client.make_qualified_table_name('%lists') + stage_name = client.make_qualified_table_name("%lists") staged_files = client.execute_sql(f'LIST @{stage_name}/"{load_id}"') assert len(staged_files) == 0 # ensure copy was done tbl_name = client.make_qualified_table_name("lists") - assert_query_data(pipeline, f"SELECT value FROM {tbl_name}", ['a', None, None]) + assert_query_data(pipeline, f"SELECT value FROM {tbl_name}", ["a", None, None]) # do not remove - it allows us to filter tests by destination -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, all_staging_configs=True, file_format="parquet"), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, all_staging_configs=True, file_format="parquet"), + ids=lambda x: x.name, +) def 
test_parquet_loading(destination_config: DestinationTestConfiguration) -> None: """Run pipeline twice with merge write disposition Resource with primary key falls back to append. Resource without keys falls back to replace. """ - pipeline = destination_config.setup_pipeline('parquet_test_' + uniq_id(), dataset_name='parquet_test_' + uniq_id()) + pipeline = destination_config.setup_pipeline( + "parquet_test_" + uniq_id(), dataset_name="parquet_test_" + uniq_id() + ) - @dlt.resource(primary_key='id') + @dlt.resource(primary_key="id") def some_data(): - yield [{'id': 1}, {'id': 2}, {'id': 3}] + yield [{"id": 1}, {"id": 2}, {"id": 3}] @dlt.resource(write_disposition="replace") def other_data(): @@ -662,7 +796,7 @@ def other_data(): @dlt.resource(table_name="data_types", write_disposition="merge", columns=column_schemas) def my_resource(): nonlocal data_types - yield [data_types]*10 + yield [data_types] * 10 @dlt.source(max_table_nesting=0) def some_source(): @@ -685,8 +819,14 @@ def some_source(): assert len(package_info.jobs["completed_jobs"]) == expected_completed_jobs with pipeline.sql_client() as sql_client: - assert [row[0] for row in sql_client.execute_sql("SELECT * FROM other_data ORDER BY 1")] == [1, 2, 3, 4, 5] - assert [row[0] for row in sql_client.execute_sql("SELECT * FROM some_data ORDER BY 1")] == [1, 2, 3] + assert [ + row[0] for row in sql_client.execute_sql("SELECT * FROM other_data ORDER BY 1") + ] == [1, 2, 3, 4, 5] + assert [row[0] for row in sql_client.execute_sql("SELECT * FROM some_data ORDER BY 1")] == [ + 1, + 2, + 3, + ] db_rows = sql_client.execute_sql("SELECT * FROM data_types") assert len(db_rows) == 10 db_row = list(db_rows[0]) @@ -694,12 +834,15 @@ def some_source(): assert_all_data_types_row( db_row, schema=column_schemas, - parse_complex_strings=destination_config.destination in ["snowflake", "bigquery", "redshift"], - timestamp_precision= 3 if destination_config.destination == "athena" else 6 + parse_complex_strings=destination_config.destination + in ["snowflake", "bigquery", "redshift"], + timestamp_precision=3 if destination_config.destination == "athena" else 6, ) -def simple_nested_pipeline(destination_config: DestinationTestConfiguration, dataset_name: str, full_refresh: bool) -> Tuple[dlt.Pipeline, Callable[[], DltSource]]: +def simple_nested_pipeline( + destination_config: DestinationTestConfiguration, dataset_name: str, full_refresh: bool +) -> Tuple[dlt.Pipeline, Callable[[], DltSource]]: data = ["a", ["a", "b", "c"], ["a", "b", "c"]] def d(): @@ -709,6 +852,11 @@ def d(): def _data(): return dlt.resource(d(), name="lists", write_disposition="append") - p = dlt.pipeline(pipeline_name=f"pipeline_{dataset_name}", full_refresh=full_refresh, destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name) + p = dlt.pipeline( + pipeline_name=f"pipeline_{dataset_name}", + full_refresh=full_refresh, + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name=dataset_name, + ) return p, _data - diff --git a/tests/load/pipeline/test_redshift.py b/tests/load/pipeline/test_redshift.py index 709e924bc9..a5d0cd178f 100644 --- a/tests/load/pipeline/test_redshift.py +++ b/tests/load/pipeline/test_redshift.py @@ -9,7 +9,11 @@ from tests.pipeline.utils import assert_load_info -@pytest.mark.parametrize("destination_config", destinations_configs(all_staging_configs=True, subset=["redshift"]), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + 
destinations_configs(all_staging_configs=True, subset=["redshift"]), + ids=lambda x: x.name, +) def test_redshift_blocks_time_column(destination_config: DestinationTestConfiguration) -> None: pipeline = destination_config.setup_pipeline("athena_" + uniq_id(), full_refresh=True) @@ -19,7 +23,7 @@ def test_redshift_blocks_time_column(destination_config: DestinationTestConfigur @dlt.resource(table_name="data_types", write_disposition="append", columns=column_schemas) def my_resource() -> Iterator[Any]: nonlocal data_types - yield [data_types]*10 + yield [data_types] * 10 @dlt.source(max_table_nesting=0) def my_source() -> Any: @@ -29,4 +33,7 @@ def my_source() -> Any: assert info.has_failed_jobs - assert "Redshift cannot load TIME columns from" in info.load_packages[0].jobs['failed_jobs'][0].failed_message + assert ( + "Redshift cannot load TIME columns from" + in info.load_packages[0].jobs["failed_jobs"][0].failed_message + ) diff --git a/tests/load/pipeline/test_replace_disposition.py b/tests/load/pipeline/test_replace_disposition.py index d39556ab2f..095cee7154 100644 --- a/tests/load/pipeline/test_replace_disposition.py +++ b/tests/load/pipeline/test_replace_disposition.py @@ -3,38 +3,62 @@ import dlt, os, pytest from dlt.common.utils import uniq_id -from tests.pipeline.utils import assert_load_info -from tests.load.pipeline.utils import drop_active_pipeline_data, load_table_counts, load_tables_to_dicts +from tests.pipeline.utils import assert_load_info +from tests.load.pipeline.utils import ( + drop_active_pipeline_data, + load_table_counts, + load_tables_to_dicts, +) from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration REPLACE_STRATEGIES = ["truncate-and-insert", "insert-from-staging", "staging-optimized"] -@pytest.mark.parametrize("destination_config", destinations_configs(local_filesystem_configs=True, default_staging_configs=True, default_sql_configs=True), ids=lambda x: x.name) -@pytest.mark.parametrize("replace_strategy", REPLACE_STRATEGIES) -def test_replace_disposition(destination_config: DestinationTestConfiguration, replace_strategy: str) -> None: +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + local_filesystem_configs=True, default_staging_configs=True, default_sql_configs=True + ), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("replace_strategy", REPLACE_STRATEGIES) +def test_replace_disposition( + destination_config: DestinationTestConfiguration, replace_strategy: str +) -> None: if not destination_config.supports_merge and replace_strategy != "truncate-and-insert": - pytest.skip(f"Destination {destination_config.name} does not support merge and thus {replace_strategy}") + pytest.skip( + f"Destination {destination_config.name} does not support merge and thus" + f" {replace_strategy}" + ) # only allow 40 items per file - os.environ['DATA_WRITER__FILE_MAX_ITEMS'] = "40" + os.environ["DATA_WRITER__FILE_MAX_ITEMS"] = "40" # use staging tables for replace - os.environ['DESTINATION__REPLACE_STRATEGY'] = replace_strategy + os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy # make duckdb to reuse database in working folder os.environ["DESTINATION__DUCKDB__CREDENTIALS"] = "duckdb:///test_replace_disposition.duckdb" # TODO: start storing _dlt_loads with right json content increase_loads = lambda x: x if destination_config.destination == "filesystem" else x + 1 - increase_state_loads = lambda info: len([job for job in info.load_packages[0].jobs["completed_jobs"] if job.job_file_info.table_name == 
"_dlt_pipeline_state" and job.job_file_info.file_format not in ["sql", "reference"]]) + increase_state_loads = lambda info: len( + [ + job + for job in info.load_packages[0].jobs["completed_jobs"] + if job.job_file_info.table_name == "_dlt_pipeline_state" + and job.job_file_info.file_format not in ["sql", "reference"] + ] + ) # filesystem does not have versions and child tables def norm_table_counts(counts: Dict[str, int], *child_tables: str) -> Dict[str, int]: if destination_config.destination != "filesystem": return counts - return {**{"_dlt_version": 0}, **{t:0 for t in child_tables}, **counts} + return {**{"_dlt_version": 0}, **{t: 0 for t in child_tables}, **counts} dataset_name = "test_replace_strategies_ds" + uniq_id() - pipeline = destination_config.setup_pipeline("test_replace_strategies", dataset_name=dataset_name) + pipeline = destination_config.setup_pipeline( + "test_replace_strategies", dataset_name=dataset_name + ) offset = 1000 @@ -45,36 +69,39 @@ def load_items(): # 6 jobs for the sub_items # 3 jobs for the sub_sub_items nonlocal offset - for _, index in enumerate(range(offset, offset+120), 1): + for _, index in enumerate(range(offset, offset + 120), 1): yield { "id": index, "name": f"item {index}", - "sub_items": [{ - "id": index + 1000, - "name": f"sub item {index + 1000}" - },{ - "id": index + 2000, - "name": f"sub item {index + 2000}", - "sub_sub_items": [{ - "id": index + 3000, - "name": f"sub item {index + 3000}", - }] - }] - } + "sub_items": [ + {"id": index + 1000, "name": f"sub item {index + 1000}"}, + { + "id": index + 2000, + "name": f"sub item {index + 2000}", + "sub_sub_items": [ + { + "id": index + 3000, + "name": f"sub item {index + 3000}", + } + ], + }, + ], + } # append resource to see if we do not drop any tables @dlt.resource(write_disposition="append") def append_items(): nonlocal offset - for _, index in enumerate(range(offset, offset+12), 1): + for _, index in enumerate(range(offset, offset + 12), 1): yield { "id": index, "name": f"item {index}", } - # first run with offset 0 - info = pipeline.run([load_items, append_items], loader_file_format=destination_config.file_format) + info = pipeline.run( + [load_items, append_items], loader_file_format=destination_config.file_format + ) assert_load_info(info) # count state records that got extracted state_records = increase_state_loads(info) @@ -83,7 +110,9 @@ def append_items(): # second run with higher offset so we can check the results offset = 1000 - info = pipeline.run([load_items, append_items], loader_file_format=destination_config.file_format) + info = pipeline.run( + [load_items, append_items], loader_file_format=destination_config.file_format + ) assert_load_info(info) state_records += increase_state_loads(info) dlt_loads = increase_loads(dlt_loads) @@ -97,7 +126,7 @@ def append_items(): "items__sub_items__sub_sub_items": 120, "_dlt_pipeline_state": state_records, "_dlt_loads": dlt_loads, - "_dlt_version": dlt_versions + "_dlt_version": dlt_versions, } # check trace @@ -105,36 +134,47 @@ def append_items(): "append_items": 12, "items": 120, "items__sub_items": 240, - "items__sub_items__sub_sub_items": 120 + "items__sub_items__sub_sub_items": 120, } - # check we really have the replaced data in our destination - table_dicts = load_tables_to_dicts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - assert {x for i,x in enumerate(range(1000, 1120), 1)} == {int(x["id"]) for x in table_dicts["items"]} - assert {x for i,x in enumerate(range(2000, 2000+120), 1)}.union({x for i,x in 
enumerate(range(3000, 3000+120), 1)}) == {int(x["id"]) for x in table_dicts["items__sub_items"]} - assert {x for i,x in enumerate(range(4000, 4120), 1)} == {int(x["id"]) for x in table_dicts["items__sub_items__sub_sub_items"]} + table_dicts = load_tables_to_dicts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) + assert {x for i, x in enumerate(range(1000, 1120), 1)} == { + int(x["id"]) for x in table_dicts["items"] + } + assert {x for i, x in enumerate(range(2000, 2000 + 120), 1)}.union( + {x for i, x in enumerate(range(3000, 3000 + 120), 1)} + ) == {int(x["id"]) for x in table_dicts["items__sub_items"]} + assert {x for i, x in enumerate(range(4000, 4120), 1)} == { + int(x["id"]) for x in table_dicts["items__sub_items__sub_sub_items"] + } # we need to test that destination tables including child tables are cleared when we yield none from the resource @dlt.resource(name="items", write_disposition="replace", primary_key="id") def load_items_none(): yield - info = pipeline.run([load_items_none, append_items], loader_file_format=destination_config.file_format) + info = pipeline.run( + [load_items_none, append_items], loader_file_format=destination_config.file_format + ) assert_load_info(info) state_records += increase_state_loads(info) dlt_loads = increase_loads(dlt_loads) # table and child tables should be cleared table_counts = load_table_counts(pipeline, *pipeline.default_schema.tables.keys()) - assert norm_table_counts(table_counts, "items__sub_items", "items__sub_items__sub_sub_items") == { + assert norm_table_counts( + table_counts, "items__sub_items", "items__sub_items__sub_sub_items" + ) == { "append_items": 36, "items": 0, "items__sub_items": 0, "items__sub_items__sub_sub_items": 0, "_dlt_pipeline_state": state_records, "_dlt_loads": dlt_loads, - "_dlt_version": dlt_versions + "_dlt_version": dlt_versions, } # check trace assert pipeline.last_trace.last_normalize_info.row_counts == { @@ -146,8 +186,12 @@ def load_items_none(): # drop_active_pipeline_data() # create a pipeline with different name but loading to the same dataset as above - this is to provoke truncating non existing tables - pipeline_2 = destination_config.setup_pipeline("test_replace_strategies_2", dataset_name=dataset_name) - info = pipeline_2.run(load_items, table_name="items_copy", loader_file_format=destination_config.file_format) + pipeline_2 = destination_config.setup_pipeline( + "test_replace_strategies_2", dataset_name=dataset_name + ) + info = pipeline_2.run( + load_items, table_name="items_copy", loader_file_format=destination_config.file_format + ) assert_load_info(info) new_state_records = increase_state_loads(info) assert new_state_records == 1 @@ -158,7 +202,7 @@ def load_items_none(): "items_copy": 120, "items_copy__sub_items": 240, "items_copy__sub_items__sub_sub_items": 120, - "_dlt_pipeline_state": 1 + "_dlt_pipeline_state": 1, } info = pipeline_2.run(append_items, loader_file_format=destination_config.file_format) @@ -176,7 +220,7 @@ def load_items_none(): "items_copy__sub_items__sub_sub_items": 120, "_dlt_pipeline_state": state_records + 1, "_dlt_loads": dlt_loads, - "_dlt_version": increase_loads(dlt_versions) + "_dlt_version": increase_loads(dlt_versions), } # check trace assert pipeline_2.last_trace.last_normalize_info.row_counts == { @@ -185,50 +229,56 @@ def load_items_none(): # old pipeline -> shares completed loads and versions table table_counts = load_table_counts(pipeline, *pipeline.default_schema.tables.keys()) - assert norm_table_counts(table_counts, 
"items__sub_items", "items__sub_items__sub_sub_items") == { + assert norm_table_counts( + table_counts, "items__sub_items", "items__sub_items__sub_sub_items" + ) == { "append_items": 48, "items": 0, "items__sub_items": 0, "items__sub_items__sub_sub_items": 0, "_dlt_pipeline_state": state_records + 1, "_dlt_loads": dlt_loads, # next load - "_dlt_version": increase_loads(dlt_versions) # new table name -> new schema + "_dlt_version": increase_loads(dlt_versions), # new table name -> new schema } -@pytest.mark.parametrize("destination_config", destinations_configs(local_filesystem_configs=True, default_staging_configs=True, default_sql_configs=True), ids=lambda x: x.name) + +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + local_filesystem_configs=True, default_staging_configs=True, default_sql_configs=True + ), + ids=lambda x: x.name, +) @pytest.mark.parametrize("replace_strategy", REPLACE_STRATEGIES) -def test_replace_table_clearing(destination_config: DestinationTestConfiguration,replace_strategy: str) -> None: +def test_replace_table_clearing( + destination_config: DestinationTestConfiguration, replace_strategy: str +) -> None: if not destination_config.supports_merge and replace_strategy != "truncate-and-insert": - pytest.skip(f"Destination {destination_config.name} does not support merge and thus {replace_strategy}") + pytest.skip( + f"Destination {destination_config.name} does not support merge and thus" + f" {replace_strategy}" + ) # use staging tables for replace - os.environ['DESTINATION__REPLACE_STRATEGY'] = replace_strategy + os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy - pipeline = destination_config.setup_pipeline("test_replace_table_clearing", dataset_name="test_replace_table_clearing", full_refresh=True) + pipeline = destination_config.setup_pipeline( + "test_replace_table_clearing", dataset_name="test_replace_table_clearing", full_refresh=True + ) @dlt.resource(name="main_resource", write_disposition="replace", primary_key="id") def items_with_subitems(): data = { "id": 1, "name": "item", - "sub_items": [{ - "id": 101, - "name": "sub item 101" - },{ - "id": 101, - "name": "sub item 102" - }] + "sub_items": [{"id": 101, "name": "sub item 101"}, {"id": 101, "name": "sub item 102"}], } yield dlt.mark.with_table_name(data, "items") yield dlt.mark.with_table_name(data, "other_items") @dlt.resource(name="main_resource", write_disposition="replace", primary_key="id") def items_without_subitems(): - data = [{ - "id": 1, - "name": "item", - "sub_items": [] - }] + data = [{"id": 1, "name": "item", "sub_items": []}] yield dlt.mark.with_table_name(data, "items") yield dlt.mark.with_table_name(data, "other_items") @@ -236,17 +286,16 @@ def items_without_subitems(): def items_with_subitems_yield_none(): yield None yield None - data = [{ - "id": 1, - "name": "item", - "sub_items": [{ - "id": 101, - "name": "sub item 101" - },{ - "id": 101, - "name": "sub item 102" - }] - }] + data = [ + { + "id": 1, + "name": "item", + "sub_items": [ + {"id": 101, "name": "sub item 101"}, + {"id": 101, "name": "sub item 102"}, + ], + } + ] yield dlt.mark.with_table_name(data, "items") yield dlt.mark.with_table_name(data, "other_items") yield None @@ -257,13 +306,7 @@ def static_items(): yield { "id": 1, "name": "item", - "sub_items": [{ - "id": 101, - "name": "sub item 101" - },{ - "id": 101, - "name": "sub item 102" - }] + "sub_items": [{"id": 101, "name": "sub item 101"}, {"id": 101, "name": "sub item 102"}], } @dlt.resource(name="main_resource", 
write_disposition="replace", primary_key="id") @@ -271,8 +314,12 @@ def yield_none(): yield # regular call - pipeline.run([items_with_subitems, static_items], loader_file_format=destination_config.file_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + pipeline.run( + [items_with_subitems, static_items], loader_file_format=destination_config.file_format + ) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 1 assert table_counts["items__sub_items"] == 2 assert table_counts["other_items"] == 1 @@ -287,12 +334,14 @@ def yield_none(): "other_items__sub_items": 2, "static_items": 1, "static_items__sub_items": 2, - "_dlt_pipeline_state": 1 + "_dlt_pipeline_state": 1, } # see if child table gets cleared pipeline.run(items_without_subitems, loader_file_format=destination_config.file_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 1 assert table_counts.get("items__sub_items", 0) == 0 assert table_counts["other_items"] == 1 @@ -300,15 +349,14 @@ def yield_none(): assert table_counts["static_items"] == 1 assert table_counts["static_items__sub_items"] == 2 # check trace - assert pipeline.last_trace.last_normalize_info.row_counts == { - "items": 1, - "other_items": 1 - } + assert pipeline.last_trace.last_normalize_info.row_counts == {"items": 1, "other_items": 1} # see if yield none clears everything pipeline.run(items_with_subitems, loader_file_format=destination_config.file_format) pipeline.run(yield_none, loader_file_format=destination_config.file_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts.get("items", 0) == 0 assert table_counts.get("items__sub_items", 0) == 0 assert table_counts.get("other_items", 0) == 0 @@ -316,14 +364,13 @@ def yield_none(): assert table_counts["static_items"] == 1 assert table_counts["static_items__sub_items"] == 2 # check trace - assert pipeline.last_trace.last_normalize_info.row_counts == { - "items": 0, - "other_items": 0 - } + assert pipeline.last_trace.last_normalize_info.row_counts == {"items": 0, "other_items": 0} # see if yielding something next to other none entries still goes into db pipeline.run(items_with_subitems_yield_none, loader_file_format=destination_config.file_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 1 assert table_counts["items__sub_items"] == 2 assert table_counts["other_items"] == 1 diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index 1ebb3378a6..6ee013f05d 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -21,7 +21,11 @@ from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V8, yml_case_path as common_yml_case_path from tests.common.configuration.utils import environment from tests.load.pipeline.utils import assert_query_data, drop_active_pipeline_data -from 
tests.load.utils import destinations_configs, DestinationTestConfiguration, get_normalized_dataset_name +from tests.load.utils import ( + destinations_configs, + DestinationTestConfiguration, + get_normalized_dataset_name, +) @pytest.fixture(autouse=True) @@ -31,10 +35,17 @@ def duckdb_pipeline_location() -> None: del os.environ["DESTINATION__DUCKDB__CREDENTIALS"] -@pytest.mark.parametrize("destination_config", destinations_configs(default_staging_configs=True, default_sql_configs=True, default_vector_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs( + default_staging_configs=True, default_sql_configs=True, default_vector_configs=True + ), + ids=lambda x: x.name, +) def test_restore_state_utils(destination_config: DestinationTestConfiguration) -> None: - - p = destination_config.setup_pipeline(pipeline_name="pipe_" + uniq_id(), dataset_name="state_test_" + uniq_id()) + p = destination_config.setup_pipeline( + pipeline_name="pipe_" + uniq_id(), dataset_name="state_test_" + uniq_id() + ) schema = Schema("state") # inject schema into pipeline, don't do it in production @@ -56,11 +67,13 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - initial_state["_local"]["_last_extracted_at"] = pendulum.now() # add _dlt_id and _dlt_load_id resource = state_resource(initial_state) - resource.apply_hints(columns={ - "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, - "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text", "nullable": False}, - **STATE_TABLE_COLUMNS - }) + resource.apply_hints( + columns={ + "_dlt_id": {"name": "_dlt_id", "data_type": "text", "nullable": False}, + "_dlt_load_id": {"name": "_dlt_load_id", "data_type": "text", "nullable": False}, + **STATE_TABLE_COLUMNS, + } + ) schema.update_table(schema.normalize_table_identifiers(resource.compute_table_schema())) # do not bump version here or in sync_schema, dlt won't recognize that schema changed and it won't update it in storage # so dlt in normalize stage infers _state_version table again but with different column order and the column order in schema is different @@ -137,21 +150,37 @@ def test_restore_state_utils(destination_config: DestinationTestConfiguration) - assert new_stored_state["_state_version"] + 1 == new_stored_state_2["_state_version"] -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_vector_configs=True), ids=lambda x: x.name) -def test_silently_skip_on_invalid_credentials(destination_config: DestinationTestConfiguration, environment: Any) -> None: +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, default_vector_configs=True), + ids=lambda x: x.name, +) +def test_silently_skip_on_invalid_credentials( + destination_config: DestinationTestConfiguration, environment: Any +) -> None: environment["CREDENTIALS"] = "postgres://loader:password@localhost:5432/dlt_data" - environment["DESTINATION__BIGQUERY__CREDENTIALS"] = '{"project_id": "chat-analytics-","client_email": "loader@chat-analytics-317513","private_key": "-----BEGIN PRIVATE KEY-----\\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD"}' + environment["DESTINATION__BIGQUERY__CREDENTIALS"] = ( + '{"project_id": "chat-analytics-","client_email":' + ' "loader@chat-analytics-317513","private_key": "-----BEGIN PRIVATE' + ' KEY-----\\nMIIEuwIBADANBgkqhkiG9w0BAQEFAASCBKUwggShAgEAAoIBAQCNEN0bL39HmD"}' + ) pipeline_name = "pipe_" + uniq_id() - 
dataset_name="state_test_" + uniq_id() + dataset_name = "state_test_" + uniq_id() # NOTE: we are not restoring the state in __init__ anymore but the test should stay: init should not fail on lack of credentials destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_vector_configs=True), ids=lambda x: x.name) -@pytest.mark.parametrize('use_single_dataset', [True, False]) -def test_get_schemas_from_destination(destination_config: DestinationTestConfiguration, use_single_dataset: bool) -> None: +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, default_vector_configs=True), + ids=lambda x: x.name, +) +@pytest.mark.parametrize("use_single_dataset", [True, False]) +def test_get_schemas_from_destination( + destination_config: DestinationTestConfiguration, use_single_dataset: bool +) -> None: pipeline_name = "pipe_" + uniq_id() - dataset_name="state_test_" + uniq_id() + dataset_name = "state_test_" + uniq_id() p = destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) p.config.use_single_dataset = use_single_dataset @@ -164,23 +193,29 @@ def _make_dn_name(schema_name: str) -> str: default_schema = Schema("state") p._inject_schema(default_schema) - with p.destination_client() as job_client: + with p.destination_client() as job_client: # just sync schema without name - will use default schema p.sync_schema() - assert get_normalized_dataset_name(job_client) == default_schema.naming.normalize_table_identifier(dataset_name) + assert get_normalized_dataset_name( + job_client + ) == default_schema.naming.normalize_table_identifier(dataset_name) schema_two = Schema("two") with p._get_destination_clients(schema_two)[0] as job_client: # use the job_client to do that job_client.initialize_storage() job_client.update_stored_schema() # this may be a separate dataset depending in use_single_dataset setting - assert get_normalized_dataset_name(job_client) == schema_two.naming.normalize_table_identifier(_make_dn_name("two")) + assert get_normalized_dataset_name( + job_client + ) == schema_two.naming.normalize_table_identifier(_make_dn_name("two")) schema_three = Schema("three") p._inject_schema(schema_three) with p._get_destination_clients(schema_three)[0] as job_client: # sync schema with a name p.sync_schema(schema_three.name) - assert get_normalized_dataset_name(job_client) == schema_three.naming.normalize_table_identifier(_make_dn_name("three")) + assert get_normalized_dataset_name( + job_client + ) == schema_three.naming.normalize_table_identifier(_make_dn_name("three")) # wipe and restore p._wipe_working_folder() @@ -217,11 +252,15 @@ def _make_dn_name(schema_name: str) -> str: assert len(restored_schemas) == 3 -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_vector_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, default_vector_configs=True), + ids=lambda x: x.name, +) def test_restore_state_pipeline(destination_config: DestinationTestConfiguration) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "True" pipeline_name = "pipe_" + uniq_id() - dataset_name="state_test_" + uniq_id() + dataset_name = "state_test_" + uniq_id() p = destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) def some_data_gen(param: str) -> 
Any: @@ -283,7 +322,10 @@ def some_data(): assert p.default_schema_name == "default" assert set(p.schema_names) == set(["default", "two", "three", "four"]) assert p.state["sources"] == { - "default": {'state1': 'state1', 'state2': 'state2'}, "two": {'state3': 'state3'}, "three": {'state4': 'state4'}, "four": {"state5": JSON_TYPED_DICT_DECODED} + "default": {"state1": "state1", "state2": "state2"}, + "two": {"state3": "state3"}, + "three": {"state4": "state4"}, + "four": {"state5": JSON_TYPED_DICT_DECODED}, } for schema in p.schemas.values(): normalized_id = schema.naming.normalize_table_identifier("some_data") @@ -294,7 +336,9 @@ def some_data(): # full refresh will not restore pipeline even if requested p._wipe_working_folder() - p = destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name, full_refresh=True) + p = destination_config.setup_pipeline( + pipeline_name=pipeline_name, dataset_name=dataset_name, full_refresh=True + ) p.run() assert p.default_schema_name is None drop_active_pipeline_data() @@ -314,11 +358,15 @@ def some_data(): assert restored_state["_state_version"] == orig_state["_state_version"] # second run will not restore - p._inject_schema(Schema("second")) # this will modify state, run does not sync if states are identical + p._inject_schema( + Schema("second") + ) # this will modify state, run does not sync if states are identical assert p.state["_state_version"] > orig_state["_state_version"] # print(p.state) p.run() - assert set(p.schema_names) == set(["default", "two", "three", "second", "four"]) # we keep our local copy + assert set(p.schema_names) == set( + ["default", "two", "three", "second", "four"] + ) # we keep our local copy # clear internal flag and decrease state version so restore triggers state = p.state state["_state_version"] -= 1 @@ -328,10 +376,14 @@ def some_data(): assert set(p.schema_names) == set(["default", "two", "three", "four"]) -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_vector_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, default_vector_configs=True), + ids=lambda x: x.name, +) def test_ignore_state_unfinished_load(destination_config: DestinationTestConfiguration) -> None: pipeline_name = "pipe_" + uniq_id() - dataset_name="state_test_" + uniq_id() + dataset_name = "state_test_" + uniq_id() p = destination_config.setup_pipeline(pipeline_name=pipeline_name, dataset_name=dataset_name) @dlt.resource @@ -354,18 +406,24 @@ def complete_package_mock(self, load_id: str, schema: Schema, aborted: bool = Fa assert state is None -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_vector_configs=True), ids=lambda x: x.name) -def test_restore_schemas_while_import_schemas_exist(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, default_vector_configs=True), + ids=lambda x: x.name, +) +def test_restore_schemas_while_import_schemas_exist( + destination_config: DestinationTestConfiguration, +) -> None: # restored schema should attach itself to imported schema and it should not get overwritten import_schema_path = os.path.join(TEST_STORAGE_ROOT, "schemas", "import") export_schema_path = os.path.join(TEST_STORAGE_ROOT, "schemas", "export") pipeline_name = "pipe_" + uniq_id() - dataset_name="state_test_" + uniq_id() + 
dataset_name = "state_test_" + uniq_id() p = destination_config.setup_pipeline( pipeline_name=pipeline_name, dataset_name=dataset_name, import_schema_path=import_schema_path, - export_schema_path=export_schema_path + export_schema_path=export_schema_path, ) prepare_import_folder(p) # make sure schema got imported @@ -395,10 +453,14 @@ def test_restore_schemas_while_import_schemas_exist(destination_config: Destinat p = dlt.pipeline( pipeline_name=pipeline_name, import_schema_path=import_schema_path, - export_schema_path=export_schema_path + export_schema_path=export_schema_path, ) # use run to get changes - p.run(destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name) + p.run( + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name=dataset_name, + ) schema = p.schemas["ethereum"] assert normalized_labels in schema.tables assert normalized_annotations in schema.tables @@ -420,11 +482,14 @@ def test_restore_change_dataset_and_destination(destination_name: str) -> None: pass -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, default_vector_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, default_vector_configs=True), + ids=lambda x: x.name, +) def test_restore_state_parallel_changes(destination_config: DestinationTestConfiguration) -> None: - pipeline_name = "pipe_" + uniq_id() - dataset_name="state_test_" + uniq_id() + dataset_name = "state_test_" + uniq_id() destination_config.setup() p = dlt.pipeline(pipeline_name=pipeline_name) @@ -437,16 +502,26 @@ def some_data(param: str) -> Any: data1 = some_data("state1") data1._pipe.name = "state1_data" - p.run([data1, some_data("state2")], schema=Schema("default"), destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name) + p.run( + [data1, some_data("state2")], + schema=Schema("default"), + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name=dataset_name, + ) orig_state = p.state # create a production pipeline in separate pipelines_dir production_p = dlt.pipeline(pipeline_name=pipeline_name, pipelines_dir=TEST_STORAGE_ROOT) - production_p.run(destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name) + production_p.run( + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name=dataset_name, + ) assert production_p.default_schema_name == "default" prod_state = production_p.state - assert prod_state["sources"] == {"default": {'state1': 'state1', 'state2': 'state2'}} + assert prod_state["sources"] == {"default": {"state1": "state1", "state2": "state2"}} assert prod_state["_state_version"] == orig_state["_state_version"] # generate data on production that modifies the schema but not state data2 = some_data("state1") @@ -505,18 +580,22 @@ def some_data(param: str) -> Any: state_table = client.make_qualified_table_name(p.default_schema.state_table_name) assert_query_data( - p, - f"SELECT version FROM {state_table} ORDER BY created_at DESC", - [5, 4, 4, 3, 2] + p, f"SELECT version FROM {state_table} ORDER BY created_at DESC", [5, 4, 4, 3, 2] ) except SqlClientNotAvailable: pytest.skip(f"destination {destination_config.destination} does not support sql client") -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True, 
default_vector_configs=True), ids=lambda x: x.name) -def test_reset_pipeline_on_deleted_dataset(destination_config: DestinationTestConfiguration) -> None: +@pytest.mark.parametrize( + "destination_config", + destinations_configs(default_sql_configs=True, default_vector_configs=True), + ids=lambda x: x.name, +) +def test_reset_pipeline_on_deleted_dataset( + destination_config: DestinationTestConfiguration, +) -> None: pipeline_name = "pipe_" + uniq_id() - dataset_name="state_test_" + uniq_id() + dataset_name = "state_test_" + uniq_id() destination_config.setup() p = dlt.pipeline(pipeline_name=pipeline_name) @@ -527,7 +606,13 @@ def some_data(param: str) -> Any: data4 = some_data("state4") data4.apply_hints(table_name="state1_data4") - p.run(data4, schema=Schema("sch1"), destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name) + p.run( + data4, + schema=Schema("sch1"), + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name=dataset_name, + ) data5 = some_data("state4") data5.apply_hints(table_name="state1_data5") p.run(data5, schema=Schema("sch2")) @@ -550,7 +635,13 @@ def some_data(param: str) -> Any: p.config.restore_from_destination = False data4 = some_data("state4") data4.apply_hints(table_name="state1_data4") - p.run(data4, schema=Schema("sch1"), destination=destination_config.destination, staging=destination_config.staging, dataset_name=dataset_name) + p.run( + data4, + schema=Schema("sch1"), + destination=destination_config.destination, + staging=destination_config.staging, + dataset_name=dataset_name, + ) assert p.first_run is False assert p.state["_local"]["first_run"] is False # attach again to make the `run` method check the destination @@ -566,4 +657,7 @@ def some_data(param: str) -> Any: def prepare_import_folder(p: Pipeline) -> None: os.makedirs(p._schema_storage.config.import_schema_path, exist_ok=True) - shutil.copy(common_yml_case_path("schemas/eth/ethereum_schema_v5"), os.path.join(p._schema_storage.config.import_schema_path, "ethereum.schema.yaml")) + shutil.copy( + common_yml_case_path("schemas/eth/ethereum_schema_v5"), + os.path.join(p._schema_storage.config.import_schema_path, "ethereum.schema.yaml"), + ) diff --git a/tests/load/pipeline/test_stage_loading.py b/tests/load/pipeline/test_stage_loading.py index 9e2e28e5d5..de4a7f4c3b 100644 --- a/tests/load/pipeline/test_stage_loading.py +++ b/tests/load/pipeline/test_stage_loading.py @@ -8,16 +8,24 @@ from dlt.common.schema.typing import TDataType from tests.load.pipeline.test_merge_disposition import github -from tests.load.pipeline.utils import load_table_counts -from tests.pipeline.utils import assert_load_info -from tests.load.utils import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE_COLUMNS_SCHEMA, assert_all_data_types_row +from tests.load.pipeline.utils import load_table_counts +from tests.pipeline.utils import assert_load_info +from tests.load.utils import ( + TABLE_ROW_ALL_DATA_TYPES, + TABLE_UPDATE_COLUMNS_SCHEMA, + assert_all_data_types_row, +) from tests.cases import table_update_and_row from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -@dlt.resource(table_name="issues", write_disposition="merge", primary_key="id", merge_key=("node_id", "url")) +@dlt.resource( + table_name="issues", write_disposition="merge", primary_key="id", merge_key=("node_id", "url") +) def load_modified_issues(): - with open("tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8") as 
f: + with open( + "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8" + ) as f: issues = json.load(f) # change 2 issues @@ -30,10 +38,13 @@ def load_modified_issues(): yield from issues -@pytest.mark.parametrize("destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name +) def test_staging_load(destination_config: DestinationTestConfiguration) -> None: - - pipeline = destination_config.setup_pipeline(pipeline_name='test_stage_loading_5', dataset_name="test_staging_load" + uniq_id()) + pipeline = destination_config.setup_pipeline( + pipeline_name="test_stage_loading_5", dataset_name="test_staging_load" + uniq_id() + ) info = pipeline.run(github(), loader_file_format=destination_config.file_format) assert_load_info(info) @@ -44,12 +55,41 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: # we have 4 parquet and 4 reference jobs plus one merge job num_jobs = 4 + 4 + 1 if destination_config.supports_merge else 4 + 4 assert len(package_info.jobs["completed_jobs"]) == num_jobs - assert len([x for x in package_info.jobs["completed_jobs"] if x.job_file_info.file_format == "reference"]) == 4 - assert len([x for x in package_info.jobs["completed_jobs"] if x.job_file_info.file_format == destination_config.file_format]) == 4 + assert ( + len( + [ + x + for x in package_info.jobs["completed_jobs"] + if x.job_file_info.file_format == "reference" + ] + ) + == 4 + ) + assert ( + len( + [ + x + for x in package_info.jobs["completed_jobs"] + if x.job_file_info.file_format == destination_config.file_format + ] + ) + == 4 + ) if destination_config.supports_merge: - assert len([x for x in package_info.jobs["completed_jobs"] if x.job_file_info.file_format == "sql"]) == 1 + assert ( + len( + [ + x + for x in package_info.jobs["completed_jobs"] + if x.job_file_info.file_format == "sql" + ] + ) + == 1 + ) - initial_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + initial_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert initial_counts["issues"] == 100 # check item of first row in db @@ -62,7 +102,9 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: info = pipeline.run(load_modified_issues, loader_file_format=destination_config.file_format) assert_load_info(info) assert pipeline.default_schema.tables["issues"]["write_disposition"] == "merge" - merge_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + merge_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert merge_counts == initial_counts # check changes where merged in @@ -73,39 +115,62 @@ def test_staging_load(destination_config: DestinationTestConfiguration) -> None: assert rows[0][0] == 300 # test append - info = pipeline.run(github().load_issues, write_disposition="append", loader_file_format=destination_config.file_format) + info = pipeline.run( + github().load_issues, + write_disposition="append", + loader_file_format=destination_config.file_format, + ) assert_load_info(info) assert pipeline.default_schema.tables["issues"]["write_disposition"] == "append" # the counts of all tables must be double - append_counts = load_table_counts(pipeline, *[t["name"] for t in 
pipeline.default_schema.data_tables()]) - assert {k:v*2 for k, v in initial_counts.items()} == append_counts + append_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) + assert {k: v * 2 for k, v in initial_counts.items()} == append_counts # test replace - info = pipeline.run(github().load_issues, write_disposition="replace", loader_file_format=destination_config.file_format) + info = pipeline.run( + github().load_issues, + write_disposition="replace", + loader_file_format=destination_config.file_format, + ) assert_load_info(info) assert pipeline.default_schema.tables["issues"]["write_disposition"] == "replace" # the counts of all tables must be double - replace_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + replace_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert replace_counts == initial_counts -@pytest.mark.parametrize("destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(all_staging_configs=True), ids=lambda x: x.name +) def test_all_data_types(destination_config: DestinationTestConfiguration) -> None: - - pipeline = destination_config.setup_pipeline('test_stage_loading', dataset_name="test_all_data_types" + uniq_id()) + pipeline = destination_config.setup_pipeline( + "test_stage_loading", dataset_name="test_all_data_types" + uniq_id() + ) # Redshift parquet -> exclude col7_precision # redshift and athena, parquet and jsonl, exclude time types exclude_types: List[TDataType] = [] exclude_columns: List[str] = [] - if destination_config.destination in ("redshift", "athena") and destination_config.file_format in ('parquet', 'jsonl'): + if destination_config.destination in ( + "redshift", + "athena", + ) and destination_config.file_format in ("parquet", "jsonl"): # Redshift copy doesn't support TIME column exclude_types.append("time") - if destination_config.destination == "redshift" and destination_config.file_format in ("parquet", "jsonl"): + if destination_config.destination == "redshift" and destination_config.file_format in ( + "parquet", + "jsonl", + ): # Redshift can't load fixed width binary columns from parquet exclude_columns.append("col7_precision") - column_schemas, data_types = table_update_and_row(exclude_types=exclude_types, exclude_columns=exclude_columns) + column_schemas, data_types = table_update_and_row( + exclude_types=exclude_types, exclude_columns=exclude_columns + ) # bigquery cannot load into JSON fields from parquet if destination_config.file_format == "parquet": @@ -124,7 +189,7 @@ def test_all_data_types(destination_config: DestinationTestConfiguration) -> Non @dlt.resource(table_name="data_types", write_disposition="merge", columns=column_schemas) def my_resource(): nonlocal data_types - yield [data_types]*10 + yield [data_types] * 10 @dlt.source(max_table_nesting=0) def my_source(): @@ -138,13 +203,19 @@ def my_source(): assert len(db_rows) == 10 db_row = list(db_rows[0]) # parquet is not really good at inserting json, best we get are strings in JSON columns - parse_complex_strings = destination_config.file_format == "parquet" and destination_config.destination in ["redshift", "bigquery", "snowflake"] - allow_base64_binary = destination_config.file_format == "jsonl" and destination_config.destination in ["redshift"] + parse_complex_strings = ( + destination_config.file_format == 
"parquet" + and destination_config.destination in ["redshift", "bigquery", "snowflake"] + ) + allow_base64_binary = ( + destination_config.file_format == "jsonl" + and destination_config.destination in ["redshift"] + ) # content must equal assert_all_data_types_row( db_row[:-2], parse_complex_strings=parse_complex_strings, allow_base64_binary=allow_base64_binary, timestamp_precision=sql_client.capabilities.timestamp_precision, - schema=column_schemas + schema=column_schemas, ) diff --git a/tests/load/pipeline/test_write_disposition_changes.py b/tests/load/pipeline/test_write_disposition_changes.py index c88fd79588..11356cdd20 100644 --- a/tests/load/pipeline/test_write_disposition_changes.py +++ b/tests/load/pipeline/test_write_disposition_changes.py @@ -1,61 +1,80 @@ import pytest import dlt from typing import Any -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration, assert_data_table_counts +from tests.load.pipeline.utils import ( + destinations_configs, + DestinationTestConfiguration, + assert_data_table_counts, +) from tests.pipeline.utils import assert_load_info from dlt.pipeline.exceptions import PipelineStepFailed + def data_with_subtables(offset: int) -> Any: - for _, index in enumerate(range(offset, offset+100), 1): + for _, index in enumerate(range(offset, offset + 100), 1): yield { "id": index, "name": f"item {index}", - "sub_items": [{ - "id": index + 1000, - "name": f"sub item {index + 1000}" - }] + "sub_items": [{"id": index + 1000, "name": f"sub item {index + 1000}"}], } -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) + +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_switch_from_merge(destination_config: DestinationTestConfiguration): - pipeline = destination_config.setup_pipeline(pipeline_name='test_switch_from_merge', full_refresh=True) + pipeline = destination_config.setup_pipeline( + pipeline_name="test_switch_from_merge", full_refresh=True + ) - info = (pipeline.run(data_with_subtables(10), table_name="items", write_disposition="merge")) - assert_data_table_counts(pipeline, { - "items": 100, - "items__sub_items": 100 - }) - assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"]["items"] == {'_dlt_id': '_dlt_root_id'} + info = pipeline.run(data_with_subtables(10), table_name="items", write_disposition="merge") + assert_data_table_counts(pipeline, {"items": 100, "items__sub_items": 100}) + assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "items" + ] == {"_dlt_id": "_dlt_root_id"} - info = (pipeline.run(data_with_subtables(10), table_name="items", write_disposition="merge")) + info = pipeline.run(data_with_subtables(10), table_name="items", write_disposition="merge") assert_load_info(info) - assert_data_table_counts(pipeline, { - "items": 100 if destination_config.supports_merge else 200, - "items__sub_items": 100 if destination_config.supports_merge else 200 - }) - assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"]["items"] == {'_dlt_id': '_dlt_root_id'} - - info = (pipeline.run(data_with_subtables(10), table_name="items", write_disposition="append")) + assert_data_table_counts( + pipeline, + { + "items": 100 if destination_config.supports_merge else 200, + "items__sub_items": 100 if destination_config.supports_merge else 200, + }, + ) + assert 
pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "items" + ] == {"_dlt_id": "_dlt_root_id"} + + info = pipeline.run(data_with_subtables(10), table_name="items", write_disposition="append") assert_load_info(info) - assert_data_table_counts(pipeline, { - "items": 200 if destination_config.supports_merge else 300, - "items__sub_items": 200 if destination_config.supports_merge else 300 - }) - assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"]["items"] == {'_dlt_id': '_dlt_root_id'} - - info = (pipeline.run(data_with_subtables(10), table_name="items", write_disposition="replace")) + assert_data_table_counts( + pipeline, + { + "items": 200 if destination_config.supports_merge else 300, + "items__sub_items": 200 if destination_config.supports_merge else 300, + }, + ) + assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "items" + ] == {"_dlt_id": "_dlt_root_id"} + + info = pipeline.run(data_with_subtables(10), table_name="items", write_disposition="replace") assert_load_info(info) - assert_data_table_counts(pipeline, { - "items": 100, - "items__sub_items": 100 - }) - assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"]["items"] == {'_dlt_id': '_dlt_root_id'} + assert_data_table_counts(pipeline, {"items": 100, "items__sub_items": 100}) + assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "items" + ] == {"_dlt_id": "_dlt_root_id"} -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) @pytest.mark.parametrize("with_root_key", [True, False]) def test_switch_to_merge(destination_config: DestinationTestConfiguration, with_root_key: bool): - pipeline = destination_config.setup_pipeline(pipeline_name='test_switch_to_merge', full_refresh=True) + pipeline = destination_config.setup_pipeline( + pipeline_name="test_switch_to_merge", full_refresh=True + ) @dlt.resource() def resource(): @@ -68,16 +87,17 @@ def source(): s = source() s.root_key = with_root_key - info = (pipeline.run(s, table_name="items", write_disposition="append")) - assert_data_table_counts(pipeline, { - "items": 100, - "items__sub_items": 100 - }) + info = pipeline.run(s, table_name="items", write_disposition="append") + assert_data_table_counts(pipeline, {"items": 100, "items__sub_items": 100}) if with_root_key: - assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["root"] == {'_dlt_id': '_dlt_root_id'} + assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"][ + "root" + ] == {"_dlt_id": "_dlt_root_id"} else: - assert "propagation" not in pipeline.default_schema._normalizers_config["json"].get("config", {}) + assert "propagation" not in pipeline.default_schema._normalizers_config["json"].get( + "config", {} + ) # without a root key this will fail, it is expected if not with_root_key and destination_config.supports_merge: @@ -85,11 +105,15 @@ def source(): pipeline.run(s, table_name="items", write_disposition="merge") return - info = (pipeline.run(s, table_name="items", write_disposition="merge")) + info = pipeline.run(s, table_name="items", write_disposition="merge") assert_load_info(info) - assert_data_table_counts(pipeline, { - "items": 100 if destination_config.supports_merge else 
200, - "items__sub_items": 100 if destination_config.supports_merge else 200, - }) - assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"]["items"] == {'_dlt_id': '_dlt_root_id'} - + assert_data_table_counts( + pipeline, + { + "items": 100 if destination_config.supports_merge else 200, + "items__sub_items": 100 if destination_config.supports_merge else 200, + }, + ) + assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["tables"][ + "items" + ] == {"_dlt_id": "_dlt_root_id"} diff --git a/tests/load/pipeline/utils.py b/tests/load/pipeline/utils.py index 94fbc80cf8..b098f7bf0d 100644 --- a/tests/load/pipeline/utils.py +++ b/tests/load/pipeline/utils.py @@ -7,8 +7,15 @@ from dlt.common.configuration.container import Container from dlt.common.pipeline import LoadInfo, PipelineContext -from tests.pipeline.utils import (load_table_counts, load_data_table_counts, assert_data_table_counts, load_file, - load_files, load_tables_to_dicts, load_table_distinct_counts) +from tests.pipeline.utils import ( + load_table_counts, + load_data_table_counts, + assert_data_table_counts, + load_file, + load_files, + load_tables_to_dicts, + load_table_distinct_counts, +) from tests.load.utils import DestinationTestConfiguration, destinations_configs if TYPE_CHECKING: @@ -64,22 +71,47 @@ def _drop_dataset(schema_name: str) -> None: def _is_filesystem(p: dlt.Pipeline) -> bool: if not p.destination: return False - return p.destination.name == 'filesystem' + return p.destination.name == "filesystem" -def assert_table(p: dlt.Pipeline, table_name: str, table_data: List[Any], schema_name: str = None, info: LoadInfo = None) -> None: +def assert_table( + p: dlt.Pipeline, + table_name: str, + table_data: List[Any], + schema_name: str = None, + info: LoadInfo = None, +) -> None: func = _assert_table_fs if _is_filesystem(p) else _assert_table_sql func(p, table_name, table_data, schema_name, info) -def _assert_table_sql(p: dlt.Pipeline, table_name: str, table_data: List[Any], schema_name: str = None, info: LoadInfo = None) -> None: +def _assert_table_sql( + p: dlt.Pipeline, + table_name: str, + table_data: List[Any], + schema_name: str = None, + info: LoadInfo = None, +) -> None: with p.sql_client(schema_name=schema_name) as c: table_name = c.make_qualified_table_name(table_name) # Implement NULLS FIRST sort in python - assert_query_data(p, f"SELECT * FROM {table_name} ORDER BY 1", table_data, schema_name, info, sort_key=lambda row: row[0] is not None) - - -def _assert_table_fs(p: dlt.Pipeline, table_name: str, table_data: List[Any], schema_name: str = None, info: LoadInfo = None) -> None: + assert_query_data( + p, + f"SELECT * FROM {table_name} ORDER BY 1", + table_data, + schema_name, + info, + sort_key=lambda row: row[0] is not None, + ) + + +def _assert_table_fs( + p: dlt.Pipeline, + table_name: str, + table_data: List[Any], + schema_name: str = None, + info: LoadInfo = None, +) -> None: """Assert table is loaded to filesystem destination""" client: FilesystemClient = p.destination_client(schema_name) # type: ignore[assignment] # get table directory @@ -99,7 +131,14 @@ def select_data(p: dlt.Pipeline, sql: str, schema_name: str = None) -> List[Sequ return list(cur.fetchall()) -def assert_query_data(p: dlt.Pipeline, sql: str, table_data: List[Any], schema_name: str = None, info: LoadInfo = None, sort_key: Callable[[Any], Any] = None) -> None: +def assert_query_data( + p: dlt.Pipeline, + sql: str, + table_data: List[Any], + schema_name: str = None, + 
info: LoadInfo = None, + sort_key: Callable[[Any], Any] = None, +) -> None: """Asserts that query selecting single column of values matches `table_data`. If `info` is provided, second column must contain one of load_ids in `info` Args: diff --git a/tests/load/postgres/test_postgres_client.py b/tests/load/postgres/test_postgres_client.py index 65ac61cfd4..83b59bddaf 100644 --- a/tests/load/postgres/test_postgres_client.py +++ b/tests/load/postgres/test_postgres_client.py @@ -43,14 +43,20 @@ def test_postgres_credentials_defaults() -> None: def test_postgres_credentials_native_value(environment) -> None: with pytest.raises(ConfigFieldMissingException): - resolve_configuration(PostgresCredentials(), explicit_value="postgres://loader@localhost/dlt_data") + resolve_configuration( + PostgresCredentials(), explicit_value="postgres://loader@localhost/dlt_data" + ) # set password via env os.environ["CREDENTIALS__PASSWORD"] = "pass" - c = resolve_configuration(PostgresCredentials(), explicit_value="postgres://loader@localhost/dlt_data") + c = resolve_configuration( + PostgresCredentials(), explicit_value="postgres://loader@localhost/dlt_data" + ) assert c.is_resolved() assert c.password == "pass" # but if password is specified - it is final - c = resolve_configuration(PostgresCredentials(), explicit_value="postgres://loader:loader@localhost/dlt_data") + c = resolve_configuration( + PostgresCredentials(), explicit_value="postgres://loader:loader@localhost/dlt_data" + ) assert c.is_resolved() assert c.password == "loader" @@ -68,14 +74,32 @@ def test_wei_value(client: PostgresClient, file_storage: FileStorage) -> None: user_table_name = prepare_table(client) # postgres supports EVM precisions - insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, parse_data__metadata__rasa_x_id)\nVALUES\n" - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', {Wei.from_int256(2*256-1)});" - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) - - insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, parse_data__metadata__rasa_x_id)\nVALUES\n" - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', {Wei.from_int256(2*256-1, 18)});" - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) - - insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, parse_data__metadata__rasa_x_id)\nVALUES\n" - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', {Wei.from_int256(2*256-1, 78)});" - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) + insert_sql = ( + "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp," + " parse_data__metadata__rasa_x_id)\nVALUES\n" + ) + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', {Wei.from_int256(2*256-1)});" + ) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) + + insert_sql = ( + "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp," + " parse_data__metadata__rasa_x_id)\nVALUES\n" + ) + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', {Wei.from_int256(2*256-1, 18)});" + ) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) + + insert_sql = ( + "INSERT INTO 
{}(_dlt_id, _dlt_root_id, sender_id, timestamp," + " parse_data__metadata__rasa_x_id)\nVALUES\n" + ) + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', {Wei.from_int256(2*256-1, 78)});" + ) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 1d6965c0c0..68e6702b75 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -6,10 +6,14 @@ from dlt.common.schema import Schema from dlt.destinations.impl.postgres.postgres import PostgresClient -from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration, PostgresCredentials +from dlt.destinations.impl.postgres.configuration import ( + PostgresClientConfiguration, + PostgresCredentials, +) from tests.load.utils import TABLE_UPDATE + @pytest.fixture def schema() -> Schema: return Schema("event") @@ -18,7 +22,12 @@ def schema() -> Schema: @pytest.fixture def client(schema: Schema) -> PostgresClient: # return client without opening connection - return PostgresClient(schema, PostgresClientConfiguration(dataset_name="test_" + uniq_id(), credentials=PostgresCredentials())) + return PostgresClient( + schema, + PostgresClientConfiguration( + dataset_name="test_" + uniq_id(), credentials=PostgresCredentials() + ), + ) def test_create_table(client: PostgresClient) -> None: @@ -89,7 +98,14 @@ def test_create_table_with_hints(client: PostgresClient) -> None: assert '"col4" timestamp with time zone NOT NULL' in sql # same thing without indexes - client = PostgresClient(client.schema, PostgresClientConfiguration(dataset_name="test_" + uniq_id(), create_indexes=False, credentials=PostgresCredentials())) + client = PostgresClient( + client.schema, + PostgresClientConfiguration( + dataset_name="test_" + uniq_id(), + create_indexes=False, + credentials=PostgresCredentials(), + ), + ) sql = client._get_table_update_sql("event_test_table", mod_update, False)[0] sqlfluff.parse(sql, dialect="postgres") assert '"col2" double precision NOT NULL' in sql diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py index 760eec4631..c24c309ca6 100644 --- a/tests/load/qdrant/test_pipeline.py +++ b/tests/load/qdrant/test_pipeline.py @@ -10,6 +10,7 @@ from tests.pipeline.utils import assert_load_info from tests.load.qdrant.utils import drop_active_pipeline_data, assert_collection + @pytest.fixture(autouse=True) def drop_qdrant_data() -> Iterator[None]: yield @@ -146,7 +147,6 @@ def some_data(): def test_pipeline_replace() -> None: - generator_instance1 = sequence_generator() generator_instance2 = sequence_generator() @@ -164,7 +164,8 @@ def some_data(): pipeline = dlt.pipeline( pipeline_name="test_pipeline_replace", destination="qdrant", - dataset_name="test_pipeline_replace_dataset" + uid, # Qdrant doesn't mandate any name normalization + dataset_name="test_pipeline_replace_dataset" + + uid, # Qdrant doesn't mandate any name normalization ) info = pipeline.run( @@ -172,7 +173,9 @@ def some_data(): write_disposition="replace", ) assert_load_info(info) - assert info.dataset_name == "test_pipeline_replace_dataset" + uid # Qdrant doesn't mandate any name normalization + assert ( + info.dataset_name == "test_pipeline_replace_dataset" + uid + ) # Qdrant doesn't mandate any name normalization data = next(generator_instance2) 
assert_collection(pipeline, "some_data", items=data) @@ -193,16 +196,14 @@ def test_pipeline_merge() -> None: "doc_id": 1, "title": "The Shawshank Redemption", "description": ( - "Two imprisoned men find redemption through acts " - "of decency over the years." + "Two imprisoned men find redemption through acts of decency over the years." ), }, { "doc_id": 2, "title": "The Godfather", "description": ( - "A crime dynasty's aging patriarch transfers " - "control to his reluctant son." + "A crime dynasty's aging patriarch transfers control to his reluctant son." ), }, { @@ -230,9 +231,7 @@ def movies_data(): dataset_name="TestPipelineAppendDataset" + uniq_id(), ) info = pipeline.run( - movies_data(), - write_disposition="merge", - dataset_name="MoviesDataset" + uniq_id() + movies_data(), write_disposition="merge", dataset_name="MoviesDataset" + uniq_id() ) assert_load_info(info) assert_collection(pipeline, "movies_data", items=data) @@ -308,21 +307,38 @@ def test_merge_github_nested() -> None: p = dlt.pipeline(destination="qdrant", dataset_name="github1", full_refresh=True) assert p.dataset_name.startswith("github1_202") - with open("tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8") as f: + with open( + "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8" + ) as f: data = json.load(f) info = p.run( qdrant_adapter(data[:17], embed=["title", "body"]), table_name="issues", write_disposition="merge", - primary_key="id" + primary_key="id", ) assert_load_info(info) # assert if schema contains tables with right names print(p.default_schema.tables.keys()) - assert set(p.default_schema.tables.keys()) == {'_dlt_version', '_dlt_loads', 'issues', '_dlt_pipeline_state', 'issues__labels', 'issues__assignees'} - assert set([t["name"] for t in p.default_schema.data_tables()]) == {'issues', 'issues__labels', 'issues__assignees'} - assert set([t["name"] for t in p.default_schema.dlt_tables()]) == {'_dlt_version', '_dlt_loads', '_dlt_pipeline_state'} + assert set(p.default_schema.tables.keys()) == { + "_dlt_version", + "_dlt_loads", + "issues", + "_dlt_pipeline_state", + "issues__labels", + "issues__assignees", + } + assert set([t["name"] for t in p.default_schema.data_tables()]) == { + "issues", + "issues__labels", + "issues__assignees", + } + assert set([t["name"] for t in p.default_schema.dlt_tables()]) == { + "_dlt_version", + "_dlt_loads", + "_dlt_pipeline_state", + } issues = p.default_schema.tables["issues"] assert issues["columns"]["id"]["primary_key"] is True # make sure that vectorization is enabled for @@ -345,4 +361,3 @@ def test_empty_dataset_allowed() -> None: assert client.dataset_name is None assert client.sentinel_collection == "DltSentinelCollection" assert_collection(p, "content", expected_items_count=3) - diff --git a/tests/load/qdrant/utils.py b/tests/load/qdrant/utils.py index 1dfacbee7f..74d5db9715 100644 --- a/tests/load/qdrant/utils.py +++ b/tests/load/qdrant/utils.py @@ -20,14 +20,16 @@ def assert_collection( expected_items_count: int = None, items: List[Any] = None, ) -> None: - client: QdrantClient = pipeline.destination_client() # type: ignore[assignment] + client: QdrantClient = pipeline.destination_client() # type: ignore[assignment] # Check if collection exists exists = client._collection_exists(collection_name) assert exists qualified_collection_name = client._make_qualified_collection_name(collection_name) - point_records, offset = client.db_client.scroll(qualified_collection_name, with_payload=True, limit=50) + 
point_records, offset = client.db_client.scroll( + qualified_collection_name, with_payload=True, limit=50 + ) if expected_items_count is not None: assert expected_items_count == len(point_records) @@ -42,8 +44,10 @@ def assert_collection( assert_unordered_list_equal(objects_without_dlt_keys, items) + def drop_active_pipeline_data() -> None: print("Dropping active pipeline data for test") + def has_collections(client): schema = client.db_client.get_collections().collections return len(schema) > 0 @@ -51,7 +55,7 @@ def has_collections(client): if Container()[PipelineContext].is_active(): # take existing pipeline p = dlt.pipeline() - client: QdrantClient = p.destination_client() # type: ignore[assignment] + client: QdrantClient = p.destination_client() # type: ignore[assignment] if has_collections(client): client.drop_storage() diff --git a/tests/load/redshift/test_redshift_client.py b/tests/load/redshift/test_redshift_client.py index 7f617024df..f5efc16a47 100644 --- a/tests/load/redshift/test_redshift_client.py +++ b/tests/load/redshift/test_redshift_client.py @@ -20,7 +20,6 @@ from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage - @pytest.fixture def file_storage() -> FileStorage: return FileStorage(TEST_STORAGE_ROOT, file_type="b", makedirs=True) @@ -50,13 +49,13 @@ def test_text_too_long(client: RedshiftClient, file_storage: FileStorage) -> Non # try some unicode value - redshift checks the max length based on utf-8 representation, not the number of characters # max_len_str = 'उ' * (65535 // 3) + 1 -> does not fit # max_len_str = 'a' * 65535 + 1 -> does not fit - max_len_str = 'उ' * ((caps["max_text_data_type_length"] // 3) + 1) + max_len_str = "उ" * ((caps["max_text_data_type_length"] // 3) + 1) # max_len_str_b = max_len_str.encode("utf-8") # print(len(max_len_str_b)) row_id = uniq_id() insert_values = f"('{row_id}', '{uniq_id()}', '{max_len_str}' , '{str(pendulum.now())}');" with pytest.raises(DatabaseTerminalException) as exv: - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) assert type(exv.value.dbapi_exception) is psycopg2.errors.StringDataRightTruncation @@ -64,25 +63,36 @@ def test_wei_value(client: RedshiftClient, file_storage: FileStorage) -> None: user_table_name = prepare_table(client) # max redshift decimal is (38, 0) (128 bit) = 10**38 - 1 - insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, parse_data__metadata__rasa_x_id)\nVALUES\n" - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', {10**38});" + insert_sql = ( + "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp," + " parse_data__metadata__rasa_x_id)\nVALUES\n" + ) + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', {10**38});" + ) with pytest.raises(DatabaseTerminalException) as exv: - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) assert type(exv.value.dbapi_exception) is psycopg2.errors.InternalError_ def test_schema_string_exceeds_max_text_length(client: RedshiftClient) -> None: client.update_stored_schema() # schema should be compressed and stored as base64 - schema = SchemaStorage.load_schema_file(os.path.join(COMMON_TEST_CASES_PATH, "schemas/ev1"), "event", ("json",)) + schema = 
SchemaStorage.load_schema_file( + os.path.join(COMMON_TEST_CASES_PATH, "schemas/ev1"), "event", ("json",) + ) schema_str = json.dumps(schema.to_dict()) assert len(schema_str.encode("utf-8")) > client.capabilities.max_text_data_type_length client._update_schema_in_storage(schema) schema_info = client.get_stored_schema() assert schema_info.schema == schema_str # take base64 from db - with client.sql_client.execute_query(f"SELECT schema FROM {VERSION_TABLE_NAME} WHERE version_hash = '{schema.stored_version_hash}'") as cur: - row = cur.fetchone() + with client.sql_client.execute_query( + f"SELECT schema FROM {VERSION_TABLE_NAME} WHERE version_hash =" + f" '{schema.stored_version_hash}'" + ) as cur: + row = cur.fetchone() # decode base base64.b64decode(row[0], validate=True) @@ -99,7 +109,10 @@ def test_maximum_query_size(client: RedshiftClient, file_storage: FileStorage) - insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp)\nVALUES\n" insert_values = "('{}', '{}', '90238094809sajlkjxoiewjhduuiuehd', '{}'){}" - insert_sql = insert_sql + insert_values.format(uniq_id(), uniq_id(), str(pendulum.now()), ",\n") * 150000 + insert_sql = ( + insert_sql + + insert_values.format(uniq_id(), uniq_id(), str(pendulum.now()), ",\n") * 150000 + ) insert_sql += insert_values.format(uniq_id(), uniq_id(), str(pendulum.now()), ";") user_table_name = prepare_table(client) diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index 2e0feb44e7..d2adfde403 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -7,10 +7,14 @@ from dlt.common.configuration import resolve_configuration from dlt.destinations.impl.redshift.redshift import RedshiftClient -from dlt.destinations.impl.redshift.configuration import RedshiftClientConfiguration, RedshiftCredentials +from dlt.destinations.impl.redshift.configuration import ( + RedshiftClientConfiguration, + RedshiftCredentials, +) from tests.load.utils import TABLE_UPDATE + @pytest.fixture def schema() -> Schema: return Schema("event") @@ -19,12 +23,22 @@ def schema() -> Schema: @pytest.fixture def client(schema: Schema) -> RedshiftClient: # return client without opening connection - return RedshiftClient(schema, RedshiftClientConfiguration(dataset_name="test_" + uniq_id(), credentials=RedshiftCredentials())) + return RedshiftClient( + schema, + RedshiftClientConfiguration( + dataset_name="test_" + uniq_id(), credentials=RedshiftCredentials() + ), + ) def test_redshift_configuration() -> None: # check names normalized - with custom_environ({"DESTINATION__REDSHIFT__CREDENTIALS__DATABASE": "UPPER_CASE_DATABASE", "DESTINATION__REDSHIFT__CREDENTIALS__PASSWORD": " pass\n"}): + with custom_environ( + { + "DESTINATION__REDSHIFT__CREDENTIALS__DATABASE": "UPPER_CASE_DATABASE", + "DESTINATION__REDSHIFT__CREDENTIALS__PASSWORD": " pass\n", + } + ): C = resolve_configuration(RedshiftCredentials(), sections=("destination", "redshift")) assert C.database == "upper_case_database" assert C.password == "pass" @@ -32,13 +46,16 @@ def test_redshift_configuration() -> None: # check fingerprint assert RedshiftClientConfiguration().fingerprint() == "" # based on host - c = resolve_configuration(RedshiftCredentials(), explicit_value="postgres://user1:pass@host1/db1?warehouse=warehouse1&role=role1") + c = resolve_configuration( + RedshiftCredentials(), + explicit_value="postgres://user1:pass@host1/db1?warehouse=warehouse1&role=role1", + ) assert 
RedshiftClientConfiguration(credentials=c).fingerprint() == digest128("host1") def test_create_table(client: RedshiftClient) -> None: # non existing table - sql = ';'.join(client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)) + sql = ";".join(client._get_table_update_sql("event_test_table", TABLE_UPDATE, False)) sqlfluff.parse(sql, dialect="redshift") assert "event_test_table" in sql assert '"col1" bigint NOT NULL' in sql @@ -62,7 +79,7 @@ def test_create_table(client: RedshiftClient) -> None: def test_alter_table(client: RedshiftClient) -> None: # existing table has no columns - sql = ';'.join(client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)) + sql = ";".join(client._get_table_update_sql("event_test_table", TABLE_UPDATE, True)) sqlfluff.parse(sql, dialect="redshift") canonical_name = client.sql_client.make_qualified_table_name("event_test_table") # must have several ALTER TABLE statements @@ -94,7 +111,7 @@ def test_create_table_with_hints(client: RedshiftClient) -> None: mod_update[0]["sort"] = True mod_update[1]["cluster"] = True mod_update[4]["cluster"] = True - sql = ';'.join(client._get_table_update_sql("event_test_table", mod_update, False)) + sql = ";".join(client._get_table_update_sql("event_test_table", mod_update, False)) sqlfluff.parse(sql, dialect="redshift") # PRIMARY KEY will not be present https://heap.io/blog/redshift-pitfalls-avoid assert '"col1" bigint SORTKEY NOT NULL' in sql diff --git a/tests/load/snowflake/test_snowflake_configuration.py b/tests/load/snowflake/test_snowflake_configuration.py index abf80a1241..fb8ff925c0 100644 --- a/tests/load/snowflake/test_snowflake_configuration.py +++ b/tests/load/snowflake/test_snowflake_configuration.py @@ -9,7 +9,10 @@ from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.common.utils import digest128 -from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials +from dlt.destinations.impl.snowflake.configuration import ( + SnowflakeClientConfiguration, + SnowflakeCredentials, +) from tests.common.configuration.utils import environment @@ -37,75 +40,87 @@ def test_connection_string_with_all_params() -> None: def test_to_connector_params() -> None: # PEM key - pkey_str = Path('./tests/common/cases/secrets/encrypted-private-key').read_text('utf8') + pkey_str = Path("./tests/common/cases/secrets/encrypted-private-key").read_text("utf8") creds = SnowflakeCredentials() creds.private_key = pkey_str # type: ignore[assignment] - creds.private_key_passphrase = '12345' # type: ignore[assignment] - creds.username = 'user1' - creds.database = 'db1' - creds.host = 'host1' - creds.warehouse = 'warehouse1' - creds.role = 'role1' + creds.private_key_passphrase = "12345" # type: ignore[assignment] + creds.username = "user1" + creds.database = "db1" + creds.host = "host1" + creds.warehouse = "warehouse1" + creds.role = "role1" params = creds.to_connector_params() - assert isinstance(params['private_key'], bytes) - params.pop('private_key') + assert isinstance(params["private_key"], bytes) + params.pop("private_key") assert params == dict( - user='user1', - database='db1', - account='host1', + user="user1", + database="db1", + account="host1", password=None, - warehouse='warehouse1', - role='role1', + warehouse="warehouse1", + role="role1", ) # base64 encoded DER key - pkey_str = Path('./tests/common/cases/secrets/encrypted-private-key-base64').read_text('utf8') + pkey_str = 
Path("./tests/common/cases/secrets/encrypted-private-key-base64").read_text("utf8") creds = SnowflakeCredentials() creds.private_key = pkey_str # type: ignore[assignment] - creds.private_key_passphrase = '12345' # type: ignore[assignment] - creds.username = 'user1' - creds.database = 'db1' - creds.host = 'host1' - creds.warehouse = 'warehouse1' - creds.role = 'role1' + creds.private_key_passphrase = "12345" # type: ignore[assignment] + creds.username = "user1" + creds.database = "db1" + creds.host = "host1" + creds.warehouse = "warehouse1" + creds.role = "role1" params = creds.to_connector_params() - assert isinstance(params['private_key'], bytes) - params.pop('private_key') + assert isinstance(params["private_key"], bytes) + params.pop("private_key") assert params == dict( - user='user1', - database='db1', - account='host1', + user="user1", + database="db1", + account="host1", password=None, - warehouse='warehouse1', - role='role1', + warehouse="warehouse1", + role="role1", ) def test_snowflake_credentials_native_value(environment) -> None: with pytest.raises(ConfigurationValueError): - resolve_configuration(SnowflakeCredentials(), explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1") + resolve_configuration( + SnowflakeCredentials(), + explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1", + ) # set password via env os.environ["CREDENTIALS__PASSWORD"] = "pass" - c = resolve_configuration(SnowflakeCredentials(), explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1") + c = resolve_configuration( + SnowflakeCredentials(), + explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1", + ) assert c.is_resolved() assert c.password == "pass" # # but if password is specified - it is final - c = resolve_configuration(SnowflakeCredentials(), explicit_value="snowflake://user1:pass1@host1/db1?warehouse=warehouse1&role=role1") + c = resolve_configuration( + SnowflakeCredentials(), + explicit_value="snowflake://user1:pass1@host1/db1?warehouse=warehouse1&role=role1", + ) assert c.is_resolved() assert c.password == "pass1" # set PK via env del os.environ["CREDENTIALS__PASSWORD"] os.environ["CREDENTIALS__PRIVATE_KEY"] = "pk" - c = resolve_configuration(SnowflakeCredentials(), explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1") + c = resolve_configuration( + SnowflakeCredentials(), + explicit_value="snowflake://user1@host1/db1?warehouse=warehouse1&role=role1", + ) assert c.is_resolved() assert c.private_key == "pk" @@ -114,5 +129,8 @@ def test_snowflake_configuration() -> None: # def empty fingerprint assert SnowflakeClientConfiguration().fingerprint() == "" # based on host - c = resolve_configuration(SnowflakeCredentials(), explicit_value="snowflake://user1:pass@host1/db1?warehouse=warehouse1&role=role1") + c = resolve_configuration( + SnowflakeCredentials(), + explicit_value="snowflake://user1:pass@host1/db1?warehouse=warehouse1&role=role1", + ) assert SnowflakeClientConfiguration(credentials=c).fingerprint() == digest128("host1") diff --git a/tests/load/snowflake/test_snowflake_table_builder.py b/tests/load/snowflake/test_snowflake_table_builder.py index 9ede1c8d13..e6eaf26c89 100644 --- a/tests/load/snowflake/test_snowflake_table_builder.py +++ b/tests/load/snowflake/test_snowflake_table_builder.py @@ -6,7 +6,10 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient -from 
dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials +from dlt.destinations.impl.snowflake.configuration import ( + SnowflakeClientConfiguration, + SnowflakeCredentials, +) from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate from tests.load.utils import TABLE_UPDATE @@ -21,14 +24,16 @@ def schema() -> Schema: def snowflake_client(schema: Schema) -> SnowflakeClient: # return client without opening connection creds = SnowflakeCredentials() - return SnowflakeClient(schema, SnowflakeClientConfiguration(dataset_name="test_" + uniq_id(), credentials=creds)) + return SnowflakeClient( + schema, SnowflakeClientConfiguration(dataset_name="test_" + uniq_id(), credentials=creds) + ) def test_create_table(snowflake_client: SnowflakeClient) -> None: statements = snowflake_client._get_table_update_sql("event_test_table", TABLE_UPDATE, False) assert len(statements) == 1 sql = statements[0] - sqlfluff.parse(sql, dialect='snowflake') + sqlfluff.parse(sql, dialect="snowflake") assert sql.strip().startswith("CREATE TABLE") assert "EVENT_TEST_TABLE" in sql diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index 9edc49a607..bd20a0ea95 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -22,7 +22,12 @@ from dlt.load.exceptions import LoadClientJobFailed, LoadClientJobRetry from dlt.common.schema.utils import get_top_level_table -from tests.utils import clean_test_storage, init_test_logging, TEST_DICT_CONFIG_PROVIDER, preserve_environ +from tests.utils import ( + clean_test_storage, + init_test_logging, + TEST_DICT_CONFIG_PROVIDER, + preserve_environ, +) from tests.load.utils import prepare_load_package from tests.utils import skip_if_not_active @@ -31,9 +36,10 @@ NORMALIZED_FILES = [ "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl", - "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl" + "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl", ] + @pytest.fixture(autouse=True) def storage() -> FileStorage: return clean_test_storage(init_normalize=True, init_loader=True) @@ -47,10 +53,7 @@ def logger_autouse() -> None: def test_spool_job_started() -> None: # default config keeps the job always running load = setup_loader() - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) files = load.load_storage.list_new_jobs(load_id) assert len(files) == 2 jobs: List[LoadJob] = [] @@ -58,7 +61,11 @@ def test_spool_job_started() -> None: job = Load.w_spool_job(load, f, load_id, schema) assert type(job) is dummy_impl.LoadDummyJob assert job.state() == "running" - assert load.load_storage.storage.has_file(load.load_storage._get_job_file_path(load_id, LoadStorage.STARTED_JOBS_FOLDER, job.file_name())) + assert load.load_storage.storage.has_file( + load.load_storage._get_job_file_path( + load_id, LoadStorage.STARTED_JOBS_FOLDER, job.file_name() + ) + ) jobs.append(job) # still running remaining_jobs = load.complete_jobs(load_id, jobs, schema) @@ -68,8 +75,7 @@ def test_spool_job_started() -> None: def test_unsupported_writer_type() -> None: load = setup_loader() load_id, _ = prepare_load_package( - load.load_storage, - ["event_bot.181291798a78198.0.unsupported_format"] + load.load_storage, ["event_bot.181291798a78198.0.unsupported_format"] ) with pytest.raises(TerminalValueError): load.load_storage.list_new_jobs(load_id) @@ -77,10 +83,7 @@ def 
test_unsupported_writer_type() -> None: def test_unsupported_write_disposition() -> None: load = setup_loader() - load_id, schema = prepare_load_package( - load.load_storage, - [NORMALIZED_FILES[0]] - ) + load_id, schema = prepare_load_package(load.load_storage, [NORMALIZED_FILES[0]]) # mock unsupported disposition schema.get_table("event_user")["write_disposition"] = "skip" # write back schema @@ -88,16 +91,15 @@ def test_unsupported_write_disposition() -> None: with ThreadPoolExecutor() as pool: load.run(pool) # job with unsupported write disp. is failed - exception = [f for f in load.load_storage.list_failed_jobs(load_id) if f.endswith(".exception")][0] + exception = [ + f for f in load.load_storage.list_failed_jobs(load_id) if f.endswith(".exception") + ][0] assert "LoadClientUnsupportedWriteDisposition" in load.load_storage.storage.load(exception) def test_get_new_jobs_info() -> None: load = setup_loader() - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) # no write disposition specified - get all new jobs assert len(load.get_new_jobs_info(load_id)) == 2 @@ -105,54 +107,75 @@ def test_get_new_jobs_info() -> None: def test_get_completed_table_chain_single_job_per_table() -> None: load = setup_loader() - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) top_job_table = get_top_level_table(schema.tables, "event_user") assert load.get_completed_table_chain(load_id, schema, top_job_table) is None # fake being completed - assert len(load.get_completed_table_chain(load_id, schema, top_job_table, "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl")) == 1 + assert ( + len( + load.get_completed_table_chain( + load_id, + schema, + top_job_table, + "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl", + ) + ) + == 1 + ) # actually complete loop_top_job_table = get_top_level_table(schema.tables, "event_loop_interrupted") - load.load_storage.start_job(load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl") + load.load_storage.start_job( + load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl" + ) assert load.get_completed_table_chain(load_id, schema, loop_top_job_table) is None - load.load_storage.complete_job(load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl") - assert load.get_completed_table_chain(load_id, schema, loop_top_job_table) == [schema.get_table("event_loop_interrupted")] - assert load.get_completed_table_chain(load_id, schema, loop_top_job_table, "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl") == [schema.get_table("event_loop_interrupted")] + load.load_storage.complete_job( + load_id, "event_loop_interrupted.839c6e6b514e427687586ccc65bf133f.0.jsonl" + ) + assert load.get_completed_table_chain(load_id, schema, loop_top_job_table) == [ + schema.get_table("event_loop_interrupted") + ] + assert load.get_completed_table_chain( + load_id, schema, loop_top_job_table, "event_user.839c6e6b514e427687586ccc65bf133f.0.jsonl" + ) == [schema.get_table("event_loop_interrupted")] def test_spool_job_failed() -> None: # this config fails job on start load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0)) - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) files = 
load.load_storage.list_new_jobs(load_id) jobs: List[LoadJob] = [] for f in files: job = Load.w_spool_job(load, f, load_id, schema) assert type(job) is EmptyLoadJob assert job.state() == "failed" - assert load.load_storage.storage.has_file(load.load_storage._get_job_file_path(load_id, LoadStorage.STARTED_JOBS_FOLDER, job.file_name())) + assert load.load_storage.storage.has_file( + load.load_storage._get_job_file_path( + load_id, LoadStorage.STARTED_JOBS_FOLDER, job.file_name() + ) + ) jobs.append(job) # complete files remaining_jobs = load.complete_jobs(load_id, jobs, schema) assert len(remaining_jobs) == 0 for job in jobs: - assert load.load_storage.storage.has_file(load.load_storage._get_job_file_path(load_id, LoadStorage.FAILED_JOBS_FOLDER, job.file_name())) - assert load.load_storage.storage.has_file(load.load_storage._get_job_file_path(load_id, LoadStorage.FAILED_JOBS_FOLDER, job.file_name() + ".exception")) + assert load.load_storage.storage.has_file( + load.load_storage._get_job_file_path( + load_id, LoadStorage.FAILED_JOBS_FOLDER, job.file_name() + ) + ) + assert load.load_storage.storage.has_file( + load.load_storage._get_job_file_path( + load_id, LoadStorage.FAILED_JOBS_FOLDER, job.file_name() + ".exception" + ) + ) started_files = load.load_storage.list_started_jobs(load_id) assert len(started_files) == 0 # test the whole flow load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0)) - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) run_all(load) package_info = load.load_storage.get_load_package_info(load_id) assert package_info.state == "loaded" @@ -165,10 +188,7 @@ def test_spool_job_failed_exception_init() -> None: os.environ["LOAD__RAISE_ON_FAILED_JOBS"] = "true" os.environ["FAIL_IN_INIT"] = "true" load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0)) - load_id, _ = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) with patch.object(dummy_impl.DummyClient, "complete_load") as complete_load: with pytest.raises(LoadClientJobFailed) as py_ex: run_all(load) @@ -187,10 +207,7 @@ def test_spool_job_failed_exception_complete() -> None: os.environ["LOAD__RAISE_ON_FAILED_JOBS"] = "true" os.environ["FAIL_IN_INIT"] = "false" load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0)) - load_id, _ = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) with pytest.raises(LoadClientJobFailed) as py_ex: run_all(load) assert py_ex.value.load_id == load_id @@ -204,22 +221,17 @@ def test_spool_job_failed_exception_complete() -> None: def test_spool_job_retry_new() -> None: # this config retries job on start (transient fail) load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0)) - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) files = load.load_storage.list_new_jobs(load_id) for f in files: job = Load.w_spool_job(load, f, load_id, schema) assert job.state() == "retry" + def test_spool_job_retry_spool_new() -> None: # this config retries job on start (transient fail) load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0)) - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) 
+ load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) # call higher level function that returns jobs and counts with ThreadPoolExecutor() as pool: load.pool = pool @@ -232,17 +244,18 @@ def test_spool_job_retry_started() -> None: # this config keeps the job always running load = setup_loader() # dummy_impl.CLIENT_CONFIG = DummyClientConfiguration - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) files = load.load_storage.list_new_jobs(load_id) jobs: List[LoadJob] = [] for f in files: job = Load.w_spool_job(load, f, load_id, schema) assert type(job) is dummy_impl.LoadDummyJob assert job.state() == "running" - assert load.load_storage.storage.has_file(load.load_storage._get_job_file_path(load_id, LoadStorage.STARTED_JOBS_FOLDER, job.file_name())) + assert load.load_storage.storage.has_file( + load.load_storage._get_job_file_path( + load_id, LoadStorage.STARTED_JOBS_FOLDER, job.file_name() + ) + ) # mock job config to make it retry job.config.retry_prob = 1.0 jobs.append(job) @@ -266,10 +279,7 @@ def test_spool_job_retry_started() -> None: def test_try_retrieve_job() -> None: load = setup_loader() - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) # manually move jobs to started files = load.load_storage.list_new_jobs(load_id) for f in files: @@ -282,10 +292,7 @@ def test_try_retrieve_job() -> None: for j in jobs: assert j.state() == "failed" # new load package - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) load.pool = ThreadPoolExecutor() jobs_count, jobs = load.spool_new_jobs(load_id, schema) assert jobs_count == 2 @@ -304,7 +311,9 @@ def test_completed_loop() -> None: def test_failed_loop() -> None: # ask to delete completed - load = setup_loader(delete_completed_jobs=True, client_config=DummyClientConfiguration(fail_prob=1.0)) + load = setup_loader( + delete_completed_jobs=True, client_config=DummyClientConfiguration(fail_prob=1.0) + ) # actually not deleted because one of the jobs failed assert_complete_job(load, load.load_storage.storage, should_delete_completed=False) @@ -319,10 +328,7 @@ def test_completed_loop_with_delete_completed() -> None: def test_retry_on_new_loop() -> None: # test job that retries sitting in new jobs load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0)) - load_id, schema = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, schema = prepare_load_package(load.load_storage, NORMALIZED_FILES) with ThreadPoolExecutor() as pool: # 1st retry load.run(pool) @@ -340,20 +346,21 @@ def test_retry_on_new_loop() -> None: assert len(files) == 0 # complete package load.run(pool) - assert not load.load_storage.storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) + assert not load.load_storage.storage.has_folder( + load.load_storage.get_normalized_package_path(load_id) + ) # parse the completed job names completed_path = load.load_storage.get_completed_package_path(load_id) - for fn in load.load_storage.storage.list_folder_files(os.path.join(completed_path, LoadStorage.COMPLETED_JOBS_FOLDER)): + for fn in load.load_storage.storage.list_folder_files( + os.path.join(completed_path, LoadStorage.COMPLETED_JOBS_FOLDER) + ): # we update a retry count in 
each case assert LoadStorage.parse_job_file_name(fn).retry_count == 2 def test_retry_exceptions() -> None: load = setup_loader(client_config=DummyClientConfiguration(retry_prob=1.0)) - prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + prepare_load_package(load.load_storage, NORMALIZED_FILES) with ThreadPoolExecutor() as pool: # 1st retry with pytest.raises(LoadClientJobRetry) as py_ex: @@ -374,23 +381,24 @@ def test_load_single_thread() -> None: os.environ["LOAD__WORKERS"] = "1" load = setup_loader(client_config=DummyClientConfiguration(completed_prob=1.0)) assert load.config.pool_type == "none" - load_id, _ = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) + load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) # we do not need pool to complete metrics = load.run(None) while metrics.pending_items > 0: metrics = load.run(None) - assert not load.load_storage.storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) + assert not load.load_storage.storage.has_folder( + load.load_storage.get_normalized_package_path(load_id) + ) def test_wrong_writer_type() -> None: load = setup_loader() load_id, _ = prepare_load_package( load.load_storage, - ["event_bot.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values", - "event_user.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values"] + [ + "event_bot.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values", + "event_user.b1d32c6660b242aaabbf3fc27245b7e6.0.insert_values", + ], ) with ThreadPoolExecutor() as pool: with pytest.raises(JobWithUnsupportedWriterException) as exv: @@ -407,22 +415,28 @@ def test_terminal_exceptions() -> None: raise AssertionError() -def assert_complete_job(load: Load, storage: FileStorage, should_delete_completed: bool = False) -> None: - load_id, _ = prepare_load_package( - load.load_storage, - NORMALIZED_FILES - ) +def assert_complete_job( + load: Load, storage: FileStorage, should_delete_completed: bool = False +) -> None: + load_id, _ = prepare_load_package(load.load_storage, NORMALIZED_FILES) # will complete all jobs with patch.object(dummy_impl.DummyClient, "complete_load") as complete_load: with ThreadPoolExecutor() as pool: load.run(pool) # did process schema update - assert storage.has_file(os.path.join(load.load_storage.get_normalized_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME)) + assert storage.has_file( + os.path.join( + load.load_storage.get_normalized_package_path(load_id), + LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME, + ) + ) # will finalize the whole package load.run(pool) # moved to loaded assert not storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) - completed_path = load.load_storage._get_job_folder_completed_path(load_id, "completed_jobs") + completed_path = load.load_storage._get_job_folder_completed_path( + load_id, "completed_jobs" + ) if should_delete_completed: # package was deleted assert not storage.has_folder(completed_path) @@ -442,7 +456,9 @@ def run_all(load: Load) -> None: sleep(0.1) -def setup_loader(delete_completed_jobs: bool = False, client_config: DummyClientConfiguration = None) -> Load: +def setup_loader( + delete_completed_jobs: bool = False, client_config: DummyClientConfiguration = None +) -> Load: # reset jobs for a test dummy_impl.JOBS = {} destination: TDestination = dummy() # type: ignore[assignment] @@ -452,7 +468,4 @@ def setup_loader(delete_completed_jobs: bool = False, client_config: DummyClient # setup loader with 
TEST_DICT_CONFIG_PROVIDER().values({"delete_completed_jobs": delete_completed_jobs}): - return Load( - destination, - initial_client_config=client_config - ) + return Load(destination, initial_client_config=client_config) diff --git a/tests/load/test_insert_job_client.py b/tests/load/test_insert_job_client.py index 86049b035a..04f3e3a86a 100644 --- a/tests/load/test_insert_job_client.py +++ b/tests/load/test_insert_job_client.py @@ -7,7 +7,11 @@ from dlt.common.storages import FileStorage from dlt.common.utils import uniq_id -from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation +from dlt.destinations.exceptions import ( + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) from dlt.destinations.insert_job_client import InsertValuesJobClient from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage, skipifpypy @@ -16,22 +20,33 @@ DEFAULT_SUBSET = ["duckdb", "redshift", "postgres"] + @pytest.fixture def file_storage() -> FileStorage: return FileStorage(TEST_STORAGE_ROOT, file_type="b", makedirs=True) + @pytest.fixture(scope="function") def client(request) -> Iterator[InsertValuesJobClient]: yield from yield_client_with_storage(request.param.destination) # type: ignore[misc] -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True, subset=DEFAULT_SUBSET), indirect=True, ids=lambda x: x.name) + +@pytest.mark.parametrize( + "client", + destinations_configs(default_sql_configs=True, subset=DEFAULT_SUBSET), + indirect=True, + ids=lambda x: x.name, +) def test_simple_load(client: InsertValuesJobClient, file_storage: FileStorage) -> None: user_table_name = prepare_table(client) canonical_name = client.sql_client.make_qualified_table_name(user_table_name) # create insert insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp)\nVALUES\n" - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}')" - expect_load_file(client, file_storage, insert_sql+insert_values+";", user_table_name) + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}')" + ) + expect_load_file(client, file_storage, insert_sql + insert_values + ";", user_table_name) rows_count = client.sql_client.execute_sql(f"SELECT COUNT(1) FROM {canonical_name}")[0][0] assert rows_count == 1 # insert 100 more rows @@ -41,14 +56,22 @@ def test_simple_load(client: InsertValuesJobClient, file_storage: FileStorage) - assert rows_count == 101 # insert null value insert_sql_nc = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, text)\nVALUES\n" - insert_values_nc = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', NULL);" - expect_load_file(client, file_storage, insert_sql_nc+insert_values_nc, user_table_name) + insert_values_nc = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', NULL);" + ) + expect_load_file(client, file_storage, insert_sql_nc + insert_values_nc, user_table_name) rows_count = client.sql_client.execute_sql(f"SELECT COUNT(1) FROM {canonical_name}")[0][0] assert rows_count == 102 @skipifpypy -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True, subset=DEFAULT_SUBSET), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", + destinations_configs(default_sql_configs=True, subset=DEFAULT_SUBSET), + indirect=True, + 
ids=lambda x: x.name, +) def test_loading_errors(client: InsertValuesJobClient, file_storage: FileStorage) -> None: # test expected dbiapi exceptions for supported destinations import duckdb @@ -66,75 +89,102 @@ def test_loading_errors(client: InsertValuesJobClient, file_storage: FileStorage TNotNullViolation = duckdb.ConstraintException TNumericValueOutOfRange = TDatatypeMismatch = duckdb.ConversionException - user_table_name = prepare_table(client) # insert into unknown column insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, _unk_)\nVALUES\n" - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', NULL);" + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', NULL);" + ) with pytest.raises(DatabaseTerminalException) as exv: - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) assert type(exv.value.dbapi_exception) is TUndefinedColumn # insert null value insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp)\nVALUES\n" insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', NULL);" with pytest.raises(DatabaseTerminalException) as exv: - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) assert type(exv.value.dbapi_exception) is TNotNullViolation # insert wrong type insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp)\nVALUES\n" insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', TRUE);" with pytest.raises(DatabaseTerminalException) as exv: - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) assert type(exv.value.dbapi_exception) is TDatatypeMismatch # numeric overflow on bigint - insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, metadata__rasa_x_id)\nVALUES\n" + insert_sql = ( + "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, metadata__rasa_x_id)\nVALUES\n" + ) # 2**64//2 - 1 is a maximum bigint value - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', {2**64//2});" + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', {2**64//2});" + ) with pytest.raises(DatabaseTerminalException) as exv: - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) - assert type(exv.value.dbapi_exception) in (TNumericValueOutOfRange, ) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) + assert type(exv.value.dbapi_exception) in (TNumericValueOutOfRange,) # numeric overflow on NUMERIC - insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp, parse_data__intent__id)\nVALUES\n" + insert_sql = ( + "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp," + " parse_data__intent__id)\nVALUES\n" + ) # default decimal is (38, 9) (128 bit), use local context to generate decimals with 38 precision with numeric_default_context(): - below_limit = Decimal(10**29) - Decimal('0.001') + below_limit = Decimal(10**29) - Decimal("0.001") above_limit = Decimal(10**29) # this will pass - 
insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', {below_limit});" - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', {below_limit});" + ) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) # this will raise - insert_values = f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd', '{str(pendulum.now())}', {above_limit});" + insert_values = ( + f"('{uniq_id()}', '{uniq_id()}', '90238094809sajlkjxoiewjhduuiuehd'," + f" '{str(pendulum.now())}', {above_limit});" + ) with pytest.raises(DatabaseTerminalException) as exv: - expect_load_file(client, file_storage, insert_sql+insert_values, user_table_name) - assert type(exv.value.dbapi_exception) in (TNumericValueOutOfRange, psycopg2.errors.InternalError_) - - - -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True, subset=DEFAULT_SUBSET), indirect=True, ids=lambda x: x.name) + expect_load_file(client, file_storage, insert_sql + insert_values, user_table_name) + assert type(exv.value.dbapi_exception) in ( + TNumericValueOutOfRange, + psycopg2.errors.InternalError_, + ) + + +@pytest.mark.parametrize( + "client", + destinations_configs(default_sql_configs=True, subset=DEFAULT_SUBSET), + indirect=True, + ids=lambda x: x.name, +) def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) -> None: mocked_caps = client.sql_client.__class__.capabilities insert_sql = prepare_insert_statement(10) # this guarantees that we execute inserts line by line - with patch.object(mocked_caps, "max_query_length", 2), patch.object(client.sql_client, "execute_fragments") as mocked_fragments: + with patch.object(mocked_caps, "max_query_length", 2), patch.object( + client.sql_client, "execute_fragments" + ) as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # print(mocked_fragments.mock_calls) # split in 10 lines assert mocked_fragments.call_count == 10 for idx, call in enumerate(mocked_fragments.call_args_list): - fragment:List[str] = call.args[0] + fragment: List[str] = call.args[0] # last elem of fragment is a data list, first element is id, and must end with ;\n assert fragment[-1].startswith(f"'{idx}'") assert fragment[-1].endswith(");") assert_load_with_max_query(client, file_storage, 10, 2) start_idx = insert_sql.find("S\n(") - idx = insert_sql.find("),\n", len(insert_sql)//2) + idx = insert_sql.find("),\n", len(insert_sql) // 2) # set query length so it reads data until "," (followed by \n) query_length = (idx - start_idx - 1) * 2 - with patch.object(mocked_caps, "max_query_length", query_length), patch.object(client.sql_client, "execute_fragments") as mocked_fragments: + with patch.object(mocked_caps, "max_query_length", query_length), patch.object( + client.sql_client, "execute_fragments" + ) as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # split in 2 on ',' @@ -142,7 +192,9 @@ def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) - # so it reads until "\n" query_length = (idx - start_idx) * 2 - with patch.object(mocked_caps, "max_query_length", query_length), patch.object(client.sql_client, "execute_fragments") as mocked_fragments: + with patch.object(mocked_caps, 
"max_query_length", query_length), patch.object( + client.sql_client, "execute_fragments" + ) as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # split in 2 on ',' @@ -150,14 +202,21 @@ def test_query_split(client: InsertValuesJobClient, file_storage: FileStorage) - # so it reads till the last ; query_length = (len(insert_sql) - start_idx - 3) * 2 - with patch.object(mocked_caps, "max_query_length", query_length), patch.object(client.sql_client, "execute_fragments") as mocked_fragments: + with patch.object(mocked_caps, "max_query_length", query_length), patch.object( + client.sql_client, "execute_fragments" + ) as mocked_fragments: user_table_name = prepare_table(client) expect_load_file(client, file_storage, insert_sql, user_table_name) # split in 2 on ',' assert mocked_fragments.call_count == 1 -def assert_load_with_max_query(client: InsertValuesJobClient, file_storage: FileStorage, insert_lines: int, max_query_length: int) -> None: +def assert_load_with_max_query( + client: InsertValuesJobClient, + file_storage: FileStorage, + insert_lines: int, + max_query_length: int, +) -> None: # load and check for real mocked_caps = client.sql_client.__class__.capabilities with patch.object(mocked_caps, "max_query_length", max_query_length): @@ -167,7 +226,9 @@ def assert_load_with_max_query(client: InsertValuesJobClient, file_storage: File rows_count = client.sql_client.execute_sql(f"SELECT COUNT(1) FROM {user_table_name}")[0][0] assert rows_count == insert_lines # get all uniq ids in order - with client.sql_client.execute_query(f"SELECT _dlt_id FROM {user_table_name} ORDER BY timestamp ASC;") as c: + with client.sql_client.execute_query( + f"SELECT _dlt_id FROM {user_table_name} ORDER BY timestamp ASC;" + ) as c: rows = list(c.fetchall()) v_ids = list(map(lambda i: i[0], rows)) assert list(map(str, range(0, insert_lines))) == v_ids @@ -177,7 +238,7 @@ def assert_load_with_max_query(client: InsertValuesJobClient, file_storage: File def prepare_insert_statement(lines: int) -> str: insert_sql = "INSERT INTO {}(_dlt_id, _dlt_root_id, sender_id, timestamp)\nVALUES\n" insert_values = "('{}', '{}', '90238094809sajlkjxoiewjhduuiuehd', '{}')" - #ids = [] + # ids = [] for i in range(lines): # id_ = uniq_id() # ids.append(id_) diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index e08919424a..39e83e7897 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -9,20 +9,39 @@ from dlt.common import json, pendulum from dlt.common.schema import Schema -from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME, TWriteDisposition, TTableSchema +from dlt.common.schema.typing import ( + LOADS_TABLE_NAME, + VERSION_TABLE_NAME, + TWriteDisposition, + TTableSchema, +) from dlt.common.schema.utils import new_table, new_column from dlt.common.storages import FileStorage from dlt.common.schema import TTableSchemaColumns from dlt.common.utils import uniq_id -from dlt.destinations.exceptions import DatabaseException, DatabaseTerminalException, DatabaseUndefinedRelation +from dlt.destinations.exceptions import ( + DatabaseException, + DatabaseTerminalException, + DatabaseUndefinedRelation, +) from dlt.destinations.job_client_impl import SqlJobClientBase from dlt.common.destination.reference import WithStagingDataset from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage from tests.common.utils import load_json_case -from tests.load.utils import (TABLE_UPDATE, 
TABLE_UPDATE_COLUMNS_SCHEMA, TABLE_ROW_ALL_DATA_TYPES, assert_all_data_types_row , expect_load_file, load_table, yield_client_with_storage, - cm_yield_client_with_storage, write_dataset, prepare_table) +from tests.load.utils import ( + TABLE_UPDATE, + TABLE_UPDATE_COLUMNS_SCHEMA, + TABLE_ROW_ALL_DATA_TYPES, + assert_all_data_types_row, + expect_load_file, + load_table, + yield_client_with_storage, + cm_yield_client_with_storage, + write_dataset, + prepare_table, +) from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @@ -30,17 +49,24 @@ def file_storage() -> FileStorage: return FileStorage(TEST_STORAGE_ROOT, file_type="b", makedirs=True) + @pytest.fixture(scope="function") def client(request) -> Iterator[SqlJobClientBase]: yield from yield_client_with_storage(request.param.destination) + @pytest.mark.order(1) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_initialize_storage(client: SqlJobClientBase) -> None: pass + @pytest.mark.order(2) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_get_schema_on_empty_storage(client: SqlJobClientBase) -> None: # test getting schema on empty dataset without any tables exists, _ = client.get_storage_table(VERSION_TABLE_NAME) @@ -50,8 +76,11 @@ def test_get_schema_on_empty_storage(client: SqlJobClientBase) -> None: schema_info = client.get_stored_schema_by_hash("8a0298298823928939") assert schema_info is None + @pytest.mark.order(3) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_get_update_basic_schema(client: SqlJobClientBase) -> None: schema = client.schema schema_update = client.update_stored_schema() @@ -104,7 +133,7 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: client._update_schema_in_storage(first_schema) this_schema = client.get_stored_schema_by_hash(first_schema.version_hash) newest_schema = client.get_stored_schema() - assert this_schema == newest_schema # error + assert this_schema == newest_schema # error assert this_schema.version == first_schema.version == 2 assert this_schema.version_hash == first_schema.stored_version_hash @@ -127,7 +156,9 @@ def test_get_update_basic_schema(client: SqlJobClientBase) -> None: assert this_schema == newest_schema -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_complete_load(client: SqlJobClientBase) -> None: client.update_stored_schema() load_id = "182879721.182912" @@ -139,18 +170,28 @@ def test_complete_load(client: SqlJobClientBase) -> None: assert load_rows[0][1] == client.schema.name assert load_rows[0][2] == 0 import datetime # noqa: I251 + assert type(load_rows[0][3]) is datetime.datetime assert load_rows[0][4] == client.schema.version_hash # make sure that hash in loads exists in schema versions table versions_table = 
client.sql_client.make_qualified_table_name(VERSION_TABLE_NAME) - version_rows = list(client.sql_client.execute_sql(f"SELECT * FROM {versions_table} WHERE version_hash = %s", load_rows[0][4])) + version_rows = list( + client.sql_client.execute_sql( + f"SELECT * FROM {versions_table} WHERE version_hash = %s", load_rows[0][4] + ) + ) assert len(version_rows) == 1 client.complete_load("load2") load_rows = list(client.sql_client.execute_sql(f"SELECT * FROM {load_table}")) assert len(load_rows) == 2 -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True, subset=["redshift", "postgres", "duckdb"]), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", + destinations_configs(default_sql_configs=True, subset=["redshift", "postgres", "duckdb"]), + indirect=True, + ids=lambda x: x.name, +) def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: # infer typical rasa event schema schema = client.schema @@ -160,7 +201,7 @@ def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: assert timestamp["sort"] is True # this will be destkey sender_id = schema._infer_column("sender_id", "982398490809324") - assert sender_id["cluster"] is True + assert sender_id["cluster"] is True # this will be not null record_hash = schema._infer_column("_dlt_id", "m,i0392903jdlkasjdlk") assert record_hash["unique"] is True @@ -176,7 +217,12 @@ def test_schema_update_create_table_redshift(client: SqlJobClientBase) -> None: assert exists is True -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True, subset=["bigquery"]), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", + destinations_configs(default_sql_configs=True, subset=["bigquery"]), + indirect=True, + ids=lambda x: x.name, +) def test_schema_update_create_table_bigquery(client: SqlJobClientBase) -> None: # infer typical rasa event schema schema = client.schema @@ -203,7 +249,9 @@ def test_schema_update_create_table_bigquery(client: SqlJobClientBase) -> None: assert storage_table["version"]["cluster"] is False -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_schema_update_alter_table(client: SqlJobClientBase) -> None: # force to update schema in chunks by setting the max query size to 10 bytes/chars with patch.object(client.capabilities, "max_query_length", new=10): @@ -241,34 +289,36 @@ def test_schema_update_alter_table(client: SqlJobClientBase) -> None: assert storage_table["col4"]["data_type"] == "timestamp" -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_drop_tables(client: SqlJobClientBase) -> None: schema = client.schema # Add columns in all tables - schema.tables['event_user']['columns'] = dict(schema.tables['event_slot']['columns']) - schema.tables['event_bot']['columns'] = dict(schema.tables['event_slot']['columns']) + schema.tables["event_user"]["columns"] = dict(schema.tables["event_slot"]["columns"]) + schema.tables["event_bot"]["columns"] = dict(schema.tables["event_slot"]["columns"]) schema.bump_version() client.update_stored_schema() # Create a second schema with 2 hashes sd = schema.to_dict() - 
sd['name'] = 'event_2' + sd["name"] = "event_2" schema_2 = Schema.from_dict(sd).clone() # type: ignore[arg-type] for tbl_name in list(schema_2.tables): - if tbl_name.startswith('_dlt'): + if tbl_name.startswith("_dlt"): continue - schema_2.tables[tbl_name + '_2'] = schema_2.tables.pop(tbl_name) + schema_2.tables[tbl_name + "_2"] = schema_2.tables.pop(tbl_name) client.schema = schema_2 client.schema.bump_version() client.update_stored_schema() - client.schema.tables['event_slot_2']['columns']['value']['nullable'] = False + client.schema.tables["event_slot_2"]["columns"]["value"]["nullable"] = False client.schema.bump_version() client.update_stored_schema() # Drop tables from the first schema client.schema = schema - tables_to_drop = ['event_slot', 'event_user'] + tables_to_drop = ["event_slot", "event_user"] for tbl in tables_to_drop: del schema.tables[tbl] schema.bump_version() @@ -291,16 +341,22 @@ def test_drop_tables(client: SqlJobClientBase) -> None: # Verify _dlt_version schema is updated and old versions deleted table_name = client.sql_client.make_qualified_table_name(VERSION_TABLE_NAME) - rows = client.sql_client.execute_sql(f"SELECT version_hash FROM {table_name} WHERE schema_name = %s", schema.name) + rows = client.sql_client.execute_sql( + f"SELECT version_hash FROM {table_name} WHERE schema_name = %s", schema.name + ) assert len(rows) == 1 assert rows[0][0] == schema.version_hash # Other schema is not replaced - rows = client.sql_client.execute_sql(f"SELECT version_hash FROM {table_name} WHERE schema_name = %s", schema_2.name) + rows = client.sql_client.execute_sql( + f"SELECT version_hash FROM {table_name} WHERE schema_name = %s", schema_2.name + ) assert len(rows) == 2 -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_get_storage_table_with_all_types(client: SqlJobClientBase) -> None: schema = client.schema table_name = "event_test_table" + uniq_id() @@ -323,18 +379,25 @@ def test_get_storage_table_with_all_types(client: SqlJobClientBase) -> None: # print(c["data_type"]) assert c["name"] == expected_c["name"] # athena does not know wei data type and has no JSON type, time is not supported with parquet tables - if client.config.destination_name == "athena" and c["data_type"] in ("wei", "complex", "time"): + if client.config.destination_name == "athena" and c["data_type"] in ( + "wei", + "complex", + "time", + ): continue if client.config.destination_name == "mssql" and c["data_type"] in ("wei", "complex"): continue assert c["data_type"] == expected_c["data_type"] -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_preserve_column_order(client: SqlJobClientBase) -> None: schema = client.schema table_name = "event_test_table" + uniq_id() import random + columns = deepcopy(TABLE_UPDATE) random.shuffle(columns) @@ -353,13 +416,15 @@ def _assert_columns_order(sql_: str) -> None: idx = sql_.find(col_name, idx) assert idx > 0, f"column {col_name} not found in script" - sql = ';'.join(client._get_table_update_sql(table_name, columns, generate_alter=False)) + sql = ";".join(client._get_table_update_sql(table_name, columns, generate_alter=False)) _assert_columns_order(sql) - sql = 
';'.join(client._get_table_update_sql(table_name, columns, generate_alter=True)) + sql = ";".join(client._get_table_update_sql(table_name, columns, generate_alter=True)) _assert_columns_order(sql) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_data_writer_load(client: SqlJobClientBase, file_storage: FileStorage) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") @@ -378,12 +443,16 @@ def test_data_writer_load(client: SqlJobClientBase, file_storage: FileStorage) - write_dataset(client, f, [rows[1]], client.schema.get_table(table_name)["columns"]) query = f.getvalue().decode() expect_load_file(client, file_storage, query, table_name) - db_row = client.sql_client.execute_sql(f"SELECT * FROM {canonical_name} WHERE f_int = {rows[1]['f_int']}")[0] + db_row = client.sql_client.execute_sql( + f"SELECT * FROM {canonical_name} WHERE f_int = {rows[1]['f_int']}" + )[0] assert db_row[3] is None assert db_row[5] is None -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_data_writer_string_escape(client: SqlJobClientBase, file_storage: FileStorage) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") @@ -401,8 +470,12 @@ def test_data_writer_string_escape(client: SqlJobClientBase, file_storage: FileS assert list(db_row) == list(row.values()) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) -def test_data_writer_string_escape_edge(client: SqlJobClientBase, file_storage: FileStorage) -> None: +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) +def test_data_writer_string_escape_edge( + client: SqlJobClientBase, file_storage: FileStorage +) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") rows, table_name = prepare_schema(client, "weird_rows") @@ -411,20 +484,26 @@ def test_data_writer_string_escape_edge(client: SqlJobClientBase, file_storage: write_dataset(client, f, rows, client.schema.get_table(table_name)["columns"]) query = f.getvalue().decode() expect_load_file(client, file_storage, query, table_name) - for i in range(1,len(rows) + 1): + for i in range(1, len(rows) + 1): db_row = client.sql_client.execute_sql(f"SELECT str FROM {canonical_name} WHERE idx = {i}") - row_value, expected = db_row[0][0], rows[i-1]["str"] + row_value, expected = db_row[0][0], rows[i - 1]["str"] assert row_value == expected -@pytest.mark.parametrize('write_disposition', ["append", "replace"]) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) -def test_load_with_all_types(client: SqlJobClientBase, write_disposition: TWriteDisposition, file_storage: FileStorage) -> None: +@pytest.mark.parametrize("write_disposition", ["append", "replace"]) +@pytest.mark.parametrize( + "client", 
destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) +def test_load_with_all_types( + client: SqlJobClientBase, write_disposition: TWriteDisposition, file_storage: FileStorage +) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") table_name = "event_test_table" + uniq_id() # we should have identical content with all disposition types - client.schema.update_table(new_table(table_name, write_disposition=write_disposition, columns=TABLE_UPDATE)) + client.schema.update_table( + new_table(table_name, write_disposition=write_disposition, columns=TABLE_UPDATE) + ) client.schema.bump_version() client.update_stored_schema() @@ -435,7 +514,7 @@ def test_load_with_all_types(client: SqlJobClientBase, write_disposition: TWrite client.update_stored_schema() with client.sql_client.with_staging_dataset( - client.should_load_data_to_staging_dataset(client.schema.tables[table_name]) # type: ignore[attr-defined] + client.should_load_data_to_staging_dataset(client.schema.tables[table_name]) # type: ignore[attr-defined] ): canonical_name = client.sql_client.make_qualified_table_name(table_name) # write row @@ -447,28 +526,39 @@ def test_load_with_all_types(client: SqlJobClientBase, write_disposition: TWrite # content must equal assert_all_data_types_row(db_row) -@pytest.mark.parametrize('write_disposition,replace_strategy', [ - ("append", ""), - ("merge", ""), - ("replace", "truncate-and-insert"), - ("replace", "insert-from-staging"), - ("replace", "staging-optimized") - ]) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) -def test_write_dispositions(client: SqlJobClientBase, write_disposition: TWriteDisposition, replace_strategy: str, file_storage: FileStorage) -> None: + +@pytest.mark.parametrize( + "write_disposition,replace_strategy", + [ + ("append", ""), + ("merge", ""), + ("replace", "truncate-and-insert"), + ("replace", "insert-from-staging"), + ("replace", "staging-optimized"), + ], +) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) +def test_write_dispositions( + client: SqlJobClientBase, + write_disposition: TWriteDisposition, + replace_strategy: str, + file_storage: FileStorage, +) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") - os.environ['DESTINATION__REPLACE_STRATEGY'] = replace_strategy + os.environ["DESTINATION__REPLACE_STRATEGY"] = replace_strategy table_name = "event_test_table" + uniq_id() client.schema.update_table( new_table(table_name, write_disposition=write_disposition, columns=TABLE_UPDATE) - ) + ) child_table = client.schema.naming.make_path(table_name, "child") # add child table without write disposition so it will be inferred from the parent client.schema.update_table( new_table(child_table, columns=TABLE_UPDATE, parent_table_name=table_name) - ) + ) client.schema.bump_version() client.update_stored_schema() @@ -500,7 +590,12 @@ def test_write_dispositions(client: SqlJobClientBase, write_disposition: TWriteD else: # load directly on other expect_load_file(client, file_storage, query, t) - db_rows = list(client.sql_client.execute_sql(f"SELECT * FROM {client.sql_client.make_qualified_table_name(t)} ORDER BY col1 ASC")) + db_rows = list( + client.sql_client.execute_sql( + f"SELECT * FROM 
{client.sql_client.make_qualified_table_name(t)} ORDER BY" + " col1 ASC" + ) + ) # in case of merge if write_disposition == "append": # we append 1 row to tables in each iteration @@ -513,13 +608,20 @@ def test_write_dispositions(client: SqlJobClientBase, write_disposition: TWriteD assert len(db_rows) == 0 # check staging with client.sql_client.with_staging_dataset(staging=True): - db_rows = list(client.sql_client.execute_sql(f"SELECT * FROM {client.sql_client.make_qualified_table_name(t)} ORDER BY col1 ASC")) + db_rows = list( + client.sql_client.execute_sql( + f"SELECT * FROM {client.sql_client.make_qualified_table_name(t)} ORDER" + " BY col1 ASC" + ) + ) assert len(db_rows) == idx + 1 # last row must have our last idx - make sure we append and overwrite assert db_rows[-1][0] == idx -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_retrieve_job(client: SqlJobClientBase, file_storage: FileStorage) -> None: if not client.capabilities.preferred_loader_file_format: pytest.skip("preferred loader file format not set, destination will only work with staging") @@ -527,8 +629,8 @@ def test_retrieve_job(client: SqlJobClientBase, file_storage: FileStorage) -> No load_json = { "_dlt_id": uniq_id(), "_dlt_root_id": uniq_id(), - "sender_id":'90238094809sajlkjxoiewjhduuiuehd', - "timestamp": str(pendulum.now()) + "sender_id": "90238094809sajlkjxoiewjhduuiuehd", + "timestamp": str(pendulum.now()), } with io.BytesIO() as f: write_dataset(client, f, [load_json], client.schema.get_table(user_table_name)["columns"]) @@ -543,30 +645,50 @@ def test_retrieve_job(client: SqlJobClientBase, file_storage: FileStorage) -> No assert r_job.state() == "completed" -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_default_schema_name_init_storage(destination_config: DestinationTestConfiguration) -> None: - with cm_yield_client_with_storage(destination_config.destination, default_config_values={ - "default_schema_name": "event" # pass the schema that is a default schema. that should create dataset with the name `dataset_name` - }) as client: + with cm_yield_client_with_storage( + destination_config.destination, + default_config_values={ + "default_schema_name": ( # pass the schema that is a default schema. that should create dataset with the name `dataset_name` + "event" + ) + }, + ) as client: assert client.sql_client.dataset_name == client.config.dataset_name assert client.sql_client.has_dataset() - with cm_yield_client_with_storage(destination_config.destination, default_config_values={ - "default_schema_name": None # no default_schema. that should create dataset with the name `dataset_name` - }) as client: + with cm_yield_client_with_storage( + destination_config.destination, + default_config_values={ + "default_schema_name": ( + None # no default_schema. that should create dataset with the name `dataset_name` + ) + }, + ) as client: assert client.sql_client.dataset_name == client.config.dataset_name assert client.sql_client.has_dataset() - with cm_yield_client_with_storage(destination_config.destination, default_config_values={ - "default_schema_name": "event_2" # the default schema is not event schema . 
that should create dataset with the name `dataset_name` with schema suffix - }) as client: + with cm_yield_client_with_storage( + destination_config.destination, + default_config_values={ + "default_schema_name": ( # the default schema is not event schema . that should create dataset with the name `dataset_name` with schema suffix + "event_2" + ) + }, + ) as client: assert client.sql_client.dataset_name == client.config.dataset_name + "_event" assert client.sql_client.has_dataset() -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_many_schemas_single_dataset(destination_config: DestinationTestConfiguration, file_storage: FileStorage) -> None: - +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) +def test_many_schemas_single_dataset( + destination_config: DestinationTestConfiguration, file_storage: FileStorage +) -> None: def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: # load something to event:user_table user_row = { @@ -575,7 +697,7 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: # "_dlt_load_id": "load_id", "event": "user", "sender_id": "sender_id", - "timestamp": str(pendulum.now()) + "timestamp": str(pendulum.now()), } with io.BytesIO() as f: write_dataset(_client, f, [user_row], _client.schema.tables["event_user"]["columns"]) @@ -585,11 +707,14 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: db_rows = list(_client.sql_client.execute_sql(f"SELECT * FROM {qual_table_name}")) assert len(db_rows) == expected_rows - with cm_yield_client_with_storage(destination_config.destination, default_config_values={"default_schema_name": None}) as client: - + with cm_yield_client_with_storage( + destination_config.destination, default_config_values={"default_schema_name": None} + ) as client: # event schema with event table if not client.capabilities.preferred_loader_file_format: - pytest.skip("preferred loader file format not set, destination will only work with staging") + pytest.skip( + "preferred loader file format not set, destination will only work with staging" + ) user_table = load_table("event_user")["event_user"] client.schema.update_table(new_table("event_user", columns=list(user_table.values()))) @@ -633,11 +758,17 @@ def _load_something(_client: SqlJobClientBase, expected_rows: int) -> None: _load_something(client, 3) # adding new non null column will generate sync error - event_3_schema.tables["event_user"]["columns"]["mandatory_column"] = new_column("mandatory_column", "text", nullable=False) + event_3_schema.tables["event_user"]["columns"]["mandatory_column"] = new_column( + "mandatory_column", "text", nullable=False + ) client.schema.bump_version() with pytest.raises(DatabaseException) as py_ex: client.update_stored_schema() - assert "mandatory_column" in str(py_ex.value).lower() or "NOT NULL" in str(py_ex.value) or "Adding columns with constraints not yet supported" in str(py_ex.value) + assert ( + "mandatory_column" in str(py_ex.value).lower() + or "NOT NULL" in str(py_ex.value) + or "Adding columns with constraints not yet supported" in str(py_ex.value) + ) def prepare_schema(client: SqlJobClientBase, case: str) -> Tuple[List[Dict[str, Any]], str]: diff --git a/tests/load/test_sql_client.py b/tests/load/test_sql_client.py index 5c6e1b9e31..371a1b840c 100644 --- a/tests/load/test_sql_client.py +++ b/tests/load/test_sql_client.py @@ -9,7 +9,12 @@ 
from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME from dlt.common.storages import FileStorage from dlt.common.utils import derives_from_class_of_name, uniq_id -from dlt.destinations.exceptions import DatabaseException, DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation +from dlt.destinations.exceptions import ( + DatabaseException, + DatabaseTerminalException, + DatabaseTransientException, + DatabaseUndefinedRelation, +) from dlt.destinations.sql_client import DBApiCursor, SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase @@ -25,30 +30,43 @@ def file_storage() -> FileStorage: return FileStorage(TEST_STORAGE_ROOT, file_type="b", makedirs=True) + @pytest.fixture(scope="function") def client(request) -> Iterator[SqlJobClientBase]: yield from yield_client_with_storage(request.param.destination) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True, exclude=["mssql"]), indirect=True, ids=lambda x: x.name) + +@pytest.mark.parametrize( + "client", + destinations_configs(default_sql_configs=True, exclude=["mssql"]), + indirect=True, + ids=lambda x: x.name, +) def test_sql_client_default_dataset_unqualified(client: SqlJobClientBase) -> None: client.update_stored_schema() load_id = "182879721.182912" client.complete_load(load_id) curr: DBApiCursor # get data from unqualified name - with client.sql_client.execute_query(f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at") as curr: + with client.sql_client.execute_query( + f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at" + ) as curr: columns = [c[0] for c in curr.description] data = curr.fetchall() assert len(data) > 0 # get data from qualified name load_table = client.sql_client.make_qualified_table_name(LOADS_TABLE_NAME) - with client.sql_client.execute_query(f"SELECT * FROM {load_table} ORDER BY inserted_at") as curr: + with client.sql_client.execute_query( + f"SELECT * FROM {load_table} ORDER BY inserted_at" + ) as curr: assert [c[0] for c in curr.description] == columns assert curr.fetchall() == data -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_malformed_query_parameters(client: SqlJobClientBase) -> None: client.update_stored_schema() loads_table_name = client.sql_client.make_qualified_table_name(LOADS_TABLE_NAME) @@ -59,25 +77,35 @@ def test_malformed_query_parameters(client: SqlJobClientBase) -> None: # parameters for placeholder will not be provided. 
the placeholder remains in query if is_positional: with pytest.raises(DatabaseTransientException) as term_ex: - with client.sql_client.execute_query(f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}"): + with client.sql_client.execute_query( + f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}" + ): pass assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) # too many parameters with pytest.raises(DatabaseTransientException) as term_ex: - with client.sql_client.execute_query(f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}", pendulum.now(), 10): + with client.sql_client.execute_query( + f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}", + pendulum.now(), + 10, + ): pass assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) # unknown named parameter if client.sql_client.dbapi.paramstyle == "pyformat": with pytest.raises(DatabaseTransientException) as term_ex: - with client.sql_client.execute_query(f"SELECT * FROM {loads_table_name} WHERE inserted_at = %(date)s"): + with client.sql_client.execute_query( + f"SELECT * FROM {loads_table_name} WHERE inserted_at = %(date)s" + ): pass assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_malformed_execute_parameters(client: SqlJobClientBase) -> None: client.update_stored_schema() loads_table_name = client.sql_client.make_qualified_table_name(LOADS_TABLE_NAME) @@ -88,32 +116,46 @@ def test_malformed_execute_parameters(client: SqlJobClientBase) -> None: # parameters for placeholder will not be provided. 
the placeholder remains in query if is_positional: with pytest.raises(DatabaseTransientException) as term_ex: - client.sql_client.execute_sql(f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}") + client.sql_client.execute_sql( + f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}" + ) assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) # too many parameters with pytest.raises(DatabaseTransientException) as term_ex: - client.sql_client.execute_sql(f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}", pendulum.now(), 10) + client.sql_client.execute_sql( + f"SELECT * FROM {loads_table_name} WHERE inserted_at = {placeholder}", + pendulum.now(), + 10, + ) assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) # unknown named parameter if client.sql_client.dbapi.paramstyle == "pyformat": with pytest.raises(DatabaseTransientException) as term_ex: - client.sql_client.execute_sql(f"SELECT * FROM {loads_table_name} WHERE inserted_at = %(date)s") + client.sql_client.execute_sql( + f"SELECT * FROM {loads_table_name} WHERE inserted_at = %(date)s" + ) assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_execute_sql(client: SqlJobClientBase) -> None: client.update_stored_schema() # ask with datetime # no_rows = client.sql_client.execute_sql(f"SELECT schema_name, inserted_at FROM {VERSION_TABLE_NAME} WHERE inserted_at = %s", pendulum.now().add(seconds=1)) # assert len(no_rows) == 0 version_table_name = client.sql_client.make_qualified_table_name(VERSION_TABLE_NAME) - rows = client.sql_client.execute_sql(f"SELECT schema_name, inserted_at FROM {version_table_name}") + rows = client.sql_client.execute_sql( + f"SELECT schema_name, inserted_at FROM {version_table_name}" + ) assert len(rows) == 1 assert rows[0][0] == "event" - rows = client.sql_client.execute_sql(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE schema_name = %s", "event") + rows = client.sql_client.execute_sql( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE schema_name = %s", "event" + ) assert len(rows) == 1 # print(rows) assert rows[0][0] == "event" @@ -122,18 +164,31 @@ def test_execute_sql(client: SqlJobClientBase) -> None: # print(rows[0][1]) # print(type(rows[0][1])) # convert to pendulum to make sure it is supported by dbapi - rows = client.sql_client.execute_sql(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %s", ensure_pendulum_datetime(rows[0][1])) + rows = client.sql_client.execute_sql( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %s", + ensure_pendulum_datetime(rows[0][1]), + ) assert len(rows) == 1 # use rows in subsequent test if client.sql_client.dbapi.paramstyle == "pyformat": - rows = client.sql_client.execute_sql(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %(date)s", date=rows[0][1]) + rows = client.sql_client.execute_sql( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at =" + " %(date)s", + date=rows[0][1], + ) assert len(rows) == 1 assert rows[0][0] == "event" - rows = client.sql_client.execute_sql(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %(date)s", 
date=pendulum.now().add(seconds=1)) + rows = client.sql_client.execute_sql( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at =" + " %(date)s", + date=pendulum.now().add(seconds=1), + ) assert len(rows) == 0 -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_execute_ddl(client: SqlJobClientBase) -> None: uniq_suffix = uniq_id() client.update_stored_schema() @@ -149,33 +204,51 @@ def test_execute_ddl(client: SqlJobClientBase) -> None: assert rows[0][0] == Decimal("1.0") -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_execute_query(client: SqlJobClientBase) -> None: client.update_stored_schema() version_table_name = client.sql_client.make_qualified_table_name(VERSION_TABLE_NAME) - with client.sql_client.execute_query(f"SELECT schema_name, inserted_at FROM {version_table_name}") as curr: + with client.sql_client.execute_query( + f"SELECT schema_name, inserted_at FROM {version_table_name}" + ) as curr: rows = curr.fetchall() assert len(rows) == 1 assert rows[0][0] == "event" - with client.sql_client.execute_query(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE schema_name = %s", "event") as curr: + with client.sql_client.execute_query( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE schema_name = %s", "event" + ) as curr: rows = curr.fetchall() assert len(rows) == 1 assert rows[0][0] == "event" assert isinstance(rows[0][1], datetime.datetime) - with client.sql_client.execute_query(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %s", rows[0][1]) as curr: + with client.sql_client.execute_query( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %s", + rows[0][1], + ) as curr: rows = curr.fetchall() assert len(rows) == 1 assert rows[0][0] == "event" - with client.sql_client.execute_query(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %s", pendulum.now().add(seconds=1)) as curr: + with client.sql_client.execute_query( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %s", + pendulum.now().add(seconds=1), + ) as curr: rows = curr.fetchall() assert len(rows) == 0 if client.sql_client.dbapi.paramstyle == "pyformat": - with client.sql_client.execute_query(f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at = %(date)s", date=pendulum.now().add(seconds=1)) as curr: + with client.sql_client.execute_query( + f"SELECT schema_name, inserted_at FROM {version_table_name} WHERE inserted_at =" + " %(date)s", + date=pendulum.now().add(seconds=1), + ) as curr: rows = curr.fetchall() assert len(rows) == 0 -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_execute_df(client: SqlJobClientBase) -> None: if client.config.destination_name == "bigquery": chunk_size = 50 @@ -193,13 +266,17 @@ def test_execute_df(client: SqlJobClientBase) -> None: insert_query = ",".join([f"({idx})" for idx in 
range(0, total_records)]) client.sql_client.execute_sql(f"INSERT INTO {f_q_table_name} VALUES {insert_query};") - with client.sql_client.execute_query(f"SELECT * FROM {f_q_table_name} ORDER BY col ASC") as curr: + with client.sql_client.execute_query( + f"SELECT * FROM {f_q_table_name} ORDER BY col ASC" + ) as curr: df = curr.df() # Force lower case df columns, snowflake has all cols uppercase df.columns = [dfcol.lower() for dfcol in df.columns] assert list(df["col"]) == list(range(0, total_records)) # get chunked - with client.sql_client.execute_query(f"SELECT * FROM {f_q_table_name} ORDER BY col ASC") as curr: + with client.sql_client.execute_query( + f"SELECT * FROM {f_q_table_name} ORDER BY col ASC" + ) as curr: # be compatible with duckdb vector size df_1 = curr.df(chunk_size=chunk_size) df_2 = curr.df(chunk_size=chunk_size) @@ -214,7 +291,9 @@ def test_execute_df(client: SqlJobClientBase) -> None: assert df_3 is None -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_database_exceptions(client: SqlJobClientBase) -> None: client.update_stored_schema() term_ex: Any @@ -232,11 +311,15 @@ def test_database_exceptions(client: SqlJobClientBase) -> None: pass assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) with pytest.raises(DatabaseUndefinedRelation) as term_ex: - with client.sql_client.execute_query("DELETE FROM TABLE_XXX WHERE 1=1;DELETE FROM ticket_forms__ticket_field_ids WHERE 1=1;"): + with client.sql_client.execute_query( + "DELETE FROM TABLE_XXX WHERE 1=1;DELETE FROM ticket_forms__ticket_field_ids WHERE 1=1;" + ): pass assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) with pytest.raises(DatabaseUndefinedRelation) as term_ex: - with client.sql_client.execute_query("DROP TABLE TABLE_XXX;DROP TABLE ticket_forms__ticket_field_ids;"): + with client.sql_client.execute_query( + "DROP TABLE TABLE_XXX;DROP TABLE ticket_forms__ticket_field_ids;" + ): pass # invalid syntax @@ -247,7 +330,9 @@ def test_database_exceptions(client: SqlJobClientBase) -> None: # invalid column with pytest.raises(DatabaseTerminalException) as term_ex: loads_table_name = client.sql_client.make_qualified_table_name(LOADS_TABLE_NAME) - with client.sql_client.execute_query(f"SELECT * FROM {loads_table_name} ORDER BY column_XXX"): + with client.sql_client.execute_query( + f"SELECT * FROM {loads_table_name} ORDER BY column_XXX" + ): pass assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) # invalid parameters to dbapi @@ -259,7 +344,9 @@ def test_database_exceptions(client: SqlJobClientBase) -> None: with client.sql_client.with_alternative_dataset_name("UNKNOWN"): qualified_name = client.sql_client.make_qualified_table_name(LOADS_TABLE_NAME) with pytest.raises(DatabaseUndefinedRelation) as term_ex: - with client.sql_client.execute_query(f"SELECT * FROM {qualified_name} ORDER BY inserted_at"): + with client.sql_client.execute_query( + f"SELECT * FROM {qualified_name} ORDER BY inserted_at" + ): pass assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) with pytest.raises(DatabaseUndefinedRelation) as term_ex: @@ -272,28 +359,40 @@ def test_database_exceptions(client: SqlJobClientBase) -> None: assert client.sql_client.is_dbapi_exception(term_ex.value.dbapi_exception) -@pytest.mark.parametrize("client", 
destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_commit_transaction(client: SqlJobClientBase) -> None: table_name = prepare_temp_table(client) f_q_table_name = client.sql_client.make_qualified_table_name(table_name) with client.sql_client.begin_transaction(): client.sql_client.execute_sql(f"INSERT INTO {f_q_table_name} VALUES (%s)", Decimal("1.0")) # check row still in transaction - rows = client.sql_client.execute_sql(f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0")) + rows = client.sql_client.execute_sql( + f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) assert len(rows) == 1 # check row after commit - rows = client.sql_client.execute_sql(f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0")) + rows = client.sql_client.execute_sql( + f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) assert len(rows) == 1 assert rows[0][0] == 1.0 with client.sql_client.begin_transaction() as tx: - client.sql_client.execute_sql(f"DELETE FROM {f_q_table_name} WHERE col = %s", Decimal("1.0")) + client.sql_client.execute_sql( + f"DELETE FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) # explicit commit tx.commit_transaction() - rows = client.sql_client.execute_sql(f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0")) + rows = client.sql_client.execute_sql( + f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) assert len(rows) == 0 -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_rollback_transaction(client: SqlJobClientBase) -> None: if client.capabilities.supports_transactions is False: pytest.skip("Destination does not support tx") @@ -302,29 +401,43 @@ def test_rollback_transaction(client: SqlJobClientBase) -> None: # test python exception with pytest.raises(RuntimeError): with client.sql_client.begin_transaction(): - client.sql_client.execute_sql(f"INSERT INTO {f_q_table_name} VALUES (%s)", Decimal("1.0")) - rows = client.sql_client.execute_sql(f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0")) + client.sql_client.execute_sql( + f"INSERT INTO {f_q_table_name} VALUES (%s)", Decimal("1.0") + ) + rows = client.sql_client.execute_sql( + f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) assert len(rows) == 1 # python exception triggers rollback raise RuntimeError("ROLLBACK") - rows = client.sql_client.execute_sql(f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0")) + rows = client.sql_client.execute_sql( + f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) assert len(rows) == 0 # test rollback on invalid query f_q_wrong_table_name = client.sql_client.make_qualified_table_name(f"{table_name}_X") with pytest.raises(DatabaseException): with client.sql_client.begin_transaction(): - client.sql_client.execute_sql(f"INSERT INTO {f_q_table_name} VALUES (%s)", Decimal("1.0")) + client.sql_client.execute_sql( + f"INSERT INTO {f_q_table_name} VALUES (%s)", Decimal("1.0") + ) # table does not exist - client.sql_client.execute_sql(f"SELECT col FROM {f_q_wrong_table_name} WHERE col = %s", Decimal("1.0")) - rows = client.sql_client.execute_sql(f"SELECT col FROM 
{f_q_table_name} WHERE col = %s", Decimal("1.0")) + client.sql_client.execute_sql( + f"SELECT col FROM {f_q_wrong_table_name} WHERE col = %s", Decimal("1.0") + ) + rows = client.sql_client.execute_sql( + f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) assert len(rows) == 0 # test explicit rollback with client.sql_client.begin_transaction() as tx: client.sql_client.execute_sql(f"INSERT INTO {f_q_table_name} VALUES (%s)", Decimal("1.0")) tx.rollback_transaction() - rows = client.sql_client.execute_sql(f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0")) + rows = client.sql_client.execute_sql( + f"SELECT col FROM {f_q_table_name} WHERE col = %s", Decimal("1.0") + ) assert len(rows) == 0 # test double rollback - behavior inconsistent across databases (some raise some not) @@ -335,7 +448,9 @@ def test_rollback_transaction(client: SqlJobClientBase) -> None: # tx.rollback_transaction() -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_transaction_isolation(client: SqlJobClientBase) -> None: if client.capabilities.supports_transactions is False: pytest.skip("Destination does not support tx") @@ -346,7 +461,9 @@ def test_transaction_isolation(client: SqlJobClientBase) -> None: def test_thread(thread_id: Decimal) -> None: # make a copy of the sql_client - thread_client = client.sql_client.__class__(client.sql_client.dataset_name, client.sql_client.credentials) + thread_client = client.sql_client.__class__( + client.sql_client.dataset_name, client.sql_client.credentials + ) with thread_client: with thread_client.begin_transaction(): thread_client.execute_sql(f"INSERT INTO {f_q_table_name} VALUES (%s)", thread_id) @@ -374,11 +491,18 @@ def test_thread(thread_id: Decimal) -> None: assert rows[0][0] == Decimal("2.0") -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_max_table_identifier_length(client: SqlJobClientBase) -> None: if client.capabilities.max_identifier_length >= 65536: - pytest.skip(f"destination {client.config.destination_name} has no table name length restriction") - table_name = 8 * "prospects_external_data__data365_member__member__feed_activities_created_post__items__comments__items__comments__items__author_details__educations" + pytest.skip( + f"destination {client.config.destination_name} has no table name length restriction" + ) + table_name = ( + 8 + * "prospects_external_data__data365_member__member__feed_activities_created_post__items__comments__items__comments__items__author_details__educations" + ) with pytest.raises(IdentifierTooLongException) as py_ex: prepare_table(client, "long_table_name", table_name, make_uniq_table=False) assert py_ex.value.identifier_type == "table" @@ -397,12 +521,19 @@ def test_max_table_identifier_length(client: SqlJobClientBase) -> None: # assert exists is True -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_max_column_identifier_length(client: SqlJobClientBase) -> None: if 
client.capabilities.max_column_identifier_length >= 65536: - pytest.skip(f"destination {client.config.destination_name} has no column name length restriction") + pytest.skip( + f"destination {client.config.destination_name} has no column name length restriction" + ) table_name = "prospects_external_data__data365_member__member" - column_name = 7 * "prospects_external_data__data365_member__member__feed_activities_created_post__items__comments__items__comments__items__author_details__educations__school_name" + column_name = ( + 7 + * "prospects_external_data__data365_member__member__feed_activities_created_post__items__comments__items__comments__items__author_details__educations__school_name" + ) with pytest.raises(IdentifierTooLongException) as py_ex: prepare_table(client, "long_column_name", table_name, make_uniq_table=False) assert py_ex.value.identifier_type == "column" @@ -414,7 +545,9 @@ def test_max_column_identifier_length(client: SqlJobClientBase) -> None: # assert long_column_name[:client.capabilities.max_column_identifier_length] in table_def -@pytest.mark.parametrize("client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name) +@pytest.mark.parametrize( + "client", destinations_configs(default_sql_configs=True), indirect=True, ids=lambda x: x.name +) def test_recover_on_explicit_tx(client: SqlJobClientBase) -> None: if client.capabilities.supports_transactions is False: pytest.skip("Destination does not support tx") @@ -441,7 +574,11 @@ def test_recover_on_explicit_tx(client: SqlJobClientBase) -> None: assert_load_id(client.sql_client, "EFG") # wrong value inserted - statements = ["BEGIN TRANSACTION;", f"INSERT INTO {version_table}(version) VALUES(1);", "COMMIT;"] + statements = [ + "BEGIN TRANSACTION;", + f"INSERT INTO {version_table}(version) VALUES(1);", + "COMMIT;", + ] # cannot insert NULL value with pytest.raises(DatabaseTerminalException): client.sql_client.execute_fragments(statements) @@ -467,10 +604,15 @@ def prepare_temp_table(client: SqlJobClientBase) -> str: iceberg_table_suffix = "" coltype = "numeric" if client.config.destination_name == "athena": - iceberg_table_suffix = f"LOCATION '{AWS_BUCKET}/ci/{table_name}' TBLPROPERTIES ('table_type'='ICEBERG', 'format'='parquet');" + iceberg_table_suffix = ( + f"LOCATION '{AWS_BUCKET}/ci/{table_name}' TBLPROPERTIES ('table_type'='ICEBERG'," + " 'format'='parquet');" + ) coltype = "bigint" qualified_table_name = table_name else: qualified_table_name = client.sql_client.make_qualified_table_name(table_name) - client.sql_client.execute_sql(f"CREATE TABLE {qualified_table_name} (col {coltype}) {iceberg_table_suffix};") + client.sql_client.execute_sql( + f"CREATE TABLE {qualified_table_name} (col {coltype}) {iceberg_table_suffix};" + ) return table_name diff --git a/tests/load/utils.py b/tests/load/utils.py index f591f51585..dc6576be86 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -12,7 +12,14 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.destination.reference import DestinationClientDwhConfiguration, JobClientBase, LoadJob, DestinationClientStagingConfiguration, WithStagingDataset, TDestinationReferenceArg +from dlt.common.destination.reference import ( + DestinationClientDwhConfiguration, + JobClientBase, + LoadJob, + DestinationClientStagingConfiguration, + WithStagingDataset, + 
TDestinationReferenceArg, +) from dlt.common.destination import TLoaderFileFormat, Destination from dlt.common.data_writers import DataWriter from dlt.common.schema import TColumnSchema, TTableSchemaColumns, Schema @@ -26,8 +33,18 @@ from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.job_client_impl import SqlJobClientBase -from tests.utils import ACTIVE_DESTINATIONS, IMPLEMENTED_DESTINATIONS, SQL_DESTINATIONS, EXCLUDED_DESTINATION_CONFIGURATIONS -from tests.cases import TABLE_UPDATE_COLUMNS_SCHEMA, TABLE_UPDATE, TABLE_ROW_ALL_DATA_TYPES, assert_all_data_types_row +from tests.utils import ( + ACTIVE_DESTINATIONS, + IMPLEMENTED_DESTINATIONS, + SQL_DESTINATIONS, + EXCLUDED_DESTINATION_CONFIGURATIONS, +) +from tests.cases import ( + TABLE_UPDATE_COLUMNS_SCHEMA, + TABLE_UPDATE, + TABLE_ROW_ALL_DATA_TYPES, + assert_all_data_types_row, +) # bucket urls AWS_BUCKET = dlt.config.get("tests.bucket_url_s3", str) @@ -37,11 +54,20 @@ R2_BUCKET = dlt.config.get("tests.bucket_url_r2", str) MEMORY_BUCKET = dlt.config.get("tests.memory", str) -ALL_FILESYSTEM_DRIVERS = dlt.config.get("ALL_FILESYSTEM_DRIVERS", list) or ["s3", "gs", "az", "file", "memory", "r2"] +ALL_FILESYSTEM_DRIVERS = dlt.config.get("ALL_FILESYSTEM_DRIVERS", list) or [ + "s3", + "gs", + "az", + "file", + "memory", + "r2", +] # Filter out buckets not in all filesystem drivers DEFAULT_BUCKETS = [GCS_BUCKET, AWS_BUCKET, FILE_BUCKET, MEMORY_BUCKET, AZ_BUCKET] -DEFAULT_BUCKETS = [bucket for bucket in DEFAULT_BUCKETS if bucket.split(':')[0] in ALL_FILESYSTEM_DRIVERS] +DEFAULT_BUCKETS = [ + bucket for bucket in DEFAULT_BUCKETS if bucket.split(":")[0] in ALL_FILESYSTEM_DRIVERS +] # Add r2 in extra buckets so it's not run for all tests R2_BUCKET_CONFIG = dict( @@ -51,7 +77,7 @@ aws_access_key_id=dlt.config.get("tests.r2_aws_access_key_id", str), aws_secret_access_key=dlt.config.get("tests.r2_aws_secret_access_key", str), endpoint_url=dlt.config.get("tests.r2_endpoint_url", str), - ) + ), ) EXTRA_BUCKETS: List[Dict[str, Any]] = [] @@ -64,6 +90,7 @@ @dataclass class DestinationTestConfiguration: """Class for defining test setup for one destination.""" + destination: str staging: Optional[str] = None file_format: Optional[TLoaderFileFormat] = None @@ -77,7 +104,7 @@ class DestinationTestConfiguration: @property def name(self) -> str: - name: str = self.destination + name: str = self.destination if self.file_format: name += f"-{self.file_format}" if not self.staging: @@ -90,35 +117,42 @@ def name(self) -> str: def setup(self) -> None: """Sets up environment variables for this destination configuration""" - os.environ['DESTINATION__FILESYSTEM__BUCKET_URL'] = self.bucket_url or "" - os.environ['DESTINATION__STAGE_NAME'] = self.stage_name or "" - os.environ['DESTINATION__STAGING_IAM_ROLE'] = self.staging_iam_role or "" - os.environ['DESTINATION__FORCE_ICEBERG'] = str(self.force_iceberg) or "" + os.environ["DESTINATION__FILESYSTEM__BUCKET_URL"] = self.bucket_url or "" + os.environ["DESTINATION__STAGE_NAME"] = self.stage_name or "" + os.environ["DESTINATION__STAGING_IAM_ROLE"] = self.staging_iam_role or "" + os.environ["DESTINATION__FORCE_ICEBERG"] = str(self.force_iceberg) or "" """For the filesystem destinations we disable compression to make analyzing the result easier""" if self.destination == "filesystem": - os.environ['DATA_WRITER__DISABLE_COMPRESSION'] = "True" + os.environ["DATA_WRITER__DISABLE_COMPRESSION"] = "True" - - def setup_pipeline(self, pipeline_name: str, dataset_name: str = None, full_refresh: bool = False, 
**kwargs) -> dlt.Pipeline: + def setup_pipeline( + self, pipeline_name: str, dataset_name: str = None, full_refresh: bool = False, **kwargs + ) -> dlt.Pipeline: """Convenience method to setup pipeline with this configuration""" self.setup() - pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination=self.destination, staging=self.staging, dataset_name=dataset_name or pipeline_name, full_refresh=full_refresh, **kwargs) + pipeline = dlt.pipeline( + pipeline_name=pipeline_name, + destination=self.destination, + staging=self.staging, + dataset_name=dataset_name or pipeline_name, + full_refresh=full_refresh, + **kwargs, + ) return pipeline def destinations_configs( - default_sql_configs: bool = False, - default_vector_configs: bool = False, - default_staging_configs: bool = False, - all_staging_configs: bool = False, - local_filesystem_configs: bool = False, - all_buckets_filesystem_configs: bool = False, - subset: Sequence[str] = (), - exclude: Sequence[str] = (), - file_format: Optional[TLoaderFileFormat] = None, + default_sql_configs: bool = False, + default_vector_configs: bool = False, + default_staging_configs: bool = False, + all_staging_configs: bool = False, + local_filesystem_configs: bool = False, + all_buckets_filesystem_configs: bool = False, + subset: Sequence[str] = (), + exclude: Sequence[str] = (), + file_format: Optional[TLoaderFileFormat] = None, ) -> List[DestinationTestConfiguration]: - # sanity check for item in subset: assert item in IMPLEMENTED_DESTINATIONS, f"Destination {item} is not implemented" @@ -128,11 +162,36 @@ def destinations_configs( # default non staging sql based configs, one per destination if default_sql_configs: - destination_configs += [DestinationTestConfiguration(destination=destination) for destination in SQL_DESTINATIONS if destination != "athena"] - destination_configs += [DestinationTestConfiguration(destination="duckdb", file_format="parquet")] + destination_configs += [ + DestinationTestConfiguration(destination=destination) + for destination in SQL_DESTINATIONS + if destination != "athena" + ] + destination_configs += [ + DestinationTestConfiguration(destination="duckdb", file_format="parquet") + ] # athena needs filesystem staging, which will be automatically set, we have to supply a bucket url though - destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", supports_merge=False, bucket_url=AWS_BUCKET)] - destination_configs += [DestinationTestConfiguration(destination="athena", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, force_iceberg=True, supports_merge=False, supports_dbt=False, extra_info="iceberg")] + destination_configs += [ + DestinationTestConfiguration( + destination="athena", + staging="filesystem", + file_format="parquet", + supports_merge=False, + bucket_url=AWS_BUCKET, + ) + ] + destination_configs += [ + DestinationTestConfiguration( + destination="athena", + staging="filesystem", + file_format="parquet", + bucket_url=AWS_BUCKET, + force_iceberg=True, + supports_merge=False, + supports_dbt=False, + extra_info="iceberg", + ) + ] if default_vector_configs: # for now only weaviate @@ -140,46 +199,140 @@ def destinations_configs( if default_staging_configs or all_staging_configs: destination_configs += [ - DestinationTestConfiguration(destination="redshift", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, staging_iam_role="arn:aws:iam::267388281016:role/redshift_s3_read", extra_info="s3-role"), - 
DestinationTestConfiguration(destination="bigquery", staging="filesystem", file_format="parquet", bucket_url=GCS_BUCKET, extra_info="gcs-authorization"), - DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="jsonl", bucket_url=GCS_BUCKET, stage_name="PUBLIC.dlt_gcs_stage", extra_info="gcs-integration"), - DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="jsonl", bucket_url=AWS_BUCKET, extra_info="s3-integration"), - DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="jsonl", bucket_url=AWS_BUCKET, stage_name="PUBLIC.dlt_s3_stage", extra_info="s3-integration"), - DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="jsonl", bucket_url=AZ_BUCKET, stage_name="PUBLIC.dlt_az_stage", extra_info="az-integration"), - DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="jsonl", bucket_url=AZ_BUCKET, extra_info="az-authorization"), + DestinationTestConfiguration( + destination="redshift", + staging="filesystem", + file_format="parquet", + bucket_url=AWS_BUCKET, + staging_iam_role="arn:aws:iam::267388281016:role/redshift_s3_read", + extra_info="s3-role", + ), + DestinationTestConfiguration( + destination="bigquery", + staging="filesystem", + file_format="parquet", + bucket_url=GCS_BUCKET, + extra_info="gcs-authorization", + ), + DestinationTestConfiguration( + destination="snowflake", + staging="filesystem", + file_format="jsonl", + bucket_url=GCS_BUCKET, + stage_name="PUBLIC.dlt_gcs_stage", + extra_info="gcs-integration", + ), + DestinationTestConfiguration( + destination="snowflake", + staging="filesystem", + file_format="jsonl", + bucket_url=AWS_BUCKET, + extra_info="s3-integration", + ), + DestinationTestConfiguration( + destination="snowflake", + staging="filesystem", + file_format="jsonl", + bucket_url=AWS_BUCKET, + stage_name="PUBLIC.dlt_s3_stage", + extra_info="s3-integration", + ), + DestinationTestConfiguration( + destination="snowflake", + staging="filesystem", + file_format="jsonl", + bucket_url=AZ_BUCKET, + stage_name="PUBLIC.dlt_az_stage", + extra_info="az-integration", + ), + DestinationTestConfiguration( + destination="snowflake", + staging="filesystem", + file_format="jsonl", + bucket_url=AZ_BUCKET, + extra_info="az-authorization", + ), ] if all_staging_configs: destination_configs += [ - DestinationTestConfiguration(destination="redshift", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, extra_info="credential-forwarding"), - DestinationTestConfiguration(destination="snowflake", staging="filesystem", file_format="parquet", bucket_url=AWS_BUCKET, extra_info="credential-forwarding"), - DestinationTestConfiguration(destination="redshift", staging="filesystem", file_format="jsonl", bucket_url=AWS_BUCKET, extra_info="credential-forwarding"), - DestinationTestConfiguration(destination="bigquery", staging="filesystem", file_format="jsonl", bucket_url=GCS_BUCKET, extra_info="gcs-authorization"), + DestinationTestConfiguration( + destination="redshift", + staging="filesystem", + file_format="parquet", + bucket_url=AWS_BUCKET, + extra_info="credential-forwarding", + ), + DestinationTestConfiguration( + destination="snowflake", + staging="filesystem", + file_format="parquet", + bucket_url=AWS_BUCKET, + extra_info="credential-forwarding", + ), + DestinationTestConfiguration( + destination="redshift", + staging="filesystem", + file_format="jsonl", + bucket_url=AWS_BUCKET, + 
extra_info="credential-forwarding", + ), + DestinationTestConfiguration( + destination="bigquery", + staging="filesystem", + file_format="jsonl", + bucket_url=GCS_BUCKET, + extra_info="gcs-authorization", + ), ] # add local filesystem destinations if requested if local_filesystem_configs: - destination_configs += [DestinationTestConfiguration(destination="filesystem", bucket_url=FILE_BUCKET, file_format="insert_values")] - destination_configs += [DestinationTestConfiguration(destination="filesystem", bucket_url=FILE_BUCKET, file_format="parquet")] - destination_configs += [DestinationTestConfiguration(destination="filesystem", bucket_url=FILE_BUCKET, file_format="jsonl")] + destination_configs += [ + DestinationTestConfiguration( + destination="filesystem", bucket_url=FILE_BUCKET, file_format="insert_values" + ) + ] + destination_configs += [ + DestinationTestConfiguration( + destination="filesystem", bucket_url=FILE_BUCKET, file_format="parquet" + ) + ] + destination_configs += [ + DestinationTestConfiguration( + destination="filesystem", bucket_url=FILE_BUCKET, file_format="jsonl" + ) + ] if all_buckets_filesystem_configs: for bucket in DEFAULT_BUCKETS: - destination_configs += [DestinationTestConfiguration(destination="filesystem", bucket_url=bucket, extra_info=bucket)] + destination_configs += [ + DestinationTestConfiguration( + destination="filesystem", bucket_url=bucket, extra_info=bucket + ) + ] # filter out non active destinations - destination_configs = [conf for conf in destination_configs if conf.destination in ACTIVE_DESTINATIONS] + destination_configs = [ + conf for conf in destination_configs if conf.destination in ACTIVE_DESTINATIONS + ] # filter out destinations not in subset if subset: destination_configs = [conf for conf in destination_configs if conf.destination in subset] if exclude: - destination_configs = [conf for conf in destination_configs if conf.destination not in exclude] + destination_configs = [ + conf for conf in destination_configs if conf.destination not in exclude + ] if file_format: - destination_configs = [conf for conf in destination_configs if conf.file_format == file_format] + destination_configs = [ + conf for conf in destination_configs if conf.file_format == file_format + ] # filter out excluded configs - destination_configs = [conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS] + destination_configs = [ + conf for conf in destination_configs if conf.name not in EXCLUDED_DESTINATION_CONFIGURATIONS + ] return destination_configs @@ -188,7 +341,10 @@ def get_normalized_dataset_name(client: JobClientBase) -> str: if isinstance(client.config, DestinationClientDwhConfiguration): return client.config.normalize_dataset_name(client.schema) else: - raise TypeError(f"{type(client)} client has configuration {type(client.config)} that does not support dataset name") + raise TypeError( + f"{type(client)} client has configuration {type(client.config)} that does not support" + " dataset name" + ) def load_table(name: str) -> Dict[str, TTableSchemaColumns]: @@ -196,19 +352,32 @@ def load_table(name: str) -> Dict[str, TTableSchemaColumns]: return json.load(f) -def expect_load_file(client: JobClientBase, file_storage: FileStorage, query: str, table_name: str, status = "completed") -> LoadJob: - file_name = ParsedLoadJobFileName(table_name, uniq_id(), 0, client.capabilities.preferred_loader_file_format).job_id() +def expect_load_file( + client: JobClientBase, + file_storage: FileStorage, + query: str, + table_name: str, + 
status="completed", +) -> LoadJob: + file_name = ParsedLoadJobFileName( + table_name, uniq_id(), 0, client.capabilities.preferred_loader_file_format + ).job_id() file_storage.save(file_name, query.encode("utf-8")) table = client.get_load_table(table_name) job = client.start_file_load(table, file_storage.make_full_path(file_name), uniq_id()) while job.state() == "running": sleep(0.5) assert job.file_name() == file_name - assert job.state() == status + assert job.state() == status return job -def prepare_table(client: JobClientBase, case_name: str = "event_user", table_name: str = "event_user", make_uniq_table: bool = True) -> str: +def prepare_table( + client: JobClientBase, + case_name: str = "event_user", + table_name: str = "event_user", + make_uniq_table: bool = True, +) -> str: client.schema.bump_version() client.update_stored_schema() user_table = load_table(case_name)[table_name] @@ -221,11 +390,12 @@ def prepare_table(client: JobClientBase, case_name: str = "event_user", table_na client.update_stored_schema() return user_table_name + def yield_client( destination_name: str, dataset_name: str = None, default_config_values: StrAny = None, - schema_name: str = "event" + schema_name: str = "event", ) -> Iterator[SqlJobClientBase]: os.environ.pop("DATASET_NAME", None) # import destination reference by name @@ -241,9 +411,10 @@ def yield_client( # also apply to config dest_config.update(default_config_values) # get event default schema - storage_config = resolve_configuration(SchemaStorageConfiguration(), explicit_value={ - "schema_volume_path": "tests/common/cases/schemas/rasa" - }) + storage_config = resolve_configuration( + SchemaStorageConfiguration(), + explicit_value={"schema_volume_path": "tests/common/cases/schemas/rasa"}, + ) schema_storage = SchemaStorage(storage_config) schema = schema_storage.load_schema(schema_name) # create client and dataset @@ -255,35 +426,42 @@ def yield_client( destination_name="fake-stage", dataset_name=dest_config.dataset_name, default_schema_name=dest_config.default_schema_name, - bucket_url=AWS_BUCKET + bucket_url=AWS_BUCKET, ) dest_config.staging_config = staging_config # type: ignore[attr-defined] # lookup for credentials in the section that is destination name - with Container().injectable_context(ConfigSectionContext(sections=("destination", destination_name,))): + with Container().injectable_context( + ConfigSectionContext( + sections=( + "destination", + destination_name, + ) + ) + ): with destination.client(schema, dest_config) as client: # type: ignore[assignment] yield client + @contextlib.contextmanager def cm_yield_client( destination_name: str, dataset_name: str, default_config_values: StrAny = None, - schema_name: str = "event" + schema_name: str = "event", ) -> Iterator[SqlJobClientBase]: return yield_client(destination_name, dataset_name, default_config_values, schema_name) def yield_client_with_storage( - destination_name: str, - default_config_values: StrAny = None, - schema_name: str = "event" + destination_name: str, default_config_values: StrAny = None, schema_name: str = "event" ) -> Iterator[SqlJobClientBase]: - # create dataset with random name dataset_name = "test_" + uniq_id() - with cm_yield_client(destination_name, dataset_name, default_config_values, schema_name) as client: + with cm_yield_client( + destination_name, dataset_name, default_config_values, schema_name + ) as client: client.initialize_storage() yield client # print(dataset_name) @@ -304,40 +482,49 @@ def delete_dataset(client: SqlClientBase[Any], 
normalized_dataset_name: str) -> @contextlib.contextmanager def cm_yield_client_with_storage( - destination_name: str, - default_config_values: StrAny = None, - schema_name: str = "event" + destination_name: str, default_config_values: StrAny = None, schema_name: str = "event" ) -> Iterator[SqlJobClientBase]: return yield_client_with_storage(destination_name, default_config_values, schema_name) -def write_dataset(client: JobClientBase, f: IO[bytes], rows: Union[List[Dict[str, Any]], List[StrAny]], columns_schema: TTableSchemaColumns) -> None: - data_format = DataWriter.data_format_from_file_format(client.capabilities.preferred_loader_file_format) +def write_dataset( + client: JobClientBase, + f: IO[bytes], + rows: Union[List[Dict[str, Any]], List[StrAny]], + columns_schema: TTableSchemaColumns, +) -> None: + data_format = DataWriter.data_format_from_file_format( + client.capabilities.preferred_loader_file_format + ) # adapt bytes stream to text file format if not data_format.is_binary_format and isinstance(f.read(0), bytes): f = codecs.getwriter("utf-8")(f) # type: ignore[assignment] writer = DataWriter.from_destination_capabilities(client.capabilities, f) # remove None values for idx, row in enumerate(rows): - rows[idx] = {k:v for k, v in row.items() if v is not None} + rows[idx] = {k: v for k, v in row.items() if v is not None} writer.write_all(columns_schema, rows) -def prepare_load_package(load_storage: LoadStorage, cases: Sequence[str], write_disposition: str='append') -> Tuple[str, Schema]: +def prepare_load_package( + load_storage: LoadStorage, cases: Sequence[str], write_disposition: str = "append" +) -> Tuple[str, Schema]: load_id = uniq_id() load_storage.create_temp_load_package(load_id) for case in cases: path = f"./tests/load/cases/loading/{case}" - shutil.copy(path, load_storage.storage.make_full_path(f"{load_id}/{LoadStorage.NEW_JOBS_FOLDER}")) + shutil.copy( + path, load_storage.storage.make_full_path(f"{load_id}/{LoadStorage.NEW_JOBS_FOLDER}") + ) schema_path = Path("./tests/load/cases/loading/schema.json") - data = json.loads(schema_path.read_text(encoding='utf8')) - for name, table in data['tables'].items(): - if name.startswith('_dlt'): + data = json.loads(schema_path.read_text(encoding="utf8")) + for name, table in data["tables"].items(): + if name.startswith("_dlt"): continue - table['write_disposition'] = write_disposition - Path( - load_storage.storage.make_full_path(load_id) - ).joinpath(schema_path.name).write_text(json.dumps(data), encoding='utf8') + table["write_disposition"] = write_disposition + Path(load_storage.storage.make_full_path(load_id)).joinpath(schema_path.name).write_text( + json.dumps(data), encoding="utf8" + ) schema_update_path = "./tests/load/cases/loading/schema_updates.json" shutil.copy(schema_update_path, load_storage.storage.make_full_path(load_id)) diff --git a/tests/load/weaviate/test_naming.py b/tests/load/weaviate/test_naming.py index dad7fc176f..290879cb67 100644 --- a/tests/load/weaviate/test_naming.py +++ b/tests/load/weaviate/test_naming.py @@ -5,12 +5,15 @@ from tests.common.utils import load_yml_case + @dlt.source def small(): - return dlt.resource([1,2,3], name="table") + return dlt.resource([1, 2, 3], name="table") -@pytest.mark.parametrize("n", [NamingConvention(), CINamingConvention()], ids=["naming", "ci_naming"]) +@pytest.mark.parametrize( + "n", [NamingConvention(), CINamingConvention()], ids=["naming", "ci_naming"] +) def test_table_name_normalization(n: NamingConvention) -> None: assert 
n.normalize_table_identifier("FlatSpace") == "FlatSpace" assert n.normalize_table_identifier("a_snake_case_name") == "ASnakeCaseName" diff --git a/tests/load/weaviate/test_pipeline.py b/tests/load/weaviate/test_pipeline.py index 691281c63e..a4b5098fe7 100644 --- a/tests/load/weaviate/test_pipeline.py +++ b/tests/load/weaviate/test_pipeline.py @@ -15,6 +15,7 @@ from tests.pipeline.utils import assert_load_info from .utils import assert_class, drop_active_pipeline_data + @pytest.fixture(autouse=True) def drop_weaviate_schema() -> Iterator[None]: yield @@ -75,6 +76,7 @@ def some_data(): state = client.get_stored_state("test_pipeline_append") assert state + def test_pipeline_append() -> None: generator_instance1 = sequence_generator() generator_instance2 = sequence_generator() @@ -149,7 +151,6 @@ def some_data(): def test_pipeline_replace() -> None: - generator_instance1 = sequence_generator() generator_instance2 = sequence_generator() @@ -196,16 +197,14 @@ def test_pipeline_merge() -> None: "doc_id": 1, "title": "The Shawshank Redemption", "description": ( - "Two imprisoned men find redemption through acts " - "of decency over the years." + "Two imprisoned men find redemption through acts of decency over the years." ), }, { "doc_id": 2, "title": "The Godfather", "description": ( - "A crime dynasty's aging patriarch transfers " - "control to his reluctant son." + "A crime dynasty's aging patriarch transfers control to his reluctant son." ), }, { @@ -310,20 +309,39 @@ def test_merge_github_nested() -> None: p = dlt.pipeline(destination="weaviate", dataset_name="github1", full_refresh=True) assert p.dataset_name.startswith("github1_202") - with open("tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8") as f: + with open( + "tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8" + ) as f: data = json.load(f) info = p.run( - weaviate_adapter(data[:17], vectorize=["title", "body"], tokenization={"user__login": "lowercase"}), + weaviate_adapter( + data[:17], vectorize=["title", "body"], tokenization={"user__login": "lowercase"} + ), table_name="issues", write_disposition="merge", - primary_key="id" + primary_key="id", ) assert_load_info(info) # assert if schema contains tables with right names - assert set(p.default_schema.tables.keys()) == {'DltVersion', 'DltLoads', 'Issues', 'DltPipelineState', 'Issues__Labels', 'Issues__Assignees'} - assert set([t["name"] for t in p.default_schema.data_tables()]) == {'Issues', 'Issues__Labels', 'Issues__Assignees'} - assert set([t["name"] for t in p.default_schema.dlt_tables()]) == {'DltVersion', 'DltLoads', 'DltPipelineState'} + assert set(p.default_schema.tables.keys()) == { + "DltVersion", + "DltLoads", + "Issues", + "DltPipelineState", + "Issues__Labels", + "Issues__Assignees", + } + assert set([t["name"] for t in p.default_schema.data_tables()]) == { + "Issues", + "Issues__Labels", + "Issues__Assignees", + } + assert set([t["name"] for t in p.default_schema.dlt_tables()]) == { + "DltVersion", + "DltLoads", + "DltPipelineState", + } issues = p.default_schema.tables["Issues"] # make sure that both "id" column and "primary_key" were changed to __id assert issues["columns"]["__id"]["primary_key"] is True @@ -369,7 +387,11 @@ def test_vectorize_property_without_data() -> None: # here we increase the abuse and try to vectorize a `Value` field, where in the data there's `value` # in standard naming convention this results in property conflict with pytest.raises(PipelineStepFailed) as pipe_ex: - 
p.run(weaviate_adapter(["a", "b", "c"], vectorize="vAlue"), primary_key="vAlue", columns={"vAlue": {"data_type": "text"}}) + p.run( + weaviate_adapter(["a", "b", "c"], vectorize="vAlue"), + primary_key="vAlue", + columns={"vAlue": {"data_type": "text"}}, + ) assert isinstance(pipe_ex.value.__context__, PropertyNameConflict) # set the naming convention to case insensitive @@ -377,7 +399,11 @@ def test_vectorize_property_without_data() -> None: dlt.config["schema.naming"] = "dlt.destinations.impl.weaviate.ci_naming" # create new schema with changed naming convention p = p.drop() - info = p.run(weaviate_adapter(["there are", "no stop", "words in here"], vectorize="vAlue"), primary_key="vALue", columns={"vAlue": {"data_type": "text"}}) + info = p.run( + weaviate_adapter(["there are", "no stop", "words in here"], vectorize="vAlue"), + primary_key="vALue", + columns={"vAlue": {"data_type": "text"}}, + ) # dataset in load info is empty assert_load_info(info) # print(p.default_schema.to_pretty_yaml()) diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index ca9d853d98..48153f7706 100644 --- a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -14,12 +14,19 @@ from dlt.common.storages.file_storage import FileStorage from dlt.common.schema.utils import new_table -from tests.load.utils import TABLE_ROW_ALL_DATA_TYPES, TABLE_UPDATE, TABLE_UPDATE_COLUMNS_SCHEMA, expect_load_file, write_dataset +from tests.load.utils import ( + TABLE_ROW_ALL_DATA_TYPES, + TABLE_UPDATE, + TABLE_UPDATE_COLUMNS_SCHEMA, + expect_load_file, + write_dataset, +) from tests.utils import TEST_STORAGE_ROOT from .utils import drop_active_pipeline_data + @pytest.fixture(autouse=True) def drop_weaviate_schema() -> Iterator[None]: yield @@ -33,21 +40,21 @@ def get_client_instance(schema: Schema) -> WeaviateClient: # return dest.client(schema, config) -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def client() -> Iterator[WeaviateClient]: yield from make_client("naming") -@pytest.fixture(scope='function') +@pytest.fixture(scope="function") def ci_client() -> Iterator[WeaviateClient]: yield from make_client("ci_naming") def make_client(naming_convention: str) -> Iterator[WeaviateClient]: - schema = Schema('test_schema', { - 'names': f"dlt.destinations.impl.weaviate.{naming_convention}", - 'json': None - }) + schema = Schema( + "test_schema", + {"names": f"dlt.destinations.impl.weaviate.{naming_convention}", "json": None}, + ) _client = get_client_instance(schema) try: yield _client @@ -60,11 +67,15 @@ def file_storage() -> FileStorage: return FileStorage(TEST_STORAGE_ROOT, file_type="b", makedirs=True) -@pytest.mark.parametrize('write_disposition', ["append", "replace", "merge"]) -def test_all_data_types(client: WeaviateClient, write_disposition: TWriteDisposition, file_storage: FileStorage) -> None: +@pytest.mark.parametrize("write_disposition", ["append", "replace", "merge"]) +def test_all_data_types( + client: WeaviateClient, write_disposition: TWriteDisposition, file_storage: FileStorage +) -> None: class_name = "AllTypes" # we should have identical content with all disposition types - client.schema.update_table(new_table(class_name, write_disposition=write_disposition, columns=TABLE_UPDATE)) + client.schema.update_table( + new_table(class_name, write_disposition=write_disposition, columns=TABLE_UPDATE) + ) client.schema.bump_version() client.update_stored_schema() @@ -86,25 +97,22 @@ def 
test_all_data_types(client: WeaviateClient, write_disposition: TWriteDisposi elif TABLE_UPDATE_COLUMNS_SCHEMA[col_name]["data_type"] == "date": assert table_columns[col_name]["data_type"] == "timestamp" else: - assert table_columns[col_name]["data_type"] == TABLE_UPDATE_COLUMNS_SCHEMA[col_name]["data_type"] + assert ( + table_columns[col_name]["data_type"] + == TABLE_UPDATE_COLUMNS_SCHEMA[col_name]["data_type"] + ) def test_case_sensitive_properties_create(client: WeaviateClient) -> None: class_name = "col_class" # we have two properties which will map to the same name in Weaviate table_create: List[TColumnSchema] = [ - { - "name": "col1", - "data_type": "bigint", - "nullable": False - }, - { - "name": "coL1", - "data_type": "double", - "nullable": False - }, + {"name": "col1", "data_type": "bigint", "nullable": False}, + {"name": "coL1", "data_type": "double", "nullable": False}, ] - client.schema.update_table(client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))) + client.schema.update_table( + client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) + ) client.schema.bump_version() with pytest.raises(PropertyNameConflict): client.update_stored_schema() @@ -114,38 +122,25 @@ def test_case_insensitive_properties_create(ci_client: WeaviateClient) -> None: class_name = "col_class" # we have two properties which will map to the same name in Weaviate table_create: List[TColumnSchema] = [ - { - "name": "col1", - "data_type": "bigint", - "nullable": False - }, - { - "name": "coL1", - "data_type": "double", - "nullable": False - }, + {"name": "col1", "data_type": "bigint", "nullable": False}, + {"name": "coL1", "data_type": "double", "nullable": False}, ] - ci_client.schema.update_table(ci_client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create))) + ci_client.schema.update_table( + ci_client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) + ) ci_client.schema.bump_version() ci_client.update_stored_schema() _, table_columns = ci_client.get_storage_table("ColClass") # later column overwrites earlier one so: double - assert table_columns == {'col1': {'name': 'col1', 'data_type': 'double'}} + assert table_columns == {"col1": {"name": "col1", "data_type": "double"}} def test_case_sensitive_properties_add(client: WeaviateClient) -> None: class_name = "col_class" # we have two properties which will map to the same name in Weaviate - table_create: List[TColumnSchema] = [{ - "name": "col1", - "data_type": "bigint", - "nullable": False - }] - table_update: List[TColumnSchema] = [{ - "name": "coL1", - "data_type": "double", - "nullable": False - }, + table_create: List[TColumnSchema] = [{"name": "col1", "data_type": "bigint", "nullable": False}] + table_update: List[TColumnSchema] = [ + {"name": "coL1", "data_type": "double", "nullable": False}, ] client.schema.update_table( client.schema.normalize_table_identifiers(new_table(class_name, columns=table_create)) @@ -167,12 +162,9 @@ def test_case_sensitive_properties_add(client: WeaviateClient) -> None: def test_load_case_sensitive_data(client: WeaviateClient, file_storage: FileStorage) -> None: class_name = "col_class" # we have two properties which will map to the same name in Weaviate - table_create: TTableSchemaColumns = {"col1": - { - "name": "col1", - "data_type": "bigint", - "nullable": False - }} + table_create: TTableSchemaColumns = { + "col1": {"name": "col1", "data_type": "bigint", "nullable": False} + } 
client.schema.update_table(new_table(class_name, columns=[table_create["col1"]])) client.schema.bump_version() client.update_stored_schema() @@ -189,19 +181,18 @@ def test_load_case_sensitive_data(client: WeaviateClient, file_storage: FileStor def test_load_case_sensitive_data_ci(ci_client: WeaviateClient, file_storage: FileStorage) -> None: class_name = "col_class" # we have two properties which will map to the same name in Weaviate - table_create: TTableSchemaColumns = {"col1": - { - "name": "col1", - "data_type": "bigint", - "nullable": False - }} + table_create: TTableSchemaColumns = { + "col1": {"name": "col1", "data_type": "bigint", "nullable": False} + } ci_client.schema.update_table(new_table(class_name, columns=[table_create["col1"]])) ci_client.schema.bump_version() ci_client.update_stored_schema() # prepare a data item where is name clash due to Weaviate being CI # but here we normalize the item data_clash = list( - ci_client.schema.normalize_data_item({"col1": 72187328, "coL1": 726171}, "_load_id_", "col_class") + ci_client.schema.normalize_data_item( + {"col1": 72187328, "coL1": 726171}, "_load_id_", "col_class" + ) )[0][1] # write row @@ -212,4 +203,4 @@ def test_load_case_sensitive_data_ci(ci_client: WeaviateClient, file_storage: Fi response = ci_client.query_class(class_name, ["col1"]).do() objects = response["data"]["Get"][ci_client.make_qualified_class_name(class_name)] # the latter of conflicting fields is stored (so data is lost) - assert objects == [{'col1': 726171}] + assert objects == [{"col1": 726171}] diff --git a/tests/normalize/mock_rasa_json_normalizer.py b/tests/normalize/mock_rasa_json_normalizer.py index c54992dc0b..f911c55493 100644 --- a/tests/normalize/mock_rasa_json_normalizer.py +++ b/tests/normalize/mock_rasa_json_normalizer.py @@ -5,14 +5,21 @@ class DataItemNormalizer(RelationalNormalizer): - - def normalize_data_item(self, source_event: TDataItem, load_id: str, table_name: str) -> TNormalizedRowIterator: + def normalize_data_item( + self, source_event: TDataItem, load_id: str, table_name: str + ) -> TNormalizedRowIterator: if self.schema.name == "event": # this emulates rasa parser on standard parser - event = {"sender_id": source_event["sender_id"], "timestamp": source_event["timestamp"], "type": source_event["event"]} + event = { + "sender_id": source_event["sender_id"], + "timestamp": source_event["timestamp"], + "type": source_event["event"], + } yield from super().normalize_data_item(event, load_id, table_name) # add table name which is "event" field in RASA OSS - yield from super().normalize_data_item(source_event, load_id, table_name + "_" + source_event["event"]) + yield from super().normalize_data_item( + source_event, load_id, table_name + "_" + source_event["event"] + ) else: # will generate tables properly yield from super().normalize_data_item(source_event, load_id, table_name) diff --git a/tests/normalize/test_normalize.py b/tests/normalize/test_normalize.py index 12b6267a59..d0f02bdfa1 100644 --- a/tests/normalize/test_normalize.py +++ b/tests/normalize/test_normalize.py @@ -1,6 +1,7 @@ import pytest from fnmatch import fnmatch from typing import Dict, Iterator, List, Sequence, Tuple + # from multiprocessing import get_start_method, Pool # from multiprocessing.dummy import Pool as ThreadPool from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor @@ -18,8 +19,19 @@ from dlt.normalize import Normalize from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_TYPES -from tests.utils import TEST_DICT_CONFIG_PROVIDER, 
assert_no_dict_key_starts_with, clean_test_storage, init_test_logging -from tests.normalize.utils import json_case_path, INSERT_CAPS, JSONL_CAPS, DEFAULT_CAPS, ALL_CAPABILITIES +from tests.utils import ( + TEST_DICT_CONFIG_PROVIDER, + assert_no_dict_key_starts_with, + clean_test_storage, + init_test_logging, +) +from tests.normalize.utils import ( + json_case_path, + INSERT_CAPS, + JSONL_CAPS, + DEFAULT_CAPS, + ALL_CAPABILITIES, +) @pytest.fixture(scope="module", autouse=True) @@ -57,7 +69,9 @@ def rasa_normalize() -> Iterator[Normalize]: def init_normalize(default_schemas_path: str = None) -> Iterator[Normalize]: clean_test_storage() # pass schema config fields to schema storage via dict config provider - with TEST_DICT_CONFIG_PROVIDER().values({"import_schema_path": default_schemas_path, "external_schema_format": "json"}): + with TEST_DICT_CONFIG_PROVIDER().values( + {"import_schema_path": default_schemas_path, "external_schema_format": "json"} + ): # inject the destination capabilities n = Normalize() yield n @@ -74,8 +88,12 @@ def test_initialize(rasa_normalize: Normalize) -> None: @pytest.mark.parametrize("caps", JSONL_CAPS, indirect=True) -def test_normalize_single_user_event_jsonl(caps: DestinationCapabilitiesContext, raw_normalize: Normalize) -> None: - expected_tables, load_files = normalize_event_user(raw_normalize, "event.event.user_load_1", EXPECTED_USER_TABLES) +def test_normalize_single_user_event_jsonl( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: + expected_tables, load_files = normalize_event_user( + raw_normalize, "event.event.user_load_1", EXPECTED_USER_TABLES + ) # load, parse and verify jsonl for expected_table in expected_tables: get_line_from_file(raw_normalize.load_storage, load_files[expected_table]) @@ -86,7 +104,11 @@ def test_normalize_single_user_event_jsonl(caps: DestinationCapabilitiesContext, assert event_json["event"] == "user" assert event_json["parse_data__intent__name"] == "greet" assert event_json["text"] == "hello" - event_text, lines = get_line_from_file(raw_normalize.load_storage, load_files["event__parse_data__response_selector__default__ranking"], 9) + event_text, lines = get_line_from_file( + raw_normalize.load_storage, + load_files["event__parse_data__response_selector__default__ranking"], + 9, + ) assert lines == 10 event_json = json.loads(event_text) assert "id" in event_json @@ -95,31 +117,47 @@ def test_normalize_single_user_event_jsonl(caps: DestinationCapabilitiesContext, @pytest.mark.parametrize("caps", INSERT_CAPS, indirect=True) -def test_normalize_single_user_event_insert(caps: DestinationCapabilitiesContext, raw_normalize: Normalize) -> None: +def test_normalize_single_user_event_insert( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: # mock_destination_caps(raw_normalize, caps) - expected_tables, load_files = normalize_event_user(raw_normalize, "event.event.user_load_1", EXPECTED_USER_TABLES) + expected_tables, load_files = normalize_event_user( + raw_normalize, "event.event.user_load_1", EXPECTED_USER_TABLES + ) # verify values line for expected_table in expected_tables: get_line_from_file(raw_normalize.load_storage, load_files[expected_table]) # return first values line from event_user file event_text, lines = get_line_from_file(raw_normalize.load_storage, load_files["event"], 2) assert lines == 3 - assert "'user'" in event_text + assert "'user'" in event_text assert "'greet'" in event_text assert "'hello'" in event_text - event_text, lines = 
get_line_from_file(raw_normalize.load_storage, load_files["event__parse_data__response_selector__default__ranking"], 11) + event_text, lines = get_line_from_file( + raw_normalize.load_storage, + load_files["event__parse_data__response_selector__default__ranking"], + 11, + ) assert lines == 12 assert "(7005479104644416710," in event_text @pytest.mark.parametrize("caps", JSONL_CAPS, indirect=True) -def test_normalize_filter_user_event(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: +def test_normalize_filter_user_event( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: load_id = extract_and_normalize_cases(rasa_normalize, ["event.event.user_load_v228_1"]) _, load_files = expect_load_package( rasa_normalize.load_storage, load_id, - ["event", "event_user", "event_user__metadata__user_nicknames", - "event_user__parse_data__entities", "event_user__parse_data__entities__processors", "event_user__parse_data__intent_ranking"] + [ + "event", + "event_user", + "event_user__metadata__user_nicknames", + "event_user__parse_data__entities", + "event_user__parse_data__entities__processors", + "event_user__parse_data__intent_ranking", + ], ) event_text, lines = get_line_from_file(rasa_normalize.load_storage, load_files["event_user"], 0) assert lines == 1 @@ -130,9 +168,15 @@ def test_normalize_filter_user_event(caps: DestinationCapabilitiesContext, rasa_ @pytest.mark.parametrize("caps", JSONL_CAPS, indirect=True) -def test_normalize_filter_bot_event(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: - load_id = extract_and_normalize_cases(rasa_normalize, ["event.event.bot_load_metadata_2987398237498798"]) - _, load_files = expect_load_package(rasa_normalize.load_storage, load_id, ["event", "event_bot"]) +def test_normalize_filter_bot_event( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: + load_id = extract_and_normalize_cases( + rasa_normalize, ["event.event.bot_load_metadata_2987398237498798"] + ) + _, load_files = expect_load_package( + rasa_normalize.load_storage, load_id, ["event", "event_bot"] + ) event_text, lines = get_line_from_file(rasa_normalize.load_storage, load_files["event_bot"], 0) assert lines == 1 filtered_row = json.loads(event_text) @@ -141,35 +185,41 @@ def test_normalize_filter_bot_event(caps: DestinationCapabilitiesContext, rasa_n @pytest.mark.parametrize("caps", JSONL_CAPS, indirect=True) -def test_preserve_slot_complex_value_json_l(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: +def test_preserve_slot_complex_value_json_l( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: load_id = extract_and_normalize_cases(rasa_normalize, ["event.event.slot_session_metadata_1"]) - _, load_files = expect_load_package(rasa_normalize.load_storage, load_id, ["event", "event_slot"]) + _, load_files = expect_load_package( + rasa_normalize.load_storage, load_id, ["event", "event_slot"] + ) event_text, lines = get_line_from_file(rasa_normalize.load_storage, load_files["event_slot"], 0) assert lines == 1 filtered_row = json.loads(event_text) assert type(filtered_row["value"]) is dict - assert filtered_row["value"] == { - "user_id": "world", - "mitter_id": "hello" - } + assert filtered_row["value"] == {"user_id": "world", "mitter_id": "hello"} @pytest.mark.parametrize("caps", INSERT_CAPS, indirect=True) -def test_preserve_slot_complex_value_insert(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: +def 
test_preserve_slot_complex_value_insert( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: load_id = extract_and_normalize_cases(rasa_normalize, ["event.event.slot_session_metadata_1"]) - _, load_files = expect_load_package(rasa_normalize.load_storage, load_id, ["event", "event_slot"]) + _, load_files = expect_load_package( + rasa_normalize.load_storage, load_id, ["event", "event_slot"] + ) event_text, lines = get_line_from_file(rasa_normalize.load_storage, load_files["event_slot"], 2) assert lines == 3 - c_val = json.dumps({ - "user_id": "world", - "mitter_id": "hello" - }) + c_val = json.dumps({"user_id": "world", "mitter_id": "hello"}) assert c_val in event_text @pytest.mark.parametrize("caps", INSERT_CAPS, indirect=True) -def test_normalize_many_events_insert(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: - load_id = extract_and_normalize_cases(rasa_normalize, ["event.event.many_load_2", "event.event.user_load_1"]) +def test_normalize_many_events_insert( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: + load_id = extract_and_normalize_cases( + rasa_normalize, ["event.event.many_load_2", "event.event.user_load_1"] + ) expected_tables = EXPECTED_USER_TABLES_RASA_NORMALIZER + ["event_bot", "event_action"] _, load_files = expect_load_package(rasa_normalize.load_storage, load_id, expected_tables) # return first values line from event_user file @@ -180,8 +230,12 @@ def test_normalize_many_events_insert(caps: DestinationCapabilitiesContext, rasa @pytest.mark.parametrize("caps", JSONL_CAPS, indirect=True) -def test_normalize_many_events(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: - load_id = extract_and_normalize_cases(rasa_normalize, ["event.event.many_load_2", "event.event.user_load_1"]) +def test_normalize_many_events( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: + load_id = extract_and_normalize_cases( + rasa_normalize, ["event.event.many_load_2", "event.event.user_load_1"] + ) expected_tables = EXPECTED_USER_TABLES_RASA_NORMALIZER + ["event_bot", "event_action"] _, load_files = expect_load_package(rasa_normalize.load_storage, load_id, expected_tables) # return first values line from event_user file @@ -192,22 +246,26 @@ def test_normalize_many_events(caps: DestinationCapabilitiesContext, rasa_normal @pytest.mark.parametrize("caps", ALL_CAPABILITIES, indirect=True) -def test_normalize_raw_no_type_hints(caps: DestinationCapabilitiesContext, raw_normalize: Normalize) -> None: +def test_normalize_raw_no_type_hints( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: normalize_event_user(raw_normalize, "event.event.user_load_1", EXPECTED_USER_TABLES) assert_timestamp_data_type(raw_normalize.load_storage, "double") @pytest.mark.parametrize("caps", ALL_CAPABILITIES, indirect=True) -def test_normalize_raw_type_hints(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: +def test_normalize_raw_type_hints( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: extract_and_normalize_cases(rasa_normalize, ["event.event.user_load_1"]) assert_timestamp_data_type(rasa_normalize.load_storage, "timestamp") + @pytest.mark.parametrize("caps", ALL_CAPABILITIES, indirect=True) -def test_multiprocess_row_counting(caps: DestinationCapabilitiesContext, raw_normalize: Normalize) -> None: - extract_cases( - raw_normalize.normalize_storage, - ["github.events.load_page_1_duck"] - ) +def 
test_multiprocess_row_counting( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: + extract_cases(raw_normalize.normalize_storage, ["github.events.load_page_1_duck"]) # use real process pool in tests with ProcessPoolExecutor(max_workers=4) as p: raw_normalize.run(p) @@ -217,10 +275,16 @@ def test_multiprocess_row_counting(caps: DestinationCapabilitiesContext, raw_nor @pytest.mark.parametrize("caps", ALL_CAPABILITIES, indirect=True) -def test_normalize_many_schemas(caps: DestinationCapabilitiesContext, rasa_normalize: Normalize) -> None: +def test_normalize_many_schemas( + caps: DestinationCapabilitiesContext, rasa_normalize: Normalize +) -> None: extract_cases( rasa_normalize.normalize_storage, - ["event.event.many_load_2", "event.event.user_load_1", "ethereum.blocks.9c1d9b504ea240a482b007788d5cd61c_2"] + [ + "event.event.many_load_2", + "event.event.user_load_1", + "ethereum.blocks.9c1d9b504ea240a482b007788d5cd61c_2", + ], ) # use real process pool in tests with ProcessPoolExecutor(max_workers=4) as p: @@ -238,12 +302,16 @@ def test_normalize_many_schemas(caps: DestinationCapabilitiesContext, rasa_norma expected_tables = EXPECTED_USER_TABLES_RASA_NORMALIZER + ["event_bot", "event_action"] expect_load_package(rasa_normalize.load_storage, load_id, expected_tables) if schema.name == "ethereum": - expect_load_package(rasa_normalize.load_storage, load_id, EXPECTED_ETH_TABLES, full_schema_update=False) + expect_load_package( + rasa_normalize.load_storage, load_id, EXPECTED_ETH_TABLES, full_schema_update=False + ) assert set(schemas) == set(["ethereum", "event"]) @pytest.mark.parametrize("caps", ALL_CAPABILITIES, indirect=True) -def test_normalize_typed_json(caps: DestinationCapabilitiesContext, raw_normalize: Normalize) -> None: +def test_normalize_typed_json( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: extract_items(raw_normalize.normalize_storage, [JSON_TYPED_DICT], "special", "special") with ThreadPoolExecutor(max_workers=1) as pool: raw_normalize.run(pool) @@ -296,16 +364,30 @@ def test_schema_changes(caps: DestinationCapabilitiesContext, raw_normalize: Nor assert len(table_files["doc__comp"]) == 1 s = raw_normalize.load_or_create_schema(raw_normalize.schema_storage, "evolution") doc_table = s.get_table("doc") - assert {"_dlt_load_id", "_dlt_id", "str", "int", "bool", "int__v_text"} == set(doc_table["columns"].keys()) + assert {"_dlt_load_id", "_dlt_id", "str", "int", "bool", "int__v_text"} == set( + doc_table["columns"].keys() + ) doc__comp_table = s.get_table("doc__comp") assert doc__comp_table["parent"] == "doc" - assert {"_dlt_id", "_dlt_list_idx", "_dlt_parent_id", "str", "int", "bool", "int__v_text"} == set(doc__comp_table["columns"].keys()) + assert { + "_dlt_id", + "_dlt_list_idx", + "_dlt_parent_id", + "str", + "int", + "bool", + "int__v_text", + } == set(doc__comp_table["columns"].keys()) @pytest.mark.parametrize("caps", ALL_CAPABILITIES, indirect=True) -def test_normalize_twice_with_flatten(caps: DestinationCapabilitiesContext, raw_normalize: Normalize) -> None: +def test_normalize_twice_with_flatten( + caps: DestinationCapabilitiesContext, raw_normalize: Normalize +) -> None: load_id = extract_and_normalize_cases(raw_normalize, ["github.issues.load_page_5_duck"]) - _, table_files = expect_load_package(raw_normalize.load_storage, load_id, ["issues", "issues__labels", "issues__assignees"]) + _, table_files = expect_load_package( + raw_normalize.load_storage, load_id, ["issues", "issues__labels", "issues__assignees"] + ) 
assert len(table_files["issues"]) == 1 _, lines = get_line_from_file(raw_normalize.load_storage, table_files["issues"], 0) # insert writer adds 2 lines @@ -318,12 +400,16 @@ def assert_schema(_schema: Schema): assert "reactions__x1" in _schema.tables["issues"]["columns"] assert "reactions__1" not in _schema.tables["issues"]["columns"] - schema = raw_normalize.load_or_create_schema(raw_normalize.schema_storage, "github") assert_schema(schema) load_id = extract_and_normalize_cases(raw_normalize, ["github.issues.load_page_5_duck"]) - _, table_files = expect_load_package(raw_normalize.load_storage, load_id, ["issues", "issues__labels", "issues__assignees"], full_schema_update=False) + _, table_files = expect_load_package( + raw_normalize.load_storage, + load_id, + ["issues", "issues__labels", "issues__assignees"], + full_schema_update=False, + ) assert len(table_files["issues"]) == 1 _, lines = get_line_from_file(raw_normalize.load_storage, table_files["issues"], 0) # insert writer adds 2 lines @@ -333,35 +419,74 @@ def assert_schema(_schema: Schema): def test_group_worker_files() -> None: - files = ["f%03d" % idx for idx in range(0, 100)] assert Normalize.group_worker_files([], 4) == [] assert Normalize.group_worker_files(["f001"], 1) == [["f001"]] assert Normalize.group_worker_files(["f001"], 100) == [["f001"]] assert Normalize.group_worker_files(files[:4], 4) == [["f000"], ["f001"], ["f002"], ["f003"]] - assert Normalize.group_worker_files(files[:5], 4) == [["f000"], ["f001"], ["f002"], ["f003", "f004"]] - assert Normalize.group_worker_files(files[:8], 4) == [["f000", "f001"], ["f002", "f003"], ["f004", "f005"], ["f006", "f007"]] - assert Normalize.group_worker_files(files[:8], 3) == [["f000", "f001"], ["f002", "f003", "f006"], ["f004", "f005", "f007"]] - assert Normalize.group_worker_files(files[:5], 3) == [["f000"], ["f001", "f003"], ["f002", "f004"]] + assert Normalize.group_worker_files(files[:5], 4) == [ + ["f000"], + ["f001"], + ["f002"], + ["f003", "f004"], + ] + assert Normalize.group_worker_files(files[:8], 4) == [ + ["f000", "f001"], + ["f002", "f003"], + ["f004", "f005"], + ["f006", "f007"], + ] + assert Normalize.group_worker_files(files[:8], 3) == [ + ["f000", "f001"], + ["f002", "f003", "f006"], + ["f004", "f005", "f007"], + ] + assert Normalize.group_worker_files(files[:5], 3) == [ + ["f000"], + ["f001", "f003"], + ["f002", "f004"], + ] # check if sorted files = ["tab1.1", "chd.3", "tab1.2", "chd.4", "tab1.3"] - assert Normalize.group_worker_files(files, 3) == [["chd.3"], ["chd.4", "tab1.2"], ["tab1.1", "tab1.3"]] - - -EXPECTED_ETH_TABLES = ["blocks", "blocks__transactions", "blocks__transactions__logs", "blocks__transactions__logs__topics", - "blocks__uncles", "blocks__transactions__access_list", "blocks__transactions__access_list__storage_keys"] - -EXPECTED_USER_TABLES_RASA_NORMALIZER = ["event", "event_user", "event_user__parse_data__intent_ranking"] - - -EXPECTED_USER_TABLES = ["event", "event__parse_data__intent_ranking", "event__parse_data__response_selector__all_retrieval_intents", - "event__parse_data__response_selector__default__ranking", "event__parse_data__response_selector__default__response__response_templates", - "event__parse_data__response_selector__default__response__responses"] - - -def extract_items(normalize_storage: NormalizeStorage, items: Sequence[StrAny], schema_name: str, table_name: str) -> None: + assert Normalize.group_worker_files(files, 3) == [ + ["chd.3"], + ["chd.4", "tab1.2"], + ["tab1.1", "tab1.3"], + ] + + +EXPECTED_ETH_TABLES = [ + 
"blocks", + "blocks__transactions", + "blocks__transactions__logs", + "blocks__transactions__logs__topics", + "blocks__uncles", + "blocks__transactions__access_list", + "blocks__transactions__access_list__storage_keys", +] + +EXPECTED_USER_TABLES_RASA_NORMALIZER = [ + "event", + "event_user", + "event_user__parse_data__intent_ranking", +] + + +EXPECTED_USER_TABLES = [ + "event", + "event__parse_data__intent_ranking", + "event__parse_data__response_selector__all_retrieval_intents", + "event__parse_data__response_selector__default__ranking", + "event__parse_data__response_selector__default__response__response_templates", + "event__parse_data__response_selector__default__response__responses", +] + + +def extract_items( + normalize_storage: NormalizeStorage, items: Sequence[StrAny], schema_name: str, table_name: str +) -> None: extractor = ExtractorStorage(normalize_storage.config) extract_id = extractor.create_extract_id() extractor.write_data_item("puae-jsonl", extract_id, schema_name, table_name, items, None) @@ -369,7 +494,9 @@ def extract_items(normalize_storage: NormalizeStorage, items: Sequence[StrAny], extractor.commit_extract_files(extract_id) -def normalize_event_user(normalize: Normalize, case: str, expected_user_tables: List[str] = None) -> Tuple[List[str], Dict[str, List[str]]]: +def normalize_event_user( + normalize: Normalize, case: str, expected_user_tables: List[str] = None +) -> Tuple[List[str], Dict[str, List[str]]]: expected_user_tables = expected_user_tables or EXPECTED_USER_TABLES_RASA_NORMALIZER load_id = extract_and_normalize_cases(normalize, [case]) return expect_load_package(normalize.load_storage, load_id, expected_user_tables) @@ -399,12 +526,20 @@ def extract_cases(normalize_storage: NormalizeStorage, cases: Sequence[str]) -> extract_items(normalize_storage, items, schema_name, table_name) -def expect_load_package(load_storage: LoadStorage, load_id: str, expected_tables: Sequence[str], full_schema_update: bool = True) -> Tuple[List[str], Dict[str, List[str]]]: +def expect_load_package( + load_storage: LoadStorage, + load_id: str, + expected_tables: Sequence[str], + full_schema_update: bool = True, +) -> Tuple[List[str], Dict[str, List[str]]]: # normalize tables as paths (original json is snake case so we may do it without real lineage info) schema = load_storage.load_package_schema(load_id) # we are still in destination caps context so schema contains length assert schema.naming.max_length > 0 - expected_tables = [schema.naming.shorten_fragments(*schema.naming.break_path(table)) for table in expected_tables] + expected_tables = [ + schema.naming.shorten_fragments(*schema.naming.break_path(table)) + for table in expected_tables + ] # find jobs and processed files files = load_storage.list_new_jobs(load_id) @@ -428,7 +563,9 @@ def expect_load_package(load_storage: LoadStorage, load_id: str, expected_tables return expected_tables, ofl -def get_line_from_file(load_storage: LoadStorage, loaded_files: List[str], return_line: int = 0) -> Tuple[str, int]: +def get_line_from_file( + load_storage: LoadStorage, loaded_files: List[str], return_line: int = 0 +) -> Tuple[str, int]: lines = [] for file in loaded_files: with load_storage.storage.open_file(file) as f: diff --git a/tests/pipeline/cases/github_pipeline/github_extract.py b/tests/pipeline/cases/github_pipeline/github_extract.py index 74ff99033f..9bf7944b53 100644 --- a/tests/pipeline/cases/github_pipeline/github_extract.py +++ b/tests/pipeline/cases/github_pipeline/github_extract.py @@ -5,7 +5,9 @@ from github_pipeline 
import github # type: ignore[import-not-found] if __name__ == "__main__": - p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3", full_refresh=False) + p = dlt.pipeline( + "dlt_github_pipeline", destination="duckdb", dataset_name="github_3", full_refresh=False + ) github_source = github() if len(sys.argv) > 1: # load only N issues diff --git a/tests/pipeline/cases/github_pipeline/github_pipeline.py b/tests/pipeline/cases/github_pipeline/github_pipeline.py index 6d19709947..789fcc4ef3 100644 --- a/tests/pipeline/cases/github_pipeline/github_pipeline.py +++ b/tests/pipeline/cases/github_pipeline/github_pipeline.py @@ -4,20 +4,29 @@ from dlt.common import json + @dlt.source(root_key=True) def github(): - - @dlt.resource(table_name="issues", write_disposition="merge", primary_key="id", merge_key=("node_id", "url")) + @dlt.resource( + table_name="issues", + write_disposition="merge", + primary_key="id", + merge_key=("node_id", "url"), + ) def load_issues(): # we should be in TEST_STORAGE folder - with open("../tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8") as f: + with open( + "../tests/normalize/cases/github.issues.load_page_5_duck.json", "r", encoding="utf-8" + ) as f: yield from json.load(f) return load_issues if __name__ == "__main__": - p = dlt.pipeline("dlt_github_pipeline", destination="duckdb", dataset_name="github_3", full_refresh=False) + p = dlt.pipeline( + "dlt_github_pipeline", destination="duckdb", dataset_name="github_3", full_refresh=False + ) github_source = github() if len(sys.argv) > 1: # load only N issues diff --git a/tests/pipeline/conftest.py b/tests/pipeline/conftest.py index a9a94230a2..f6c47e35b1 100644 --- a/tests/pipeline/conftest.py +++ b/tests/pipeline/conftest.py @@ -1,2 +1,8 @@ -from tests.utils import preserve_environ, autouse_test_storage, patch_home_dir, wipe_pipeline, duckdb_pipeline_location -from tests.pipeline.utils import drop_dataset_from_env \ No newline at end of file +from tests.utils import ( + preserve_environ, + autouse_test_storage, + patch_home_dir, + wipe_pipeline, + duckdb_pipeline_location, +) +from tests.pipeline.utils import drop_dataset_from_env diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index 686ad2ffd3..1a877818e3 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -18,9 +18,16 @@ from tests.utils import preserve_environ - @pytest.mark.parametrize( - ("item_type", "is_list"), [("pandas", False), ("table", False), ("record_batch", False), ("pandas", True), ("table", True), ("record_batch", True)] + ("item_type", "is_list"), + [ + ("pandas", False), + ("table", False), + ("record_batch", False), + ("pandas", True), + ("table", True), + ("record_batch", True), + ], ) def test_extract_and_normalize(item_type: TArrowFormat, is_list: bool): item, records = arrow_table_all_data_types(item_type) @@ -34,25 +41,26 @@ def some_data(): else: yield item - pipeline.extract(some_data()) norm_storage = pipeline._get_normalize_storage() - extract_files = [fn for fn in norm_storage.list_files_to_normalize_sorted() if fn.endswith(".parquet")] + extract_files = [ + fn for fn in norm_storage.list_files_to_normalize_sorted() if fn.endswith(".parquet") + ] assert len(extract_files) == 1 - with norm_storage.storage.open_file(extract_files[0], 'rb') as f: + with norm_storage.storage.open_file(extract_files[0], "rb") as f: extracted_bytes = f.read() info = pipeline.normalize() - assert 
info.row_counts['some_data'] == len(records) + assert info.row_counts["some_data"] == len(records) load_id = pipeline.list_normalized_load_packages()[0] storage = pipeline._get_load_storage() jobs = storage.list_new_jobs(load_id) job = [j for j in jobs if "some_data" in j][0] - with storage.storage.open_file(job, 'rb') as f: + with storage.storage.open_file(job, "rb") as f: normalized_bytes = f.read() # Normalized is linked/copied exactly and should be the same as the extracted file @@ -77,23 +85,31 @@ def some_data(): schema = pipeline.default_schema # Check schema detection - schema_columns = schema.tables['some_data']['columns'] + schema_columns = schema.tables["some_data"]["columns"] assert set(df_tbl.columns) == set(schema_columns) - assert schema_columns['date']['data_type'] == 'date' - assert schema_columns['int']['data_type'] == 'bigint' - assert schema_columns['float']['data_type'] == 'double' - assert schema_columns['decimal']['data_type'] == 'decimal' - assert schema_columns['time']['data_type'] == 'time' - assert schema_columns['binary']['data_type'] == 'binary' - assert schema_columns['string']['data_type'] == 'text' - assert schema_columns['json']['data_type'] == 'complex' + assert schema_columns["date"]["data_type"] == "date" + assert schema_columns["int"]["data_type"] == "bigint" + assert schema_columns["float"]["data_type"] == "double" + assert schema_columns["decimal"]["data_type"] == "decimal" + assert schema_columns["time"]["data_type"] == "time" + assert schema_columns["binary"]["data_type"] == "binary" + assert schema_columns["string"]["data_type"] == "text" + assert schema_columns["json"]["data_type"] == "complex" @pytest.mark.parametrize( - ("item_type", "is_list"), [("pandas", False), ("table", False), ("record_batch", False), ("pandas", True), ("table", True), ("record_batch", True)] + ("item_type", "is_list"), + [ + ("pandas", False), + ("table", False), + ("record_batch", False), + ("pandas", True), + ("table", True), + ("record_batch", True), + ], ) def test_normalize_jsonl(item_type: TArrowFormat, is_list: bool): - os.environ['DUMMY__LOADER_FILE_FORMAT'] = "jsonl" + os.environ["DUMMY__LOADER_FILE_FORMAT"] = "jsonl" item, records = arrow_table_all_data_types(item_type) @@ -106,7 +122,6 @@ def some_data(): else: yield item - pipeline.extract(some_data()) pipeline.normalize() @@ -114,17 +129,17 @@ def some_data(): storage = pipeline._get_load_storage() jobs = storage.list_new_jobs(load_id) job = [j for j in jobs if "some_data" in j][0] - with storage.storage.open_file(job, 'r') as f: + with storage.storage.open_file(job, "r") as f: result = [json.loads(line) for line in f] for row in result: - row['decimal'] = Decimal(row['decimal']) + row["decimal"] = Decimal(row["decimal"]) for record in records: - record['datetime'] = record['datetime'].replace(tzinfo=None) + record["datetime"] = record["datetime"].replace(tzinfo=None) expected = json.loads(json.dumps(records)) for record in expected: - record['decimal'] = Decimal(record['decimal']) + record["decimal"] = Decimal(record["decimal"]) assert result == expected @@ -137,7 +152,7 @@ def some_data(): yield item def map_func(item): - return item.filter(pa.compute.greater(item['int'], 80)) + return item.filter(pa.compute.greater(item["int"], 80)) # Add map that filters the table some_data.add_map(map_func) @@ -147,7 +162,7 @@ def map_func(item): result_tbl = result[0] assert len(result_tbl) < len(item) - assert pa.compute.all(pa.compute.greater(result_tbl['int'], 80)).as_py() + assert 
pa.compute.all(pa.compute.greater(result_tbl["int"], 80)).as_py() @pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) @@ -241,11 +256,11 @@ def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: @pytest.mark.parametrize("item_type", ["table"]) # , "pandas", "record_batch" def test_normalize_with_dlt_columns(item_type: TArrowFormat): item, records = arrow_table_all_data_types(item_type, num_rows=5432) - os.environ['NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID'] = "True" - os.environ['NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_ID'] = "True" + os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID"] = "True" + os.environ["NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_ID"] = "True" # Test with buffer smaller than the number of batches to be written - os.environ['DATA_WRITER__BUFFER_MAX_ITEMS'] = "100" - os.environ['DATA_WRITER__ROW_GROUP_SIZE'] = "100" + os.environ["DATA_WRITER__BUFFER_MAX_ITEMS"] = "100" + os.environ["DATA_WRITER__ROW_GROUP_SIZE"] = "100" @dlt.resource def some_data(): @@ -260,17 +275,17 @@ def some_data(): storage = pipeline._get_load_storage() jobs = storage.list_new_jobs(load_id) job = [j for j in jobs if "some_data" in j][0] - with storage.storage.open_file(job, 'rb') as f: + with storage.storage.open_file(job, "rb") as f: tbl = pa.parquet.read_table(f) assert len(tbl) == 5432 # Test one column matches source data - assert tbl['string'].to_pylist() == [r['string'] for r in records] + assert tbl["string"].to_pylist() == [r["string"] for r in records] - assert pa.compute.all(pa.compute.equal(tbl['_dlt_load_id'], load_id)).as_py() + assert pa.compute.all(pa.compute.equal(tbl["_dlt_load_id"], load_id)).as_py() - all_ids = tbl['_dlt_id'].to_pylist() + all_ids = tbl["_dlt_id"].to_pylist() assert len(all_ids[0]) >= 14 # All ids are unique @@ -278,8 +293,8 @@ def some_data(): # _dlt_id and _dlt_load_id are added to pipeline schema schema = pipeline.default_schema - assert schema.tables['some_data']['columns']['_dlt_id']['data_type'] == 'text' - assert schema.tables['some_data']['columns']['_dlt_load_id']['data_type'] == 'text' + assert schema.tables["some_data"]["columns"]["_dlt_id"]["data_type"] == "text" + assert schema.tables["some_data"]["columns"]["_dlt_load_id"]["data_type"] == "text" pipeline.load().raise_on_failed_jobs() diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 71f6b5b813..faf4d8712a 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -28,20 +28,36 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: # store dlt data in test storage (like patch_home_dir) with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): # save database outside of pipeline dir - with custom_environ({"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"}): + with custom_environ( + {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} + ): # create virtual env with (0.3.0) before the current schema upgrade with Venv.create(tempfile.mkdtemp(), ["dlt[duckdb]==0.3.0"]) as venv: # NOTE: we force a newer duckdb into the 0.3.0 dlt version to get compatible duckdb storage venv._install_deps(venv.context, ["duckdb" + "==" + pkg_version("duckdb")]) # load 20 issues - print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_pipeline.py", "20")) + print( + venv.run_script( + "../tests/pipeline/cases/github_pipeline/github_pipeline.py", "20" + ) + ) # load schema and check _dlt_loads definition - github_schema: TStoredSchema = 
json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) + github_schema: TStoredSchema = json.loads( + test_storage.load( + f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json" + ) + ) # print(github_schema["tables"][LOADS_TABLE_NAME]) assert github_schema["engine_version"] == 5 - assert "schema_version_hash" not in github_schema["tables"][LOADS_TABLE_NAME]["columns"] + assert ( + "schema_version_hash" + not in github_schema["tables"][LOADS_TABLE_NAME]["columns"] + ) # check loads table without attaching to pipeline - duckdb_cfg = resolve_configuration(DuckDbClientConfiguration(dataset_name=GITHUB_DATASET), sections=("destination", "duckdb")) + duckdb_cfg = resolve_configuration( + DuckDbClientConfiguration(dataset_name=GITHUB_DATASET), + sections=("destination", "duckdb"), + ) with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: rows = client.execute_sql(f"SELECT * FROM {LOADS_TABLE_NAME}") # make sure we have just 4 columns @@ -54,11 +70,17 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: # load all issues print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_pipeline.py")) # hash hash in schema - github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) + github_schema = json.loads( + test_storage.load( + f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json" + ) + ) assert github_schema["engine_version"] == 8 assert "schema_version_hash" in github_schema["tables"][LOADS_TABLE_NAME]["columns"] with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: - rows = client.execute_sql(f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at") + rows = client.execute_sql( + f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at" + ) # we have two loads assert len(rows) == 2 assert len(rows[0]) == 5 @@ -94,24 +116,41 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: # store dlt data in test storage (like patch_home_dir) with custom_environ({"DLT_DATA_DIR": get_dlt_data_dir()}): # save database outside of pipeline dir - with custom_environ({"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"}): + with custom_environ( + {"DESTINATION__DUCKDB__CREDENTIALS": "duckdb:///test_github_3.duckdb"} + ): # create virtual env with (0.3.0) before the current schema upgrade with Venv.create(tempfile.mkdtemp(), ["dlt[duckdb]==0.3.0"]) as venv: venv._install_deps(venv.context, ["duckdb" + "==" + pkg_version("duckdb")]) # extract and normalize on old version but DO NOT LOAD - print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_extract.py", "70")) + print( + venv.run_script( + "../tests/pipeline/cases/github_pipeline/github_extract.py", "70" + ) + ) # switch to current version and make sure the load package loads and schema migrates venv = Venv.restore_current() print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_load.py")) - duckdb_cfg = resolve_configuration(DuckDbClientConfiguration(dataset_name=GITHUB_DATASET), sections=("destination", "duckdb")) + duckdb_cfg = resolve_configuration( + DuckDbClientConfiguration(dataset_name=GITHUB_DATASET), + sections=("destination", "duckdb"), + ) with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: rows = client.execute_sql("SELECT * FROM issues") assert len(rows) == 70 - github_schema = 
json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) + github_schema = json.loads( + test_storage.load( + f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json" + ) + ) # attach to existing pipeline pipeline = dlt.attach(GITHUB_PIPELINE_NAME, credentials=duckdb_cfg.credentials) # get the schema from schema storage before we sync - github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) + github_schema = json.loads( + test_storage.load( + f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json" + ) + ) pipeline = pipeline.drop() pipeline.sync_destination() assert pipeline.default_schema.ENGINE_VERSION == 8 diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index af21eb9f81..6fb3cd775d 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -14,7 +14,12 @@ from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import WithStateSync -from dlt.common.exceptions import DestinationHasFailedJobs, DestinationTerminalException, PipelineStateNotAvailable, UnknownDestinationModule +from dlt.common.exceptions import ( + DestinationHasFailedJobs, + DestinationTerminalException, + PipelineStateNotAvailable, + UnknownDestinationModule, +) from dlt.common.pipeline import PipelineContext from dlt.common.runtime.collector import LogCollector from dlt.common.schema.utils import new_column, new_table @@ -188,23 +193,45 @@ def test_deterministic_salt(environment) -> None: def test_destination_explicit_credentials(environment: Any) -> None: # test redshift - p = dlt.pipeline(pipeline_name="postgres_pipeline", destination="redshift", credentials="redshift://loader:loader@localhost:5432/dlt_data") + p = dlt.pipeline( + pipeline_name="postgres_pipeline", + destination="redshift", + credentials="redshift://loader:loader@localhost:5432/dlt_data", + ) config = p._get_destination_client_initial_config() assert config.credentials.is_resolved() # with staging - p = dlt.pipeline(pipeline_name="postgres_pipeline", staging="filesystem", destination="redshift", credentials="redshift://loader:loader@localhost:5432/dlt_data") + p = dlt.pipeline( + pipeline_name="postgres_pipeline", + staging="filesystem", + destination="redshift", + credentials="redshift://loader:loader@localhost:5432/dlt_data", + ) config = p._get_destination_client_initial_config(p.destination) assert config.credentials.is_resolved() config = p._get_destination_client_initial_config(p.staging, as_staging=True) assert config.credentials is None p._wipe_working_folder() # try filesystem which uses union of credentials that requires bucket_url to resolve - p = dlt.pipeline(pipeline_name="postgres_pipeline", destination="filesystem", credentials={"aws_access_key_id": "key_id", "aws_secret_access_key": "key"}) + p = dlt.pipeline( + pipeline_name="postgres_pipeline", + destination="filesystem", + credentials={"aws_access_key_id": "key_id", "aws_secret_access_key": "key"}, + ) config = p._get_destination_client_initial_config(p.destination) assert isinstance(config.credentials, AwsCredentials) assert config.credentials.is_resolved() # resolve gcp oauth - p = dlt.pipeline(pipeline_name="postgres_pipeline", destination="filesystem", credentials={"project_id": "pxid", "refresh_token": "123token", "client_id": "cid", "client_secret": "s"}) + p = dlt.pipeline( 
+ pipeline_name="postgres_pipeline", + destination="filesystem", + credentials={ + "project_id": "pxid", + "refresh_token": "123token", + "client_id": "cid", + "client_secret": "s", + }, + ) config = p._get_destination_client_initial_config(p.destination) assert isinstance(config.credentials, GcpOAuthCredentials) assert config.credentials.is_resolved() @@ -215,7 +242,7 @@ def test_destination_staging_config(environment: Any) -> None: p = dlt.pipeline( pipeline_name="staging_pipeline", destination=redshift(credentials="redshift://loader:loader@localhost:5432/dlt_data"), - staging=fs_dest + staging=fs_dest, ) schema = Schema("foo") p._inject_schema(schema) @@ -241,7 +268,7 @@ def test_destination_factory_defaults_resolve_from_config(environment: Any) -> N def test_destination_credentials_in_factory(environment: Any) -> None: - os.environ['DESTINATION__REDSHIFT__CREDENTIALS'] = "redshift://abc:123@localhost:5432/some_db" + os.environ["DESTINATION__REDSHIFT__CREDENTIALS"] = "redshift://abc:123@localhost:5432/some_db" redshift_dest = redshift("redshift://abc:123@localhost:5432/other_db") @@ -263,13 +290,14 @@ def test_destination_credentials_in_factory(environment: Any) -> None: @pytest.mark.skip(reason="does not work on CI. probably takes right credentials from somewhere....") def test_destination_explicit_invalid_credentials_filesystem(environment: Any) -> None: # if string cannot be parsed - p = dlt.pipeline(pipeline_name="postgres_pipeline", destination="filesystem", credentials="PR8BLEM") + p = dlt.pipeline( + pipeline_name="postgres_pipeline", destination="filesystem", credentials="PR8BLEM" + ) with pytest.raises(NativeValueError): p._get_destination_client_initial_config(p.destination) def test_extract_source_twice() -> None: - def some_data(): yield [1, 2, 3] yield [1, 2, 3] @@ -305,8 +333,16 @@ def some_data(): def test_extract_multiple_sources() -> None: - s1 = DltSource(dlt.Schema("default"), "module", [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")]) - s2 = DltSource(dlt.Schema("default_2"),"module", [dlt.resource([6, 7, 8], name="resource_3"), dlt.resource([9, 10, 0], name="resource_4")]) + s1 = DltSource( + dlt.Schema("default"), + "module", + [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")], + ) + s2 = DltSource( + dlt.Schema("default_2"), + "module", + [dlt.resource([6, 7, 8], name="resource_3"), dlt.resource([9, 10, 0], name="resource_4")], + ) p = dlt.pipeline(destination="dummy") p.config.restore_from_destination = False @@ -325,12 +361,18 @@ def test_extract_multiple_sources() -> None: def i_fail(): raise NotImplementedError() - s3 = DltSource(dlt.Schema("default_3"), "module", [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")]) - s4 = DltSource(dlt.Schema("default_4"), "module", [dlt.resource([6, 7, 8], name="resource_3"), i_fail]) + s3 = DltSource( + dlt.Schema("default_3"), + "module", + [dlt.resource([1, 2, 3], name="resource_1"), dlt.resource([3, 4, 5], name="resource_2")], + ) + s4 = DltSource( + dlt.Schema("default_4"), "module", [dlt.resource([6, 7, 8], name="resource_3"), i_fail] + ) with pytest.raises(PipelineStepFailed): - # NOTE: if you swap s3 and s4 the test on list_schemas will fail: s3 will extract normally and update live schemas, s4 will break exec later - p.extract([s4, s3]) + # NOTE: if you swap s3 and s4 the test on list_schemas will fail: s3 will extract normally and update live schemas, s4 will break exec later + p.extract([s4, s3]) # 
nothing to normalize assert len(storage.list_files_to_normalize_sorted()) == 0 @@ -430,7 +472,7 @@ def test_sentry_tracing() -> None: def r_check_sentry(): assert sentry_sdk.Hub.current.scope.span.op == "extract" assert sentry_sdk.Hub.current.scope.span.containing_transaction.name == "run" - yield [1,2,3] + yield [1, 2, 3] p.run(r_check_sentry) assert sentry_sdk.Hub.current.scope.span is None @@ -455,12 +497,10 @@ def r_fail(): assert sentry_sdk.Hub.current.scope.span is None - def test_pipeline_state_on_extract_exception() -> None: pipeline_name = "pipe_" + uniq_id() p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy") - @dlt.resource def data_piece_1(): yield [1, 2, 3] @@ -587,7 +627,6 @@ def test_run_load_pending() -> None: pipeline_name = "pipe_" + uniq_id() p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy") - def some_data(): yield from [1, 2, 3] @@ -632,7 +671,11 @@ def fail_extract(): attempt = None - for attempt in Retrying(stop=stop_after_attempt(3), retry=retry_if_exception(retry_load(("load", "extract"))), reraise=True): + for attempt in Retrying( + stop=stop_after_attempt(3), + retry=retry_if_exception(retry_load(("load", "extract"))), + reraise=True, + ): with attempt: p.run(fail_extract()) # it retried @@ -641,7 +684,9 @@ def fail_extract(): # now it fails (extract is terminal exception) retry_count = 2 with pytest.raises(PipelineStepFailed) as py_ex: - for attempt in Retrying(stop=stop_after_attempt(3), retry=retry_if_exception(retry_load(())), reraise=True): + for attempt in Retrying( + stop=stop_after_attempt(3), retry=retry_if_exception(retry_load(())), reraise=True + ): with attempt: p.run(fail_extract()) assert isinstance(py_ex.value, PipelineStepFailed) @@ -651,7 +696,11 @@ def fail_extract(): os.environ["RAISE_ON_FAILED_JOBS"] = "true" os.environ["FAIL_PROB"] = "1.0" with pytest.raises(PipelineStepFailed) as py_ex: - for attempt in Retrying(stop=stop_after_attempt(3), retry=retry_if_exception(retry_load(("load", "extract"))), reraise=True): + for attempt in Retrying( + stop=stop_after_attempt(3), + retry=retry_if_exception(retry_load(("load", "extract"))), + reraise=True, + ): with attempt: p.run(fail_extract()) assert isinstance(py_ex.value, PipelineStepFailed) @@ -681,6 +730,7 @@ def test_set_get_local_value() -> None: assert p.state["_local"][value] == value # type: ignore[literal-required] new_val = uniq_id() + # check in context manager @dlt.resource def _w_local_state(): @@ -709,30 +759,38 @@ def resource_1(): p.run(resource_1, write_disposition="replace") print(list(p._schema_storage.live_schemas.values())[0].to_pretty_yaml()) - assert p.schemas[p.default_schema_name].get_table("resource_1")["write_disposition"] == "replace" + assert ( + p.schemas[p.default_schema_name].get_table("resource_1")["write_disposition"] == "replace" + ) assert p.default_schema.get_table("resource_1")["write_disposition"] == "replace" -@dlt.transformer(name="github_repo_events", primary_key="id", write_disposition="merge", table_name=lambda i: i['type']) +@dlt.transformer( + name="github_repo_events", + primary_key="id", + write_disposition="merge", + table_name=lambda i: i["type"], +) def github_repo_events(page): yield page @dlt.transformer(name="github_repo_events", primary_key="id", write_disposition="merge") def github_repo_events_table_meta(page): - yield from [dlt.mark.with_table_name(p, p['type']) for p in page] + yield from [dlt.mark.with_table_name(p, p["type"]) for p in page] @dlt.resource def _get_shuffled_events(): - with 
open("tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8") as f: + with open( + "tests/normalize/cases/github.events.load_page_1_duck.json", "r", encoding="utf-8" + ) as f: issues = json.load(f) yield issues -@pytest.mark.parametrize('github_resource', (github_repo_events_table_meta, github_repo_events)) +@pytest.mark.parametrize("github_resource", (github_repo_events_table_meta, github_repo_events)) def test_dispatch_rows_to_tables(github_resource: DltResource): - os.environ["COMPLETED_PROB"] = "1.0" pipeline_name = "pipe_" + uniq_id() p = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy") @@ -742,51 +800,55 @@ def test_dispatch_rows_to_tables(github_resource: DltResource): # get all expected tables events = list(_get_shuffled_events) - expected_tables = set(map(lambda e: p.default_schema.naming.normalize_identifier(e["type"]), events)) + expected_tables = set( + map(lambda e: p.default_schema.naming.normalize_identifier(e["type"]), events) + ) # all the tables present - assert expected_tables.intersection([t["name"] for t in p.default_schema.data_tables()]) == expected_tables + assert ( + expected_tables.intersection([t["name"] for t in p.default_schema.data_tables()]) + == expected_tables + ) # all the columns have primary keys and merge disposition derived from resource - for table in p.default_schema.data_tables(): + for table in p.default_schema.data_tables(): if table.get("parent") is None: assert table["write_disposition"] == "merge" assert table["columns"]["id"]["primary_key"] is True def test_resource_name_in_schema() -> None: - @dlt.resource(table_name='some_table') + @dlt.resource(table_name="some_table") def static_data(): - yield {'a': 1, 'b': 2} + yield {"a": 1, "b": 2} - @dlt.resource(table_name=lambda x: 'dynamic_func_table') + @dlt.resource(table_name=lambda x: "dynamic_func_table") def dynamic_func_data(): - yield {'a': 1, 'b': 2} + yield {"a": 1, "b": 2} @dlt.resource def dynamic_mark_data(): - yield dlt.mark.with_table_name({'a': 1, 'b': 2}, 'dynamic_mark_table') + yield dlt.mark.with_table_name({"a": 1, "b": 2}, "dynamic_mark_table") - @dlt.resource(table_name='parent_table') + @dlt.resource(table_name="parent_table") def nested_data(): - yield {'a': 1, 'items': [{'c': 2}, {'c': 3}, {'c': 4}]} + yield {"a": 1, "items": [{"c": 2}, {"c": 3}, {"c": 4}]} @dlt.source def some_source(): return [static_data(), dynamic_func_data(), dynamic_mark_data(), nested_data()] - source = some_source() - p = dlt.pipeline(pipeline_name=uniq_id(), destination='dummy') + p = dlt.pipeline(pipeline_name=uniq_id(), destination="dummy") p.run(source) schema = p.default_schema - assert schema.tables['some_table']['resource'] == 'static_data' - assert schema.tables['dynamic_func_table']['resource'] == 'dynamic_func_data' - assert schema.tables['dynamic_mark_table']['resource'] == 'dynamic_mark_data' - assert schema.tables['parent_table']['resource'] == 'nested_data' - assert 'resource' not in schema.tables['parent_table__items'] + assert schema.tables["some_table"]["resource"] == "static_data" + assert schema.tables["dynamic_func_table"]["resource"] == "dynamic_func_data" + assert schema.tables["dynamic_mark_table"]["resource"] == "dynamic_mark_data" + assert schema.tables["parent_table"]["resource"] == "nested_data" + assert "resource" not in schema.tables["parent_table__items"] def test_preserve_fields_order() -> None: @@ -810,16 +872,29 @@ def reverse_order(item): p.extract(ordered_dict().add_map(reverse_order)) p.normalize() - assert 
list(p.default_schema.tables["order_1"]["columns"].keys()) == ["col_1", "col_2", "col_3", '_dlt_load_id', '_dlt_id'] - assert list(p.default_schema.tables["order_2"]["columns"].keys()) == ["col_3", "col_2", "col_1", '_dlt_load_id', '_dlt_id'] + assert list(p.default_schema.tables["order_1"]["columns"].keys()) == [ + "col_1", + "col_2", + "col_3", + "_dlt_load_id", + "_dlt_id", + ] + assert list(p.default_schema.tables["order_2"]["columns"].keys()) == [ + "col_3", + "col_2", + "col_1", + "_dlt_load_id", + "_dlt_id", + ] def test_pipeline_log_progress() -> None: - os.environ["TIMEOUT"] = "3.0" # will attach dlt logger - p = dlt.pipeline(destination="dummy", progress=dlt.progress.log(0.5, logger=None, log_level=logging.WARNING)) + p = dlt.pipeline( + destination="dummy", progress=dlt.progress.log(0.5, logger=None, log_level=logging.WARNING) + ) # collector was created before pipeline so logger is not attached assert cast(LogCollector, p.collector).logger is None p.extract(many_delayed(2, 10)) @@ -833,7 +908,6 @@ def test_pipeline_log_progress() -> None: def test_pipeline_source_state_activation() -> None: - appendix_yielded = None @dlt.source @@ -852,7 +926,7 @@ def appendix(): def writes_state(): dlt.current.source_state()["appendix"] = source_st dlt.current.resource_state()["RX"] = resource_st - yield from [1,2,3] + yield from [1, 2, 3] yield writes_state @@ -863,8 +937,11 @@ def writes_state(): assert s_appendix.state == {} # create state by running extract p_appendix.extract(s_appendix) - assert s_appendix.state == {'appendix': 'appendix', 'resources': {'writes_state': {'RX': 'r_appendix'}}} - assert s_appendix.writes_state.state == {'RX': 'r_appendix'} + assert s_appendix.state == { + "appendix": "appendix", + "resources": {"writes_state": {"RX": "r_appendix"}}, + } + assert s_appendix.writes_state.state == {"RX": "r_appendix"} # change the active pipeline p_postfix = dlt.pipeline(pipeline_name="postfix_p") @@ -872,7 +949,7 @@ def writes_state(): assert s_appendix.state == {} # and back p_appendix.activate() - assert s_appendix.writes_state.state == {'RX': 'r_appendix'} + assert s_appendix.writes_state.state == {"RX": "r_appendix"} # create another source s_w_appendix = reads_state("appendix", "r_appendix") @@ -902,7 +979,10 @@ def test_extract_add_tables() -> None: assert s.resources["🦚Peacock"].compute_table_schema()["resource"] == "🦚Peacock" # only name will be normalized assert s.resources["🦚Peacock"].compute_table_schema()["name"] == "🦚Peacock" - assert s.resources["💰Budget"].compute_table_schema()["columns"]["🔑book_id"]["name"] == "🔑book_id" + assert ( + s.resources["💰Budget"].compute_table_schema()["columns"]["🔑book_id"]["name"] + == "🔑book_id" + ) pipeline = dlt.pipeline(pipeline_name="emojis", destination="dummy") info = pipeline.extract(s) assert info.extract_data_info[0]["name"] == "airtable_emojis" @@ -949,7 +1029,12 @@ def test_emojis_resource_names() -> None: table = info.load_packages[0].schema_update["_schedule"] assert table["resource"] == "📆 Schedule" # only schedule is added - assert set(info.load_packages[0].schema_update.keys()) == {"_dlt_version", "_dlt_loads", "_schedule", "_dlt_pipeline_state"} + assert set(info.load_packages[0].schema_update.keys()) == { + "_dlt_version", + "_dlt_loads", + "_schedule", + "_dlt_pipeline_state", + } info = pipeline.run(airtable_emojis()) assert_load_info(info) # here we add _peacock with has primary_key (so at least single column) @@ -973,12 +1058,13 @@ def test_apply_hints_infer_hints() -> None: @dlt.source def infer(): - yield 
dlt.resource([{"id": 1, "timestamp": "NOW"}], name="table1", columns=[new_column("timestamp", nullable=True)]) + yield dlt.resource( + [{"id": 1, "timestamp": "NOW"}], + name="table1", + columns=[new_column("timestamp", nullable=True)], + ) - new_new_hints = { - "not_null": ["timestamp"], - "primary_key": ["id"] - } + new_new_hints = {"not_null": ["timestamp"], "primary_key": ["id"]} s = infer() s.schema.merge_hints(new_new_hints) # type: ignore[arg-type] pipeline = dlt.pipeline(pipeline_name="inf", destination="dummy") @@ -986,18 +1072,41 @@ def infer(): # check schema table = pipeline.default_schema.get_table("table1") # nullable True coming from hint overrides inferred hint - assert table["columns"]["timestamp"] == {"name": "timestamp", "data_type": "text", "nullable": True} + assert table["columns"]["timestamp"] == { + "name": "timestamp", + "data_type": "text", + "nullable": True, + } # fully from data - assert table["columns"]["id"] == {"name": "id", "data_type": "bigint", "nullable": True, "primary_key": True} + assert table["columns"]["id"] == { + "name": "id", + "data_type": "bigint", + "nullable": True, + "primary_key": True, + } # remove primary key and change nullable s = infer() - s.table1.apply_hints(columns=[{"name": "timestamp", "nullable": False}, {"name": "id", "nullable": False, "primary_key": False}]) + s.table1.apply_hints( + columns=[ + {"name": "timestamp", "nullable": False}, + {"name": "id", "nullable": False, "primary_key": False}, + ] + ) pipeline.run(s) table = pipeline.default_schema.get_table("table1") # hints overwrite pipeline schema - assert table["columns"]["timestamp"] == {"name": "timestamp", "data_type": "text", "nullable": False} - assert table["columns"]["id"] == {"name": "id", "data_type": "bigint", "nullable": False, "primary_key": False} + assert table["columns"]["timestamp"] == { + "name": "timestamp", + "data_type": "text", + "nullable": False, + } + assert table["columns"]["id"] == { + "name": "id", + "data_type": "bigint", + "nullable": False, + "primary_key": False, + } # print(pipeline.default_schema.to_pretty_yaml()) @@ -1049,13 +1158,13 @@ def test_resource_rename_same_table(): @dlt.resource(write_disposition="replace") def generic(start): dlt.current.resource_state()["start"] = start - yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] + yield [{"id": idx, "text": "A" * idx} for idx in range(start, start + 10)] - pipeline = dlt.pipeline(destination='duckdb') - load_info = pipeline.run([ - generic(10).with_name("state1"), - generic(20).with_name("state2") - ], table_name="single_table") + pipeline = dlt.pipeline(destination="duckdb") + load_info = pipeline.run( + [generic(10).with_name("state1"), generic(20).with_name("state2")], + table_name="single_table", + ) assert_load_info(load_info) # both resources loaded assert pipeline.last_trace.last_normalize_info.row_counts["single_table"] == 20 @@ -1072,9 +1181,12 @@ def generic(start): assert pipeline.default_schema.get_table("single_table")["resource"] == "state2" # now load only state1 - load_info = pipeline.run([ - generic(5).with_name("state1"), - ], table_name="single_table") + load_info = pipeline.run( + [ + generic(5).with_name("state1"), + ], + table_name="single_table", + ) assert_load_info(load_info) # both resources loaded assert pipeline.last_trace.last_normalize_info.row_counts["single_table"] == 10 @@ -1090,20 +1202,25 @@ def test_remove_autodetect() -> None: def autodetect(): # add unix ts autodetection to current source schema 
dlt.current.source_schema().add_type_detection("timestamp") - return dlt.resource([int(now.timestamp()), int(now.timestamp() + 1), int(now.timestamp() + 2)], name="numbers") + return dlt.resource( + [int(now.timestamp()), int(now.timestamp() + 1), int(now.timestamp() + 2)], + name="numbers", + ) - pipeline = dlt.pipeline(destination='duckdb') + pipeline = dlt.pipeline(destination="duckdb") pipeline.run(autodetect()) # unix ts recognized - assert pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"] == "timestamp" + assert ( + pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"] == "timestamp" + ) pipeline = pipeline.drop() source = autodetect() source.schema.remove_type_detection("timestamp") - pipeline = dlt.pipeline(destination='duckdb') + pipeline = dlt.pipeline(destination="duckdb") pipeline.run(source) assert pipeline.default_schema.get_table("numbers")["columns"]["value"]["data_type"] == "bigint" @@ -1118,7 +1235,10 @@ def flattened_dict(): # dlt.current.source_schema().add_type_detection("timestamp") for delta in range(4): - yield {"delta": delta, "values": [{"Value": {"timestampValue": now.timestamp() + delta}}]} + yield { + "delta": delta, + "values": [{"Value": {"timestampValue": now.timestamp() + delta}}], + } @dlt.source def nested_resource(): @@ -1129,16 +1249,21 @@ def nested_resource(): values_table = new_table( dict_resource.name + "__values", parent_table_name=dict_resource.name, - columns=[{"name": "value__timestamp_value", "data_type": "timestamp"}] + columns=[{"name": "value__timestamp_value", "data_type": "timestamp"}], ) # and child table dlt.current.source_schema().update_table(values_table) return dict_resource - pipeline = dlt.pipeline(destination='duckdb') + pipeline = dlt.pipeline(destination="duckdb") pipeline.run(nested_resource()) # print(pipeline.default_schema.to_pretty_yaml()) - assert pipeline.default_schema.get_table("flattened_dict__values")["columns"]["value__timestamp_value"]["data_type"] == "timestamp" + assert ( + pipeline.default_schema.get_table("flattened_dict__values")["columns"][ + "value__timestamp_value" + ]["data_type"] + == "timestamp" + ) # make sure data is there assert pipeline.last_trace.last_normalize_info.row_counts["flattened_dict__values"] == 4 @@ -1147,7 +1272,7 @@ def test_empty_rows_are_included() -> None: """Empty rows where all values are `None` or empty dicts create rows in the dataset with `NULL` in all columns """ - pipeline = dlt.pipeline(destination='duckdb') + pipeline = dlt.pipeline(destination="duckdb") pipeline.run(iter([{}, {}, {}]), table_name="empty_rows") pipeline.run(iter([{"a": 1}, {}, {}]), table_name="empty_rows") @@ -1170,6 +1295,7 @@ def test_resource_state_name_not_normalized() -> None: # get state from destination from dlt.pipeline.state_sync import load_state_from_destination + client: WithStateSync with pipeline.destination_client() as client: # type: ignore[assignment] state = load_state_from_destination(pipeline.pipeline_name, client) diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py index d29bac13f2..9f49360f7e 100644 --- a/tests/pipeline/test_pipeline_extra.py +++ b/tests/pipeline/test_pipeline_extra.py @@ -8,7 +8,12 @@ from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.capabilities import TLoaderFileFormat from dlt.common.libs.pydantic import DltConfig -from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector +from 
dlt.common.runtime.collector import ( + AliveCollector, + EnlightenCollector, + LogCollector, + TqdmCollector, +) from dlt.extract.storage import ExtractorStorage from dlt.extract.validation import PydanticValidator @@ -19,25 +24,36 @@ from tests.pipeline.utils import assert_load_info, load_data_table_counts, many_delayed -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +@pytest.mark.parametrize( + "destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name +) def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: # create pipelines, extract and normalize. that should be possible without installing any dependencies - p = dlt.pipeline(pipeline_name=destination_config.destination + "_pipeline", destination=destination_config.destination, staging=destination_config.staging) + p = dlt.pipeline( + pipeline_name=destination_config.destination + "_pipeline", + destination=destination_config.destination, + staging=destination_config.staging, + ) # are capabilities injected caps = p._container[DestinationCapabilitiesContext] print(caps.naming_convention) # are right naming conventions created - assert p._default_naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + assert p._default_naming.max_length == min( + caps.max_column_identifier_length, caps.max_identifier_length + ) p.extract([1, "2", 3], table_name="data") # is default schema with right naming convention - assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + assert p.default_schema.naming.max_length == min( + caps.max_column_identifier_length, caps.max_identifier_length + ) p.normalize() - assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + assert p.default_schema.naming.max_length == min( + caps.max_column_identifier_length, caps.max_identifier_length + ) @pytest.mark.parametrize("progress", ["tqdm", "enlighten", "log", "alive_progress"]) def test_pipeline_progress(progress: TCollectorArg) -> None: - os.environ["TIMEOUT"] = "3.0" p = dlt.pipeline(destination="dummy", progress=progress) @@ -64,10 +80,10 @@ def test_pipeline_progress(progress: TCollectorArg) -> None: assert isinstance(collector, LogCollector) -@pytest.mark.parametrize('method', ('extract', 'run')) +@pytest.mark.parametrize("method", ("extract", "run")) def test_column_argument_pydantic(method: str) -> None: """Test columns schema is created from pydantic model""" - p = dlt.pipeline(destination='duckdb') + p = dlt.pipeline(destination="duckdb") @dlt.resource def some_data() -> Iterator[Dict[str, Any]]: @@ -77,15 +93,15 @@ class Columns(BaseModel): a: Optional[int] = None b: Optional[str] = None - if method == 'run': + if method == "run": p.run(some_data(), columns=Columns) else: p.extract(some_data(), columns=Columns) - assert p.default_schema.tables['some_data']['columns']['a']['data_type'] == 'bigint' - assert p.default_schema.tables['some_data']['columns']['a']['nullable'] is True - assert p.default_schema.tables['some_data']['columns']['b']['data_type'] == 'text' - assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True + assert p.default_schema.tables["some_data"]["columns"]["a"]["data_type"] == "bigint" + assert p.default_schema.tables["some_data"]["columns"]["a"]["nullable"] is True + assert 
p.default_schema.tables["some_data"]["columns"]["b"]["data_type"] == "text" + assert p.default_schema.tables["some_data"]["columns"]["b"]["nullable"] is True @pytest.mark.parametrize("yield_list", [True, False]) @@ -111,7 +127,7 @@ class User(BaseModel): created_at=pendulum.now(), labels=["l1", "l2"], user_label=UserLabel(label="in_l1"), - user_labels=[UserLabel(label="l_l1"), UserLabel(label="l_l1")] + user_labels=[UserLabel(label="l_l1"), UserLabel(label="l_l1")], ) @dlt.resource(columns=User) @@ -121,12 +137,16 @@ def users(users_list: List[Any]) -> Iterator[Any]: else: yield from users_list - pipeline = dlt.pipeline(destination='duckdb') + pipeline = dlt.pipeline(destination="duckdb") info = pipeline.run(users([user.dict(), user.dict()])) assert_load_info(info) print(pipeline.last_trace.last_normalize_info) # data is passing validation, all filled in - assert load_data_table_counts(pipeline) == {"users": 2, "users__labels": 4, "users__user_labels": 4} + assert load_data_table_counts(pipeline) == { + "users": 2, + "users__labels": 4, + "users__user_labels": 4, + } # produce two users with extra attrs in the child model but set the rows to discard so nothing is loaded u1 = user.dict() @@ -141,12 +161,16 @@ def users(users_list: List[Any]) -> Iterator[Any]: assert validator.data_mode == "discard_row" assert validator.column_mode == "discard_row" pipeline.run(r) - assert load_data_table_counts(pipeline) == {"users": 2, "users__labels": 4, "users__user_labels": 4} + assert load_data_table_counts(pipeline) == { + "users": 2, + "users__labels": 4, + "users__user_labels": 4, + } print(pipeline.last_trace.last_normalize_info) def test_extract_pydantic_models() -> None: - pipeline = dlt.pipeline(destination='duckdb') + pipeline = dlt.pipeline(destination="duckdb") class User(BaseModel): user_id: int @@ -161,16 +185,18 @@ def users() -> Iterator[User]: storage = ExtractorStorage(pipeline._normalize_storage_config) expect_extracted_file( - storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) + storage, + pipeline.default_schema_name, + "users", + json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]), ) @pytest.mark.parametrize("file_format", ("parquet", "insert_values", "jsonl")) def test_columns_hint_with_file_formats(file_format: TLoaderFileFormat) -> None: - @dlt.resource(write_disposition="replace", columns=[{"name": "text", "data_type": "text"}]) def generic(start=8): - yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] + yield [{"id": idx, "text": "A" * idx} for idx in range(start, start + 10)] - pipeline = dlt.pipeline(destination='duckdb') + pipeline = dlt.pipeline(destination="duckdb") pipeline.run(generic(), loader_file_format=file_format) diff --git a/tests/pipeline/test_pipeline_file_format_resolver.py b/tests/pipeline/test_pipeline_file_format_resolver.py index 6602b5f876..49a38c455b 100644 --- a/tests/pipeline/test_pipeline_file_format_resolver.py +++ b/tests/pipeline/test_pipeline_file_format_resolver.py @@ -3,13 +3,20 @@ import dlt import pytest -from dlt.common.exceptions import DestinationIncompatibleLoaderFileFormatException, DestinationLoadingViaStagingNotSupported, DestinationNoStagingMode +from dlt.common.exceptions import ( + DestinationIncompatibleLoaderFileFormatException, + DestinationLoadingViaStagingNotSupported, + DestinationNoStagingMode, +) from dlt.common.destination.capabilities import DestinationCapabilitiesContext -def test_file_format_resolution() -> None: + 
+def test_file_format_resolution() -> None: # raise on destinations that does not support staging with pytest.raises(DestinationLoadingViaStagingNotSupported): - p = dlt.pipeline(pipeline_name="managed_state_pipeline", destination="postgres", staging="filesystem") + p = dlt.pipeline( + pipeline_name="managed_state_pipeline", destination="postgres", staging="filesystem" + ) # raise on staging that does not support staging interface with pytest.raises(DestinationNoStagingMode): @@ -20,7 +27,7 @@ def test_file_format_resolution() -> None: if TYPE_CHECKING: cp = DestinationCapabilitiesContext - class cp(): # type: ignore[no-redef] + class cp: # type: ignore[no-redef] def __init__(self) -> None: self.preferred_loader_file_format: str = None self.supported_loader_file_formats: List[str] = [] diff --git a/tests/pipeline/test_pipeline_state.py b/tests/pipeline/test_pipeline_state.py index 019997ef6e..574d69dce5 100644 --- a/tests/pipeline/test_pipeline_state.py +++ b/tests/pipeline/test_pipeline_state.py @@ -22,19 +22,24 @@ @dlt.resource() def some_data(): last_value = dlt.current.source_state().get("last_value", 0) - yield [1,2,3] + yield [1, 2, 3] dlt.current.source_state()["last_value"] = last_value + 1 @dlt.resource() def some_data_resource_state(): last_value = dlt.current.resource_state().get("last_value", 0) - yield [1,2,3] + yield [1, 2, 3] dlt.current.resource_state()["last_value"] = last_value + 1 def test_restore_state_props() -> None: - p = dlt.pipeline(pipeline_name="restore_state_props", destination="redshift", staging="filesystem", dataset_name="the_dataset") + p = dlt.pipeline( + pipeline_name="restore_state_props", + destination="redshift", + staging="filesystem", + dataset_name="the_dataset", + ) p.extract(some_data()) state = p.state assert state["dataset_name"] == "the_dataset" @@ -77,7 +82,9 @@ def some_source(): sources_state = p.state["sources"] # the source name is the source state key assert sources_state[s.name]["last_value"] == 1 - assert sources_state["managed_state"]["last_value"] == 2 # the state for standalone resource not affected + assert ( + sources_state["managed_state"]["last_value"] == 2 + ) # the state for standalone resource not affected @dlt.source def source_same_section(): @@ -114,7 +121,6 @@ def test_no_active_pipeline_required_for_resource() -> None: def test_active_pipeline_required_for_source() -> None: - @dlt.source def some_source(): dlt.current.source_state().get("last_value", 0) @@ -134,6 +140,7 @@ def some_source(): p.deactivate() list(s) + def test_source_state_iterator(): os.environ["COMPLETED_PROB"] = "1.0" pipeline_name = "pipe_" + uniq_id() @@ -146,7 +153,7 @@ def main(): # increase the multiplier each time state is obtained state["mark"] *= 2 yield [1, 2, 3] - assert dlt.current.source_state()["mark"] == mark*2 + assert dlt.current.source_state()["mark"] == mark * 2 @dlt.transformer(data_from=main) def feeding(item): @@ -154,7 +161,7 @@ def feeding(item): assert dlt.current.source_state()["mark"] > 1 print(f"feeding state {dlt.current.source_state()}") mark = dlt.current.source_state()["mark"] - yield from map(lambda i: i*mark, item) + yield from map(lambda i: i * mark, item) @dlt.source def pass_the_state(): @@ -188,6 +195,7 @@ def test_unmanaged_state() -> None: def _gen_inner(): dlt.state()["gen"] = True yield 1 + list(dlt.resource(_gen_inner)) list(dlt.resource(_gen_inner())) assert state_module._last_full_state["sources"]["unmanaged"]["gen"] is True @@ -236,7 +244,12 @@ def _gen_inner(): def test_resource_state_write() -> None: r = 
some_data_resource_state() assert list(r) == [1, 2, 3] - assert state_module._last_full_state["sources"]["test_pipeline_state"]["resources"]["some_data_resource_state"]["last_value"] == 1 + assert ( + state_module._last_full_state["sources"]["test_pipeline_state"]["resources"][ + "some_data_resource_state" + ]["last_value"] + == 1 + ) with pytest.raises(ResourceNameNotAvailable): get_current_pipe_name() @@ -247,7 +260,12 @@ def _gen_inner(): p = dlt.pipeline() r = dlt.resource(_gen_inner(), name="name_ovrd") assert list(r) == [1] - assert state_module._last_full_state["sources"][p._make_schema_with_default_name().name]["resources"]["name_ovrd"]["gen"] is True + assert ( + state_module._last_full_state["sources"][p._make_schema_with_default_name().name][ + "resources" + ]["name_ovrd"]["gen"] + is True + ) with pytest.raises(ResourceNameNotAvailable): get_current_pipe_name() @@ -267,20 +285,29 @@ def _gen_inner(tv="df"): r = dlt.resource(_gen_inner("gen_tf"), name="name_ovrd") p.extract(r) assert r.state["gen"] == "gen_tf" - assert state_module._last_full_state["sources"][p.default_schema_name]["resources"]["name_ovrd"]["gen"] == "gen_tf" + assert ( + state_module._last_full_state["sources"][p.default_schema_name]["resources"]["name_ovrd"][ + "gen" + ] + == "gen_tf" + ) with pytest.raises(ResourceNameNotAvailable): get_current_pipe_name() r = dlt.resource(_gen_inner, name="pure_function") p.extract(r) assert r.state["gen"] == "df" - assert state_module._last_full_state["sources"][p.default_schema_name]["resources"]["pure_function"]["gen"] == "df" + assert ( + state_module._last_full_state["sources"][p.default_schema_name]["resources"][ + "pure_function" + ]["gen"] + == "df" + ) with pytest.raises(ResourceNameNotAvailable): get_current_pipe_name() # get resource state in defer function def _gen_inner_defer(tv="df"): - @dlt.defer def _run(): dlt.current.resource_state()["gen"] = tv @@ -296,7 +323,6 @@ def _run(): # get resource state in defer explicitly def _gen_inner_defer_explicit_name(resource_name, tv="df"): - @dlt.defer def _run(): dlt.current.resource_state(resource_name)["gen"] = tv @@ -307,11 +333,15 @@ def _run(): r = dlt.resource(_gen_inner_defer_explicit_name, name="defer_function_explicit") p.extract(r("defer_function_explicit", "expl")) assert r.state["gen"] == "expl" - assert state_module._last_full_state["sources"][p.default_schema_name]["resources"]["defer_function_explicit"]["gen"] == "expl" + assert ( + state_module._last_full_state["sources"][p.default_schema_name]["resources"][ + "defer_function_explicit" + ]["gen"] + == "expl" + ) # get resource state in yielding defer (which btw is invalid and will be resolved in main thread) def _gen_inner_defer_yielding(tv="yielding"): - @dlt.defer def _run(): dlt.current.resource_state()["gen"] = tv @@ -322,11 +352,15 @@ def _run(): r = dlt.resource(_gen_inner_defer_yielding, name="defer_function_yielding") p.extract(r) assert r.state["gen"] == "yielding" - assert state_module._last_full_state["sources"][p.default_schema_name]["resources"]["defer_function_yielding"]["gen"] == "yielding" + assert ( + state_module._last_full_state["sources"][p.default_schema_name]["resources"][ + "defer_function_yielding" + ]["gen"] + == "yielding" + ) # get resource state in async function def _gen_inner_async(tv="async"): - async def _run(): dlt.current.resource_state()["gen"] = tv return 1 @@ -351,8 +385,18 @@ def _gen_inner(item): # p = dlt.pipeline() # p.extract(dlt.transformer(_gen_inner, data_from=r, name="tx_other_name")) assert 
list(dlt.transformer(_gen_inner, data_from=r, name="tx_other_name")) == [2, 4, 6] - assert state_module._last_full_state["sources"]["test_pipeline_state"]["resources"]["some_data_resource_state"]["last_value"] == 1 - assert state_module._last_full_state["sources"]["test_pipeline_state"]["resources"]["tx_other_name"]["gen"] is True + assert ( + state_module._last_full_state["sources"]["test_pipeline_state"]["resources"][ + "some_data_resource_state" + ]["last_value"] + == 1 + ) + assert ( + state_module._last_full_state["sources"]["test_pipeline_state"]["resources"][ + "tx_other_name" + ]["gen"] + is True + ) # returning transformer def _gen_inner_rv(item): @@ -360,8 +404,20 @@ def _gen_inner_rv(item): return item * 2 r = some_data_resource_state() - assert list(dlt.transformer(_gen_inner_rv, data_from=r, name="tx_other_name_rv")) == [1, 2, 3, 1, 2, 3] - assert state_module._last_full_state["sources"]["test_pipeline_state"]["resources"]["tx_other_name_rv"]["gen"] is True + assert list(dlt.transformer(_gen_inner_rv, data_from=r, name="tx_other_name_rv")) == [ + 1, + 2, + 3, + 1, + 2, + 3, + ] + assert ( + state_module._last_full_state["sources"]["test_pipeline_state"]["resources"][ + "tx_other_name_rv" + ]["gen"] + is True + ) # deferred transformer @dlt.defer @@ -390,8 +446,17 @@ async def _gen_inner_rv_async_name(item, r_name): return item r = some_data_resource_state() - assert list(dlt.transformer(_gen_inner_rv_async_name, data_from=r, name="tx_other_name_async")("tx_other_name_async")) == [1, 2, 3] - assert state_module._last_full_state["sources"]["test_pipeline_state"]["resources"]["tx_other_name_async"]["gen"] is True + assert list( + dlt.transformer(_gen_inner_rv_async_name, data_from=r, name="tx_other_name_async")( + "tx_other_name_async" + ) + ) == [1, 2, 3] + assert ( + state_module._last_full_state["sources"]["test_pipeline_state"]["resources"][ + "tx_other_name_async" + ]["gen"] + is True + ) def test_transform_function_state_write() -> None: @@ -400,29 +465,41 @@ def test_transform_function_state_write() -> None: # transform executed within the same thread def transform(item): dlt.current.resource_state()["form"] = item - return item*2 + return item * 2 r.add_map(transform) assert list(r) == [2, 4, 6] - assert state_module._last_full_state["sources"]["test_pipeline_state"]["resources"]["some_data_resource_state"]["form"] == 3 + assert ( + state_module._last_full_state["sources"]["test_pipeline_state"]["resources"][ + "some_data_resource_state" + ]["form"] + == 3 + ) def test_migrate_state(test_storage: FileStorage) -> None: state_v1 = load_json_case("state/state.v1") - state = migrate_state("test_pipeline", state_v1, state_v1["_state_engine_version"], STATE_ENGINE_VERSION) + state = migrate_state( + "test_pipeline", state_v1, state_v1["_state_engine_version"], STATE_ENGINE_VERSION + ) assert state["_state_engine_version"] == STATE_ENGINE_VERSION assert "_local" in state with pytest.raises(PipelineStateEngineNoUpgradePathException) as py_ex: state_v1 = load_json_case("state/state.v1") - migrate_state("test_pipeline", state_v1, state_v1["_state_engine_version"], STATE_ENGINE_VERSION + 1) + migrate_state( + "test_pipeline", state_v1, state_v1["_state_engine_version"], STATE_ENGINE_VERSION + 1 + ) assert py_ex.value.init_engine == state_v1["_state_engine_version"] assert py_ex.value.from_engine == STATE_ENGINE_VERSION assert py_ex.value.to_engine == STATE_ENGINE_VERSION + 1 # also test pipeline init where state is old test_storage.create_folder("debug_pipeline") - 
shutil.copy(json_case_path("state/state.v1"), test_storage.make_full_path(f"debug_pipeline/{Pipeline.STATE_FILE}")) + shutil.copy( + json_case_path("state/state.v1"), + test_storage.make_full_path(f"debug_pipeline/{Pipeline.STATE_FILE}"), + ) p = dlt.attach(pipeline_name="debug_pipeline", pipelines_dir=test_storage.storage_path) assert p.dataset_name == "debug_pipeline_data" assert p.default_schema_name == "example_source" diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 5644a32d2f..4259c591e7 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -20,7 +20,12 @@ from dlt.pipeline.exceptions import PipelineStepFailed from dlt.pipeline.pipeline import Pipeline -from dlt.pipeline.trace import PipelineTrace, SerializableResolvedValueTrace, describe_extract_data, load_trace +from dlt.pipeline.trace import ( + PipelineTrace, + SerializableResolvedValueTrace, + describe_extract_data, + load_trace, +) from dlt.pipeline.track import slack_notify_load_success from dlt.extract import DltResource, DltSource from dlt.extract.pipe import Pipe @@ -28,15 +33,14 @@ from tests.utils import start_test_telemetry from tests.common.configuration.utils import toml_providers, environment -def test_create_trace(toml_providers: ConfigProvidersContext) -> None: +def test_create_trace(toml_providers: ConfigProvidersContext) -> None: @dlt.source def inject_tomls( - api_type = dlt.config.value, - credentials: CredentialsConfiguration = dlt.secrets.value, - secret_value: TSecretValue = TSecretValue("123") # noqa: B008 + api_type=dlt.config.value, + credentials: CredentialsConfiguration = dlt.secrets.value, + secret_value: TSecretValue = TSecretValue("123"), # noqa: B008 ): - @dlt.resource def data(): yield [1, 2, 3] @@ -86,7 +90,6 @@ def data(): # extract with exception @dlt.source def async_exception(max_range=1): - async def get_val(v): await asyncio.sleep(0.1) if v % 3 == 0: @@ -95,7 +98,7 @@ async def get_val(v): @dlt.resource def data(): - yield from [get_val(v) for v in range(1,max_range)] + yield from [get_val(v) for v in range(1, max_range)] return data() @@ -122,7 +125,7 @@ def data(): assert step.step_info is norm_info assert_trace_printable(trace) assert isinstance(p.last_trace.last_normalize_info, NormalizeInfo) - assert p.last_trace.last_normalize_info.row_counts == {'_dlt_pipeline_state': 1, 'data': 3} + assert p.last_trace.last_normalize_info.row_counts == {"_dlt_pipeline_state": 1, "data": 3} # load os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately @@ -162,7 +165,7 @@ def data(): def test_save_load_trace() -> None: os.environ["COMPLETED_PROB"] = "1.0" - info = dlt.pipeline().run([1,2,3], table_name="data", destination="dummy") + info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") pipeline = dlt.pipeline() # will get trace from working dir trace = pipeline.last_trace @@ -177,7 +180,10 @@ def test_save_load_trace() -> None: assert resolved.config_type_name == "DummyClientConfiguration" assert_trace_printable(trace) # check row counts - assert pipeline.last_trace.last_normalize_info.row_counts == {'_dlt_pipeline_state': 1, 'data': 3} + assert pipeline.last_trace.last_normalize_info.row_counts == { + "_dlt_pipeline_state": 1, + "data": 3, + } # exception also saves trace @dlt.resource @@ -207,22 +213,31 @@ def data(): def test_disable_trace(environment: DictStrStr) -> None: environment["ENABLE_RUNTIME_TRACE"] = "false" environment["COMPLETED_PROB"] = "1.0" - 
dlt.pipeline().run([1,2,3], table_name="data", destination="dummy") + dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") assert dlt.pipeline().last_trace is None def test_trace_on_restore_state(environment: DictStrStr) -> None: environment["COMPLETED_PROB"] = "1.0" - def _sync_destination_patch(self: Pipeline, destination: str = None, staging: str = None, dataset_name: str = None): + def _sync_destination_patch( + self: Pipeline, destination: str = None, staging: str = None, dataset_name: str = None + ): # just wipe the pipeline simulating deleted dataset self._wipe_working_folder() - self._configure(self._schema_storage_config.export_schema_path, self._schema_storage_config.import_schema_path, False) - - with patch.object(Pipeline, 'sync_destination', _sync_destination_patch): - dlt.pipeline().run([1,2,3], table_name="data", destination="dummy") + self._configure( + self._schema_storage_config.export_schema_path, + self._schema_storage_config.import_schema_path, + False, + ) + + with patch.object(Pipeline, "sync_destination", _sync_destination_patch): + dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") assert len(dlt.pipeline().last_trace.steps) == 4 - assert dlt.pipeline().last_trace.last_normalize_info.row_counts == {'_dlt_pipeline_state': 1, 'data': 3} + assert dlt.pipeline().last_trace.last_normalize_info.row_counts == { + "_dlt_pipeline_state": 1, + "data": 3, + } def test_load_none_trace() -> None: @@ -231,14 +246,18 @@ def test_load_none_trace() -> None: def test_trace_telemetry() -> None: - with patch("dlt.common.runtime.sentry.before_send", _mock_sentry_before_send), patch("dlt.common.runtime.segment.before_send", _mock_segment_before_send): + with patch("dlt.common.runtime.sentry.before_send", _mock_sentry_before_send), patch( + "dlt.common.runtime.segment.before_send", _mock_segment_before_send + ): # os.environ["FAIL_PROB"] = "1.0" # make it complete immediately start_test_telemetry() SEGMENT_SENT_ITEMS.clear() SENTRY_SENT_ITEMS.clear() # default dummy fails all files - load_info = dlt.pipeline().run([1,2,3], table_name="data", destination="dummy", dataset_name="data_data") + load_info = dlt.pipeline().run( + [1, 2, 3], table_name="data", destination="dummy", dataset_name="data_data" + ) # we should have 4 segment items assert len(SEGMENT_SENT_ITEMS) == 4 expected_steps = ["extract", "normalize", "load", "run"] @@ -246,9 +265,15 @@ def test_trace_telemetry() -> None: assert event["event"] == f"pipeline_{step}" assert event["properties"]["success"] is True assert event["properties"]["destination_name"] == "dummy" - assert event["properties"]["pipeline_name_hash"] == digest128(load_info.pipeline.pipeline_name) - assert event["properties"]["dataset_name_hash"] == digest128(load_info.pipeline.dataset_name) - assert event["properties"]["default_schema_name_hash"] == digest128(load_info.pipeline.default_schema_name) + assert event["properties"]["pipeline_name_hash"] == digest128( + load_info.pipeline.pipeline_name + ) + assert event["properties"]["dataset_name_hash"] == digest128( + load_info.pipeline.dataset_name + ) + assert event["properties"]["default_schema_name_hash"] == digest128( + load_info.pipeline.default_schema_name + ) assert isinstance(event["properties"]["elapsed"], float) assert isinstance(event["properties"]["transaction_id"], str) # check extract info @@ -278,7 +303,9 @@ def data(): assert isinstance(event["properties"]["elapsed"], float) # check extract info if step == "extract": - assert event["properties"]["extract_data"] 
== [{"name": "data", "data_type": "resource"}] + assert event["properties"]["extract_data"] == [ + {"name": "data", "data_type": "resource"} + ] # we didn't log any errors assert len(SENTRY_SENT_ITEMS) == 0 @@ -298,25 +325,32 @@ def data(): def test_extract_data_describe() -> None: schema = Schema("test") - assert describe_extract_data(DltSource(schema, "sect")) == [{"name": "test", "data_type": "source"}] - assert describe_extract_data(DltResource(Pipe("rrr_extract"), None, False)) == [{"name": "rrr_extract", "data_type": "resource"}] - assert describe_extract_data([DltSource(schema, "sect")]) == [{"name": "test", "data_type": "source"}] - assert describe_extract_data([DltResource(Pipe("rrr_extract"), None, False)]) == [{"name": "rrr_extract", "data_type": "resource"}] + assert describe_extract_data(DltSource(schema, "sect")) == [ + {"name": "test", "data_type": "source"} + ] + assert describe_extract_data(DltResource(Pipe("rrr_extract"), None, False)) == [ + {"name": "rrr_extract", "data_type": "resource"} + ] + assert describe_extract_data([DltSource(schema, "sect")]) == [ + {"name": "test", "data_type": "source"} + ] + assert describe_extract_data([DltResource(Pipe("rrr_extract"), None, False)]) == [ + {"name": "rrr_extract", "data_type": "resource"} + ] assert describe_extract_data( [DltResource(Pipe("rrr_extract"), None, False), DltSource(schema, "sect")] - ) == [ - {"name": "rrr_extract", "data_type": "resource"}, {"name": "test", "data_type": "source"} - ] + ) == [{"name": "rrr_extract", "data_type": "resource"}, {"name": "test", "data_type": "source"}] assert describe_extract_data([{"a": "b"}]) == [{"name": "", "data_type": "dict"}] from pandas import DataFrame + # we assume that List content has same type - assert describe_extract_data([DataFrame(), {"a": "b"}]) == [{"name": "", "data_type": "DataFrame"}] + assert describe_extract_data([DataFrame(), {"a": "b"}]) == [ + {"name": "", "data_type": "DataFrame"} + ] # first unnamed element in the list breaks checking info assert describe_extract_data( [DltResource(Pipe("rrr_extract"), None, False), DataFrame(), DltSource(schema, "sect")] - ) == [ - {"name": "rrr_extract", "data_type": "resource"}, {"name": "", "data_type": "DataFrame"} - ] + ) == [{"name": "rrr_extract", "data_type": "resource"}, {"name": "", "data_type": "DataFrame"}] def test_slack_hook(environment: DictStrStr) -> None: @@ -328,7 +362,7 @@ def test_slack_hook(environment: DictStrStr) -> None: environment["RUNTIME__SLACK_INCOMING_HOOK"] = hook_url with requests_mock.mock() as m: m.post(hook_url, json={}) - load_info = dlt.pipeline().run([1,2,3], table_name="data", destination="dummy") + load_info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") assert slack_notify_load_success(load_info.pipeline.runtime_config.slack_incoming_hook, load_info, load_info.pipeline.last_trace) == 200 # type: ignore[attr-defined] assert m.called message = m.last_request.json() @@ -339,7 +373,7 @@ def test_slack_hook(environment: DictStrStr) -> None: def test_broken_slack_hook(environment: DictStrStr) -> None: environment["COMPLETED_PROB"] = "1.0" environment["RUNTIME__SLACK_INCOMING_HOOK"] = "http://localhost:22" - load_info = dlt.pipeline().run([1,2,3], table_name="data", destination="dummy") + load_info = dlt.pipeline().run([1, 2, 3], table_name="data", destination="dummy") # connection error assert slack_notify_load_success(load_info.pipeline.runtime_config.slack_incoming_hook, load_info, load_info.pipeline.last_trace) == -1 # type: ignore[attr-defined] # pipeline 
= dlt.pipeline() @@ -352,21 +386,28 @@ def test_broken_slack_hook(environment: DictStrStr) -> None: # assert run_step.step_exception is None -def _find_resolved_value(resolved: List[SerializableResolvedValueTrace], key: str, sections: List[str]) -> SerializableResolvedValueTrace: +def _find_resolved_value( + resolved: List[SerializableResolvedValueTrace], key: str, sections: List[str] +) -> SerializableResolvedValueTrace: return next((v for v in resolved if v.key == key and v.sections == sections), None) SEGMENT_SENT_ITEMS = [] + + def _mock_segment_before_send(event: DictStrAny) -> DictStrAny: SEGMENT_SENT_ITEMS.append(event) return event SENTRY_SENT_ITEMS = [] + + def _mock_sentry_before_send(event: DictStrAny, _unused_hint: Any = None) -> DictStrAny: SENTRY_SENT_ITEMS.append(event) return event + def assert_trace_printable(trace: PipelineTrace) -> None: str(trace) trace.asstr(0) diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py index 93a5abf44c..f585f5896c 100644 --- a/tests/pipeline/test_schema_contracts.py +++ b/tests/pipeline/test_schema_contracts.py @@ -11,7 +11,12 @@ from dlt.pipeline.exceptions import PipelineStepFailed from tests.load.pipeline.utils import load_table_counts -from tests.utils import TDataItemFormat, skip_if_not_active, data_to_item_format, ALL_DATA_ITEM_FORMATS +from tests.utils import ( + TDataItemFormat, + skip_if_not_active, + data_to_item_format, + ALL_DATA_ITEM_FORMATS, +) skip_if_not_active("duckdb") @@ -31,77 +36,55 @@ def raises_frozen_exception(check_raise: bool = True) -> Any: def items(settings: TSchemaContract) -> Any: - # NOTE: names must be normalizeds @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): - yield { - "id": index, - "SomeInt": 1, - "name": f"item {index}" - } + yield {"id": index, "SomeInt": 1, "name": f"item {index}"} return load_items def items_with_variant(settings: TSchemaContract) -> Any: - @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): - yield { - "id": index, - "name": f"item {index}", - "SomeInt": "hello" - } + yield {"id": index, "name": f"item {index}", "SomeInt": "hello"} return load_items def items_with_new_column(settings: TSchemaContract) -> Any: - @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): - yield { - "id": index, - "name": f"item {index}", - "New^Col": "hello" - } + yield {"id": index, "name": f"item {index}", "New^Col": "hello"} return load_items def items_with_subtable(settings: TSchemaContract) -> Any: - @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): yield { "id": index, "name": f"item {index}", - "sub_items": [{ - "id": index + 1000, - "name": f"sub item {index + 1000}" - }] + "sub_items": [{"id": index + 1000, "name": f"sub item {index + 1000}"}], } return load_items -def new_items(settings: TSchemaContract) -> Any: +def new_items(settings: TSchemaContract) -> Any: @dlt.resource(name="new_items", write_disposition="append", schema_contract=settings) def load_items(): for _, index in enumerate(range(0, 10), 1): - yield { - "id": index, - "some_int": 1, - "name": f"item {index}" - } + yield {"id": index, "some_int": 1, "name": f"item {index}"} return load_items + OLD_COLUMN_NAME = "name" 
NEW_COLUMN_NAME = "new_col" VARIANT_COLUMN_NAME = "some_int__v_text" @@ -109,8 +92,13 @@ def load_items(): NEW_ITEMS_TABLE = "new_items" -def run_resource(pipeline: Pipeline, resource_fun: Callable[..., DltResource], settings: Any, item_format: TDataItemFormat = "json", duplicates: int = 1) -> None: - +def run_resource( + pipeline: Pipeline, + resource_fun: Callable[..., DltResource], + settings: Any, + item_format: TDataItemFormat = "json", + duplicates: int = 1, +) -> None: for item in settings.keys(): assert item in LOCATIONS ev_settings = settings[item] @@ -133,7 +121,9 @@ def source() -> Iterator[DltResource]: pipeline.run(source(), schema_contract=settings.get("override")) # check global settings - assert pipeline.default_schema._settings.get("schema_contract", None) == (settings.get("override") or settings.get("source")) + assert pipeline.default_schema._settings.get("schema_contract", None) == ( + settings.get("override") or settings.get("source") + ) # check items table settings # assert pipeline.default_schema.tables["items"].get("schema_contract", {}) == (settings.get("resource") or {}) @@ -141,36 +131,47 @@ def source() -> Iterator[DltResource]: # check effective table settings # assert resolve_contract_settings_for_table(None, "items", pipeline.default_schema) == expand_schema_contract_settings(settings.get("resource") or settings.get("override") or "evolve") + def get_pipeline(): import duckdb - return dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) + + return dlt.pipeline( + pipeline_name=uniq_id(), + destination="duckdb", + credentials=duckdb.connect(":memory:"), + full_refresh=True, + ) @pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) @pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS) -def test_new_tables(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None: - +def test_new_tables( + contract_setting: str, setting_location: str, item_format: TDataItemFormat +) -> None: pipeline = get_pipeline() - full_settings = { - setting_location: { - "tables": contract_setting - }} + full_settings = {setting_location: {"tables": contract_setting}} run_resource(pipeline, items, {}, item_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] run_resource(pipeline, items_with_new_column, full_settings, item_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 20 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # test adding new table with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, new_items, full_settings, item_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) # delete extracted files if left after exception 
pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources()) @@ -179,7 +180,9 @@ def test_new_tables(contract_setting: str, setting_location: str, item_format: T if item_format == "json": # run add variant column run_resource(pipeline, items_with_variant, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 30 assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -187,7 +190,9 @@ def test_new_tables(contract_setting: str, setting_location: str, item_format: T with raises_frozen_exception(contract_setting == "freeze"): run_resource(pipeline, items_with_subtable, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 30 if contract_setting in ["freeze"] else 40 assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if contract_setting in ["evolve"] else 0) @@ -195,22 +200,24 @@ def test_new_tables(contract_setting: str, setting_location: str, item_format: T @pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) @pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS) -def test_new_columns(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None: - - full_settings = { - setting_location: { - "columns": contract_setting - }} +def test_new_columns( + contract_setting: str, setting_location: str, item_format: TDataItemFormat +) -> None: + full_settings = {setting_location: {"columns": contract_setting}} pipeline = get_pipeline() run_resource(pipeline, items, {}, item_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # new should work run_resource(pipeline, new_items, full_settings, item_format) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) expected_items_count = 10 assert table_counts["items"] == expected_items_count assert table_counts[NEW_ITEMS_TABLE] == 10 @@ -225,15 +232,19 @@ def test_new_columns(contract_setting: str, setting_location: str, item_format: assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] else: assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) - expected_items_count += (20 if contract_setting in ["evolve", "discard_value"] else 0) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) + expected_items_count += 20 if contract_setting in ["evolve", "discard_value"] else 0 assert table_counts["items"] == expected_items_count # NOTE: arrow / pandas do not support variants and subtables so we must skip if item_format == "json": # subtable should 
work run_resource(pipeline, items_with_subtable, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) expected_items_count += 10 assert table_counts["items"] == expected_items_count assert table_counts[SUBITEMS_TABLE] == 10 @@ -242,7 +253,9 @@ def test_new_columns(contract_setting: str, setting_location: str, item_format: run_resource(pipeline, items_with_variant, full_settings) # variants are not new columns and should be able to always evolve assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) expected_items_count += 10 assert table_counts["items"] == expected_items_count @@ -250,32 +263,36 @@ def test_new_columns(contract_setting: str, setting_location: str, item_format: @pytest.mark.parametrize("contract_setting", schema_contract) @pytest.mark.parametrize("setting_location", LOCATIONS) def test_freeze_variants(contract_setting: str, setting_location: str) -> None: - - full_settings = { - setting_location: { - "data_type": contract_setting - }} + full_settings = {setting_location: {"data_type": contract_setting}} pipeline = get_pipeline() run_resource(pipeline, items, {}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] # subtable should work run_resource(pipeline, items_with_subtable, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 20 assert table_counts[SUBITEMS_TABLE] == 10 # new should work run_resource(pipeline, new_items, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 20 assert table_counts[NEW_ITEMS_TABLE] == 10 # test adding new column run_resource(pipeline, items_with_new_column, full_settings) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 30 assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] @@ -287,7 +304,9 @@ def test_freeze_variants(contract_setting: str, setting_location: str) -> None: assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] else: assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == (40 if contract_setting in 
["evolve", "discard_value"] else 30) @@ -298,49 +317,57 @@ def test_settings_precedence() -> None: run_resource(pipeline, items, {}) # trying to add new column when forbidden on resource will fail - run_resource(pipeline, items_with_new_column, {"resource": { - "columns": "discard_row" - }}) + run_resource(pipeline, items_with_new_column, {"resource": {"columns": "discard_row"}}) # when allowed on override it will work - run_resource(pipeline, items_with_new_column, { - "resource": {"columns": "freeze"}, - "override": {"columns": "evolve"} - }) + run_resource( + pipeline, + items_with_new_column, + {"resource": {"columns": "freeze"}, "override": {"columns": "evolve"}}, + ) def test_settings_precedence_2() -> None: pipeline = get_pipeline() # load some data - run_resource(pipeline, items, {"source": { - "data_type": "discard_row" - }}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + run_resource(pipeline, items, {"source": {"data_type": "discard_row"}}) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 # trying to add variant when forbidden on source will fail - run_resource(pipeline, items_with_variant, {"source": { - "data_type": "discard_row" - }}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + run_resource(pipeline, items_with_variant, {"source": {"data_type": "discard_row"}}) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 # if allowed on resource it will pass - run_resource(pipeline, items_with_variant, { - "resource": {"data_type": "evolve"}, - "source": {"data_type": "discard_row"} - }) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + run_resource( + pipeline, + items_with_variant, + {"resource": {"data_type": "evolve"}, "source": {"data_type": "discard_row"}}, + ) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 20 # if allowed on override it will also pass - run_resource(pipeline, items_with_variant, { - "resource": {"data_type": "discard_row"}, - "source": {"data_type": "discard_row"}, - "override": {"data_type": "evolve"}, - }) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + run_resource( + pipeline, + items_with_variant, + { + "resource": {"data_type": "discard_row"}, + "source": {"data_type": "discard_row"}, + "override": {"data_type": "evolve"}, + }, + ) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 30 @@ -350,21 +377,23 @@ def test_change_mode(setting_location: str) -> None: # load some data run_resource(pipeline, items, {}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 # trying to add variant when forbidden will fail - run_resource(pipeline, items_with_variant, {setting_location: { - "data_type": "discard_row" - }}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + run_resource(pipeline, 
items_with_variant, {setting_location: {"data_type": "discard_row"}}) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 # now allow - run_resource(pipeline, items_with_variant, {setting_location: { - "data_type": "evolve" - }}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + run_resource(pipeline, items_with_variant, {setting_location: {"data_type": "evolve"}}) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 20 @@ -373,22 +402,30 @@ def test_single_settings_value(setting_location: str) -> None: pipeline = get_pipeline() run_resource(pipeline, items, {}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 # trying to add variant when forbidden will fail run_resource(pipeline, items_with_variant, {setting_location: "discard_row"}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 # trying to add new column will fail run_resource(pipeline, items_with_new_column, {setting_location: "discard_row"}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 # trying to add new table will fail run_resource(pipeline, new_items, {setting_location: "discard_row"}) - table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + table_counts = load_table_counts( + pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()] + ) assert table_counts["items"] == 10 assert "new_items" not in table_counts @@ -403,42 +440,37 @@ class Items(BaseModel): id: int # noqa: A003 name: Optional[str] amount: Union[int, str, None] + class Config: extra = Extra.forbid @dlt.resource(name="items") def get_items(): - yield from [{ - "id": 5, - "name": "dave", - "amount": 6, - }] + yield from [ + { + "id": 5, + "name": "dave", + "amount": 6, + } + ] @dlt.resource(name="items", columns=Items) def get_items_with_model(): - yield from [{ - "id": 5, - "name": "dave", - "amount": 6, - }] + yield from [ + { + "id": 5, + "name": "dave", + "amount": 6, + } + ] @dlt.resource(name="items") def get_items_new_col(): - yield from [{ - "id": 5, - "name": "dave", - "amount": 6, - "new_col": "hello" - }] + yield from [{"id": 5, "name": "dave", "amount": 6, "new_col": "hello"}] @dlt.resource(name="items") def get_items_subtable(): - yield from [{ - "id": 5, - "name": "dave", - "amount": 6, - "sub": [{"hello": "dave"}] - }] + yield from [{"id": 5, "name": "dave", "amount": 6, "sub": [{"hello": "dave"}]}] # test valid object pipeline = get_pipeline() @@ -461,30 +493,19 @@ def get_items_subtable(): def test_different_objects_in_one_load() -> None: - pipeline = get_pipeline() @dlt.resource(name="items") def get_items(): - yield { - "id": 1, - "name": "dave", - "amount": 50 - } - yield { - "id": 2, - "name": "dave", - "amount": 50, - "new_column": "some 
val" - } + yield {"id": 1, "name": "dave", "amount": 50} + yield {"id": 2, "name": "dave", "amount": 50, "new_column": "some val"} - pipeline.run([get_items()], schema_contract={"columns": "freeze", "tables":"evolve"}) + pipeline.run([get_items()], schema_contract={"columns": "freeze", "tables": "evolve"}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 @pytest.mark.parametrize("table_mode", ["discard_row", "evolve", "freeze"]) def test_dynamic_tables(table_mode: str) -> None: - pipeline = get_pipeline() # adding columns with a data type makes this columns complete which makes this table complete -> it fails in the normalize because @@ -497,17 +518,18 @@ def get_items(): "id": 1, "tables": "one", } - yield { - "id": 2, - "tables": "two", - "new_column": "some val" - } + yield {"id": 2, "tables": "two", "new_column": "some val"} + with raises_frozen_exception(table_mode == "freeze"): pipeline.run([get_items()], schema_contract={"tables": table_mode}) if table_mode != "freeze": - assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == (1 if table_mode == "evolve" else 0) - assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == (1 if table_mode == "evolve" else 0) + assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == ( + 1 if table_mode == "evolve" else 0 + ) + assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == ( + 1 if table_mode == "evolve" else 0 + ) @pytest.mark.parametrize("column_mode", ["discard_row", "evolve", "freeze"]) @@ -520,22 +542,20 @@ def get_items(): "id": 1, "key": "value", } + pipeline.run([get_items()], schema_contract={"columns": column_mode}) assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 @pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) def test_new_column_from_hint_and_data(column_mode: str) -> None: - pipeline = get_pipeline() # we define complete column on id, this creates a complete table # normalizer does not know that it is a new table and discards the row # and it also excepts on column freeze - @dlt.resource( - name="items", - columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) + @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) def get_items(): yield { "id": 1, @@ -548,7 +568,6 @@ def get_items(): @pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) def test_two_new_columns_from_two_rows(column_mode: str) -> None: - pipeline = get_pipeline() # this creates a complete table in first row @@ -564,13 +583,13 @@ def items(): "id": 1, "key": "value", } + pipeline.run([items()], schema_contract={"columns": column_mode}) assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 @pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) def test_dynamic_new_columns(column_mode: str) -> None: - pipeline = get_pipeline() # fails because dlt is not able to add _dlt_load_id to tables. 
I think we should do an exception for those diff --git a/tests/pipeline/test_schema_updates.py b/tests/pipeline/test_schema_updates.py index b88c1a7773..be397f796c 100644 --- a/tests/pipeline/test_schema_updates.py +++ b/tests/pipeline/test_schema_updates.py @@ -11,7 +11,8 @@ def test_schema_updates() -> None: def source(): @dlt.resource() def resource(): - yield [1,2,3] + yield [1, 2, 3] + return resource # test without normalizer attributes @@ -23,11 +24,7 @@ def resource(): s = source() p.run(s, table_name="items", write_disposition="merge") assert p.default_schema._normalizers_config["json"]["config"] == { - "propagation": { - "tables": { - "items": {'_dlt_id': '_dlt_root_id'} - } - } + "propagation": {"tables": {"items": {"_dlt_id": "_dlt_root_id"}}} } # set root key @@ -36,10 +33,8 @@ def resource(): p.run(s, table_name="items", write_disposition="merge") assert p.default_schema._normalizers_config["json"]["config"] == { "propagation": { - "tables": { - "items": {'_dlt_id': '_dlt_root_id'} - }, - "root": {'_dlt_id': '_dlt_root_id'} + "tables": {"items": {"_dlt_id": "_dlt_root_id"}}, + "root": {"_dlt_id": "_dlt_root_id"}, } } @@ -49,11 +44,7 @@ def resource(): p.run(s, table_name="items", write_disposition="merge") # source schema overwrites normalizer settings so `root` propagation is gone assert p.default_schema._normalizers_config["json"]["config"] == { - "propagation": { - "tables": { - "items": {'_dlt_id': '_dlt_root_id'} - } - } + "propagation": {"tables": {"items": {"_dlt_id": "_dlt_root_id"}}} } # set max nesting @@ -61,12 +52,8 @@ def resource(): s.max_table_nesting = 5 p.run(s, table_name="items", write_disposition="merge") assert p.default_schema._normalizers_config["json"]["config"] == { - "propagation": { - "tables": { - "items": {'_dlt_id': '_dlt_root_id'} - } - }, - "max_nesting": 5 + "propagation": {"tables": {"items": {"_dlt_id": "_dlt_root_id"}}}, + "max_nesting": 5, } # update max nesting and new table @@ -76,9 +63,9 @@ def resource(): assert p.default_schema._normalizers_config["json"]["config"] == { "propagation": { "tables": { - "items": {'_dlt_id': '_dlt_root_id'}, - "items2": {'_dlt_id': '_dlt_root_id'}, + "items": {"_dlt_id": "_dlt_root_id"}, + "items2": {"_dlt_id": "_dlt_root_id"}, } }, - "max_nesting": 50 - } \ No newline at end of file + "max_nesting": 50, + } diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 0d36ff3021..94683e4995 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -48,7 +48,12 @@ def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: try: with p.sql_client() as c: qualified_names = [c.make_qualified_table_name(name) for name in table_names] - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" for name, q_name in zip(table_names, qualified_names)]) + query = "\nUNION ALL\n".join( + [ + f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" + for name, q_name in zip(table_names, qualified_names) + ] + ) with c.execute_query(query) as cur: rows = list(cur.fetchall()) return {r[0]: r[1] for r in rows} @@ -62,6 +67,7 @@ def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: result[table_name] = len(items) return result + def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: tables = [table["name"] for table in p.default_schema.data_tables()] return load_table_counts(p, *tables) @@ -69,7 +75,9 @@ def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> 
None: table_counts = load_data_table_counts(p) - assert table_counts == expected_counts, f"Table counts do not match, expected {expected_counts}, got {table_counts}" + assert ( + table_counts == expected_counts + ), f"Table counts do not match, expected {expected_counts}, got {table_counts}" def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]: @@ -114,6 +122,7 @@ def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]: # load parquet elif ext == "parquet": import pyarrow.parquet as pq + with open(full_path, "rb") as f: table = pq.read_table(f) cols = table.column_names @@ -136,7 +145,9 @@ def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, A """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" client: FilesystemClient = p.destination_client() # type: ignore[assignment] result: Dict[str, Any] = {} - for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): + for basedir, _dirs, files in client.fs_client.walk( + client.dataset_path, detail=False, refresh=True + ): for file in files: table_name, items = load_file(basedir, file) if table_name not in table_names: @@ -153,9 +164,7 @@ def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, A return result - def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - # try sql, could be other destination though try: result = {} @@ -180,9 +189,16 @@ def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[D return load_files(p, *table_names) -def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_names: str) -> DictStrAny: +def load_table_distinct_counts( + p: dlt.Pipeline, distinct_column: str, *table_names: str +) -> DictStrAny: """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" for name in table_names]) + query = "\nUNION ALL\n".join( + [ + f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" + for name in table_names + ] + ) with p.sql_client() as c: with c.execute_query(query) as cur: rows = list(cur.fetchall()) @@ -191,10 +207,8 @@ def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_nam @dlt.source def airtable_emojis(): - @dlt.resource(name="📆 Schedule") def schedule(): - yield [1, 2, 3] @dlt.resource(name="💰Budget", primary_key=("🔑book_id", "asset_id")) @@ -211,12 +225,10 @@ def peacock(): def wide_peacock(): yield [{"peacock": [1, 2, 3]}] - return budget, schedule, peacock, wide_peacock def run_deferred(iters): - @dlt.defer def item(n): sleep(random.random() / 2) diff --git a/tests/reflection/module_cases/__init__.py b/tests/reflection/module_cases/__init__.py index 851514132d..4b792d81c0 100644 --- a/tests/reflection/module_cases/__init__.py +++ b/tests/reflection/module_cases/__init__.py @@ -1,4 +1,4 @@ import xxx.absolutely from xxx.absolutely import a1, a3 -from dlt.common.utils import uniq_id \ No newline at end of file +from dlt.common.utils import uniq_id diff --git a/tests/reflection/module_cases/all_imports.py b/tests/reflection/module_cases/all_imports.py index 0cfde3a9a1..32ca48ec6f 100644 --- a/tests/reflection/module_cases/all_imports.py +++ b/tests/reflection/module_cases/all_imports.py @@ -1 +1 @@ -from dlt.common.utils import uniq_id \ No 
newline at end of file +from dlt.common.utils import uniq_id diff --git a/tests/reflection/module_cases/executes_resource.py b/tests/reflection/module_cases/executes_resource.py index a2024398fc..3049eb51f9 100644 --- a/tests/reflection/module_cases/executes_resource.py +++ b/tests/reflection/module_cases/executes_resource.py @@ -1,9 +1,10 @@ import dlt + @dlt.resource def aleph(n: int): for i in range(0, n): yield i -print(list(aleph(10))) \ No newline at end of file +print(list(aleph(10))) diff --git a/tests/reflection/module_cases/import_as_type.py b/tests/reflection/module_cases/import_as_type.py index 500a1bf8a0..38604304ba 100644 --- a/tests/reflection/module_cases/import_as_type.py +++ b/tests/reflection/module_cases/import_as_type.py @@ -1,6 +1,8 @@ from xxx.aa import Tx + def create_tx() -> Tx: return Tx() + tx = Tx() diff --git a/tests/reflection/module_cases/no_pkg.py b/tests/reflection/module_cases/no_pkg.py index 62e3377048..497740970c 100644 --- a/tests/reflection/module_cases/no_pkg.py +++ b/tests/reflection/module_cases/no_pkg.py @@ -1 +1 @@ -from . import uniq_id \ No newline at end of file +from . import uniq_id diff --git a/tests/reflection/module_cases/raises.py b/tests/reflection/module_cases/raises.py index 2c4cc4daa1..d2f5167716 100644 --- a/tests/reflection/module_cases/raises.py +++ b/tests/reflection/module_cases/raises.py @@ -1,4 +1,4 @@ from xxx.absolutely import a1, a3 from dlt.common.utils import uniq_id -raise NotImplementedError("empty module") \ No newline at end of file +raise NotImplementedError("empty module") diff --git a/tests/reflection/module_cases/stripe_analytics/__init__.py b/tests/reflection/module_cases/stripe_analytics/__init__.py index 6877ef5475..8f0b2ff6b6 100644 --- a/tests/reflection/module_cases/stripe_analytics/__init__.py +++ b/tests/reflection/module_cases/stripe_analytics/__init__.py @@ -1,2 +1,2 @@ from .stripe_analytics import VALUE -from .helpers import HELPERS_VALUE \ No newline at end of file +from .helpers import HELPERS_VALUE diff --git a/tests/reflection/module_cases/stripe_analytics/stripe_analytics.py b/tests/reflection/module_cases/stripe_analytics/stripe_analytics.py index d41cb0c51a..6ee95e6bf8 100644 --- a/tests/reflection/module_cases/stripe_analytics/stripe_analytics.py +++ b/tests/reflection/module_cases/stripe_analytics/stripe_analytics.py @@ -1,3 +1,3 @@ import stripe -VALUE = 1 \ No newline at end of file +VALUE = 1 diff --git a/tests/reflection/module_cases/stripe_analytics_pipeline.py b/tests/reflection/module_cases/stripe_analytics_pipeline.py index 7cb84c9e6e..67002f6ed9 100644 --- a/tests/reflection/module_cases/stripe_analytics_pipeline.py +++ b/tests/reflection/module_cases/stripe_analytics_pipeline.py @@ -1,4 +1,4 @@ from stripe_analytics import VALUE, HELPERS_VALUE print(VALUE) -print(HELPERS_VALUE) \ No newline at end of file +print(HELPERS_VALUE) diff --git a/tests/reflection/test_script_inspector.py b/tests/reflection/test_script_inspector.py index 291c823357..0769a2aa82 100644 --- a/tests/reflection/test_script_inspector.py +++ b/tests/reflection/test_script_inspector.py @@ -1,12 +1,18 @@ from types import SimpleNamespace import pytest -from dlt.reflection.script_inspector import load_script_module, inspect_pipeline_script, DummyModule, PipelineIsRunning +from dlt.reflection.script_inspector import ( + load_script_module, + inspect_pipeline_script, + DummyModule, + PipelineIsRunning, +) from tests.utils import unload_modules MODULE_CASES = "./tests/reflection/module_cases" + def 
test_import_init_module() -> None: with pytest.raises(ModuleNotFoundError): load_script_module("./tests/reflection/", "module_cases", ignore_missing_imports=False) @@ -27,7 +33,9 @@ def test_import_module() -> None: with pytest.raises(ImportError): load_script_module(MODULE_CASES, "no_pkg", ignore_missing_imports=True) # but with package name in module name it will work - m = load_script_module("./tests/reflection/", "module_cases.no_pkg", ignore_missing_imports=True) + m = load_script_module( + "./tests/reflection/", "module_cases.no_pkg", ignore_missing_imports=True + ) # uniq_id got imported assert isinstance(m.uniq_id(), str) @@ -58,4 +66,4 @@ def test_package_dummy_clash() -> None: m = load_script_module(MODULE_CASES, "stripe_analytics_pipeline", ignore_missing_imports=True) # and those would fails assert m.VALUE == 1 - assert m.HELPERS_VALUE == 3 \ No newline at end of file + assert m.HELPERS_VALUE == 3 diff --git a/tests/sources/helpers/test_requests.py b/tests/sources/helpers/test_requests.py index ea728b92cb..695fa93eca 100644 --- a/tests/sources/helpers/test_requests.py +++ b/tests/sources/helpers/test_requests.py @@ -15,18 +15,23 @@ from dlt.common.configuration.specs import RunConfiguration from dlt.sources.helpers.requests import Session, Client, client as default_client from dlt.sources.helpers.requests.retry import ( - DEFAULT_RETRY_EXCEPTIONS, DEFAULT_RETRY_STATUS, retry_if_status, retry_any, Retrying, wait_exponential_retry_after + DEFAULT_RETRY_EXCEPTIONS, + DEFAULT_RETRY_STATUS, + retry_if_status, + retry_any, + Retrying, + wait_exponential_retry_after, ) -@pytest.fixture(scope='function', autouse=True) +@pytest.fixture(scope="function", autouse=True) def mock_sleep() -> Iterator[mock.MagicMock]: - with mock.patch('time.sleep') as m: + with mock.patch("time.sleep") as m: yield m def test_default_session_retry_settings() -> None: - retry: Retrying = Client().session.request.retry # type: ignore + retry: Retrying = Client().session.request.retry # type: ignore assert retry.stop.max_attempt_number == 5 # type: ignore assert isinstance(retry.retry, retry_any) retries = retry.retry.retries @@ -36,7 +41,7 @@ def test_default_session_retry_settings() -> None: assert retry.wait.multiplier == 1 -@pytest.mark.parametrize('respect_retry_after_header', (True, False)) +@pytest.mark.parametrize("respect_retry_after_header", (True, False)) def test_custom_session_retry_settings(respect_retry_after_header: bool) -> None: def custom_retry_cond(response, exception): return True @@ -52,14 +57,14 @@ def custom_retry_cond(response, exception): assert retry.stop.max_attempt_number == 14 # type: ignore assert isinstance(retry.retry, retry_any) retries = retry.retry.retries - assert retries[2].predicate == custom_retry_cond # type: ignore + assert retries[2].predicate == custom_retry_cond # type: ignore assert isinstance(retry.wait, wait_exponential) assert retry.wait.multiplier == 2 def test_retry_on_status_all_fails(mock_sleep: mock.MagicMock) -> None: session = Client().session - url = 'https://example.com/data' + url = "https://example.com/data" with requests_mock.mock(session=session) as m: m.get(url, status_code=503) @@ -68,16 +73,16 @@ def test_retry_on_status_all_fails(mock_sleep: mock.MagicMock) -> None: assert m.call_count == RunConfiguration.request_max_attempts + def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None: - """Test successful request after 2 retries - """ + """Test successful request after 2 retries""" session = Client().session - url = 
'https://example.com/data' + url = "https://example.com/data" responses = [ - dict(text='error', status_code=503), - dict(text='error', status_code=503), - dict(text='error', status_code=200) + dict(text="error", status_code=503), + dict(text="error", status_code=503), + dict(text="error", status_code=200), ] with requests_mock.mock(session=session) as m: @@ -87,8 +92,9 @@ def test_retry_on_status_success_after_2(mock_sleep: mock.MagicMock) -> None: assert resp.status_code == 200 assert m.call_count == 3 + def test_retry_on_status_without_raise_for_status(mock_sleep: mock.MagicMock) -> None: - url = 'https://example.com/data' + url = "https://example.com/data" session = Client(raise_for_status=False).session with requests_mock.mock(session=session) as m: @@ -98,10 +104,16 @@ def test_retry_on_status_without_raise_for_status(mock_sleep: mock.MagicMock) -> assert m.call_count == RunConfiguration.request_max_attempts -@pytest.mark.parametrize('exception_class', [requests.ConnectionError, requests.ConnectTimeout, requests.exceptions.ChunkedEncodingError]) -def test_retry_on_exception_all_fails(exception_class: Type[Exception], mock_sleep: mock.MagicMock) -> None: + +@pytest.mark.parametrize( + "exception_class", + [requests.ConnectionError, requests.ConnectTimeout, requests.exceptions.ChunkedEncodingError], +) +def test_retry_on_exception_all_fails( + exception_class: Type[Exception], mock_sleep: mock.MagicMock +) -> None: session = Client().session - url = 'https://example.com/data' + url = "https://example.com/data" with requests_mock.mock(session=session) as m: m.get(url, exc=exception_class) @@ -110,41 +122,44 @@ def test_retry_on_exception_all_fails(exception_class: Type[Exception], mock_sle assert m.call_count == RunConfiguration.request_max_attempts + def test_retry_on_custom_condition(mock_sleep: mock.MagicMock) -> None: def retry_on(response: requests.Response, exception: BaseException) -> bool: - return response.text == 'error' + return response.text == "error" session = Client(retry_condition=retry_on).session - url = 'https://example.com/data' + url = "https://example.com/data" with requests_mock.mock(session=session) as m: - m.get(url, text='error') + m.get(url, text="error") response = session.get(url) assert response.content == b"error" assert m.call_count == RunConfiguration.request_max_attempts + def test_retry_on_custom_condition_success_after_2(mock_sleep: mock.MagicMock) -> None: def retry_on(response: requests.Response, exception: BaseException) -> bool: - return response.text == 'error' + return response.text == "error" session = Client(retry_condition=retry_on).session - url = 'https://example.com/data' - responses = [dict(text='error'), dict(text='error'), dict(text='success')] + url = "https://example.com/data" + responses = [dict(text="error"), dict(text="error"), dict(text="success")] with requests_mock.mock(session=session) as m: m.get(url, responses) resp = session.get(url) - assert resp.text == 'success' + assert resp.text == "success" assert m.call_count == 3 + def test_wait_retry_after_int(mock_sleep: mock.MagicMock) -> None: session = Client(request_backoff_factor=0).session - url = 'https://example.com/data' + url = "https://example.com/data" responses = [ - dict(text='error', headers={'retry-after': '4'}, status_code=429), - dict(text='success') + dict(text="error", headers={"retry-after": "4"}, status_code=429), + dict(text="success"), ] with requests_mock.mock(session=session) as m: @@ -155,46 +170,46 @@ def test_wait_retry_after_int(mock_sleep: 
mock.MagicMock) -> None: assert 4 <= mock_sleep.call_args[0][0] <= 5 # Adds jitter up to 1s -@pytest.mark.parametrize('existing_session', (False, True)) +@pytest.mark.parametrize("existing_session", (False, True)) def test_init_default_client(existing_session: bool) -> None: """Test that the default client config is updated from runtime configuration. Run twice. 1. Clean start with no existing session attached. 2. With session in thread local (session is updated) """ cfg = { - 'RUNTIME__REQUEST_TIMEOUT': random.randrange(1, 100), - 'RUNTIME__REQUEST_MAX_ATTEMPTS': random.randrange(1, 100), - 'RUNTIME__REQUEST_BACKOFF_FACTOR': random.randrange(1, 100), - 'RUNTIME__REQUEST_MAX_RETRY_DELAY': random.randrange(1, 100), + "RUNTIME__REQUEST_TIMEOUT": random.randrange(1, 100), + "RUNTIME__REQUEST_MAX_ATTEMPTS": random.randrange(1, 100), + "RUNTIME__REQUEST_BACKOFF_FACTOR": random.randrange(1, 100), + "RUNTIME__REQUEST_MAX_RETRY_DELAY": random.randrange(1, 100), } os.environ.update({key: str(value) for key, value in cfg.items()}) - dlt.pipeline(pipeline_name='dummy_pipeline') + dlt.pipeline(pipeline_name="dummy_pipeline") session = default_client.session - assert session.timeout == cfg['RUNTIME__REQUEST_TIMEOUT'] + assert session.timeout == cfg["RUNTIME__REQUEST_TIMEOUT"] retry = session.request.retry # type: ignore[attr-defined] - assert retry.wait.multiplier == cfg['RUNTIME__REQUEST_BACKOFF_FACTOR'] - assert retry.stop.max_attempt_number == cfg['RUNTIME__REQUEST_MAX_ATTEMPTS'] - assert retry.wait.max == cfg['RUNTIME__REQUEST_MAX_RETRY_DELAY'] + assert retry.wait.multiplier == cfg["RUNTIME__REQUEST_BACKOFF_FACTOR"] + assert retry.stop.max_attempt_number == cfg["RUNTIME__REQUEST_MAX_ATTEMPTS"] + assert retry.wait.max == cfg["RUNTIME__REQUEST_MAX_RETRY_DELAY"] -@pytest.mark.parametrize('existing_session', (False, True)) +@pytest.mark.parametrize("existing_session", (False, True)) def test_client_instance_with_config(existing_session: bool) -> None: cfg = { - 'RUNTIME__REQUEST_TIMEOUT': random.randrange(1, 100), - 'RUNTIME__REQUEST_MAX_ATTEMPTS': random.randrange(1, 100), - 'RUNTIME__REQUEST_BACKOFF_FACTOR': random.randrange(1, 100), - 'RUNTIME__REQUEST_MAX_RETRY_DELAY': random.randrange(1, 100), + "RUNTIME__REQUEST_TIMEOUT": random.randrange(1, 100), + "RUNTIME__REQUEST_MAX_ATTEMPTS": random.randrange(1, 100), + "RUNTIME__REQUEST_BACKOFF_FACTOR": random.randrange(1, 100), + "RUNTIME__REQUEST_MAX_RETRY_DELAY": random.randrange(1, 100), } os.environ.update({key: str(value) for key, value in cfg.items()}) client = Client() session = client.session - assert session.timeout == cfg['RUNTIME__REQUEST_TIMEOUT'] + assert session.timeout == cfg["RUNTIME__REQUEST_TIMEOUT"] retry = session.request.retry # type: ignore[attr-defined] - assert retry.wait.multiplier == cfg['RUNTIME__REQUEST_BACKOFF_FACTOR'] - assert retry.stop.max_attempt_number == cfg['RUNTIME__REQUEST_MAX_ATTEMPTS'] - assert retry.wait.max == cfg['RUNTIME__REQUEST_MAX_RETRY_DELAY'] + assert retry.wait.multiplier == cfg["RUNTIME__REQUEST_BACKOFF_FACTOR"] + assert retry.stop.max_attempt_number == cfg["RUNTIME__REQUEST_MAX_ATTEMPTS"] + assert retry.wait.max == cfg["RUNTIME__REQUEST_MAX_RETRY_DELAY"] diff --git a/tests/tools/clean_redshift.py b/tests/tools/clean_redshift.py index 27680b26cd..f81407f74a 100644 --- a/tests/tools/clean_redshift.py +++ b/tests/tools/clean_redshift.py @@ -4,7 +4,7 @@ CONNECTION_STRING = "" -if __name__ == '__main__': +if __name__ == "__main__": # connect connection = psycopg2.connect(CONNECTION_STRING) 
connection.set_isolation_level(0) diff --git a/tests/tools/create_storages.py b/tests/tools/create_storages.py index 4f0abe3512..5b8788f99f 100644 --- a/tests/tools/create_storages.py +++ b/tests/tools/create_storages.py @@ -1,4 +1,11 @@ -from dlt.common.storages import NormalizeStorage, LoadStorage, SchemaStorage, NormalizeStorageConfiguration, LoadStorageConfiguration, SchemaStorageConfiguration +from dlt.common.storages import ( + NormalizeStorage, + LoadStorage, + SchemaStorage, + NormalizeStorageConfiguration, + LoadStorageConfiguration, + SchemaStorageConfiguration, +) # NormalizeStorage(True, NormalizeVolumeConfiguration) diff --git a/tests/utils.py b/tests/utils.py index 8ec15a20ad..74e21eef9f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -29,12 +29,27 @@ # destination constants -IMPLEMENTED_DESTINATIONS = {"athena", "duckdb", "bigquery", "redshift", "postgres", "snowflake", "filesystem", "weaviate", "dummy", "motherduck", "mssql", "qdrant"} +IMPLEMENTED_DESTINATIONS = { + "athena", + "duckdb", + "bigquery", + "redshift", + "postgres", + "snowflake", + "filesystem", + "weaviate", + "dummy", + "motherduck", + "mssql", + "qdrant", +} NON_SQL_DESTINATIONS = {"filesystem", "weaviate", "dummy", "motherduck", "qdrant"} SQL_DESTINATIONS = IMPLEMENTED_DESTINATIONS - NON_SQL_DESTINATIONS # exclude destination configs (for now used for athena and athena iceberg separation) -EXCLUDED_DESTINATION_CONFIGURATIONS = set(dlt.config.get("EXCLUDED_DESTINATION_CONFIGURATIONS", list) or set()) +EXCLUDED_DESTINATION_CONFIGURATIONS = set( + dlt.config.get("EXCLUDED_DESTINATION_CONFIGURATIONS", list) or set() +) # filter out active destinations for current tests @@ -72,6 +87,7 @@ def TEST_DICT_CONFIG_PROVIDER(): providers_context.add_provider(provider) return provider + class MockHttpResponse(Response): def __init__(self, status_code: int) -> None: self.status_code = status_code @@ -156,7 +172,9 @@ def wipe_pipeline() -> Iterator[None]: container[PipelineContext].deactivate() -def data_to_item_format(item_format: TDataItemFormat, data: Union[Iterator[TDataItem], Iterable[TDataItem]]) -> Any: +def data_to_item_format( + item_format: TDataItemFormat, data: Union[Iterator[TDataItem], Iterable[TDataItem]] +) -> Any: """Return the given data in the form of pandas, arrow table/batch or json items""" if item_format == "json": return data @@ -189,15 +207,19 @@ def start_test_telemetry(c: RunConfiguration = None): start_telemetry(c) -def clean_test_storage(init_normalize: bool = False, init_loader: bool = False, mode: str = "t") -> FileStorage: +def clean_test_storage( + init_normalize: bool = False, init_loader: bool = False, mode: str = "t" +) -> FileStorage: storage = FileStorage(TEST_STORAGE_ROOT, mode, makedirs=True) storage.delete_folder("", recursively=True, delete_ro=True) storage.create_folder(".") if init_normalize: from dlt.common.storages import NormalizeStorage + NormalizeStorage(True) if init_loader: from dlt.common.storages import LoadStorage + LoadStorage(True, "jsonl", LoadStorage.ALL_SUPPORTED_FILE_FORMATS) return storage @@ -232,9 +254,7 @@ def is_running_in_github_fork() -> bool: platform.python_implementation() == "PyPy", reason="won't run in PyPy interpreter" ) -skipifnotwindows = pytest.mark.skipif( - platform.system() != "Windows", reason="runs only on windows" -) +skipifnotwindows = pytest.mark.skipif(platform.system() != "Windows", reason="runs only on windows") skipifwindows = pytest.mark.skipif( platform.system() == "Windows", reason="does not runs on windows" @@ -242,4 +262,4 @@ 
def is_running_in_github_fork() -> bool: skipifgithubfork = pytest.mark.skipif( is_running_in_github_fork(), reason="Skipping test because it runs on a PR coming from fork" -) \ No newline at end of file +) From a2bb35d965b67935ea3bc54f7913949e5100d6f7 Mon Sep 17 00:00:00 2001 From: Dave Date: Wed, 22 Nov 2023 17:20:34 +0100 Subject: [PATCH 07/10] fix snippets linting --- .../examples/incremental_loading/code/zendesk-snippets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index 569b554f16..ecc0420854 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -20,9 +20,9 @@ def incremental_snippet() -> None: @dlt.source(max_table_nesting=2) def zendesk_support( credentials: Dict[str, str] = dlt.secrets.value, - start_date: Optional[TAnyDateTime] = pendulum.datetime( + start_date: Optional[TAnyDateTime] = pendulum.datetime( # noqa: B008 year=2000, month=1, day=1 - ), # noqa: B008 + ), end_date: Optional[TAnyDateTime] = None, ): """ From 49978c5bf839dda32ab95082ff3e41ca5ffbbcfc Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 23 Nov 2023 07:58:37 +0100 Subject: [PATCH 08/10] exclude venv from formatting --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 49982de2c6..1059cfdf0a 100644 --- a/Makefile +++ b/Makefile @@ -48,7 +48,7 @@ dev: has-poetry lint: ./check-package.sh - poetry run black ./ --diff --exclude=".*syntax_error.py" + poetry run black ./ --diff --exclude=".*syntax_error.py|\.venv.*" # poetry run isort ./ --diff poetry run mypy --config-file mypy.ini dlt tests poetry run flake8 --max-line-length=200 dlt @@ -56,7 +56,7 @@ lint: # $(MAKE) lint-security format: - poetry run black ./ --exclude=".*syntax_error.py" + poetry run black ./ --exclude=".*syntax_error.py|\.venv.*" # poetry run isort ./ test-and-lint-snippets: From 2a0e00807c1f90f51a4acf9731c42fb044e67f71 Mon Sep 17 00:00:00 2001 From: Dave Date: Thu, 23 Nov 2023 08:32:48 +0100 Subject: [PATCH 09/10] fix one lint error --- dlt/common/pendulum.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/common/pendulum.py b/dlt/common/pendulum.py index fdf003531b..cad5e099eb 100644 --- a/dlt/common/pendulum.py +++ b/dlt/common/pendulum.py @@ -2,7 +2,7 @@ import pendulum # noqa: I251 # force UTC as the local timezone to prevent local dates to be written to dbs -pendulum.set_local_timezone(pendulum.timezone("UTC")) # type: ignore +pendulum.set_local_timezone(pendulum.timezone("UTC")) def __utcnow() -> pendulum.DateTime: From 3d35f9badd319e158dfff5e23f1ea8c4804c3707 Mon Sep 17 00:00:00 2001 From: David Scharf Date: Thu, 23 Nov 2023 10:34:25 +0100 Subject: [PATCH 10/10] Create .git-blame-ignore-revs --- .git-blame-ignore-revs | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000000..5d86975b75 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# introduce formatting with black +c3ddbaa6e61c44a3809e625c802cb4c7632934a3