diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b597d49e6b..155b429b92 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -45,7 +45,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --all-extras --with airflow + run: poetry install --no-interaction --all-extras --with airflow --with docs --with providers --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_airflow.yml b/.github/workflows/test_airflow.yml index d78a48e8f7..bbed326344 100644 --- a/.github/workflows/test_airflow.yml +++ b/.github/workflows/test_airflow.yml @@ -41,7 +41,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-airflow-runner - name: Install dependencies - run: poetry install --no-interaction --with airflow -E duckdb -E parquet + run: poetry install --no-interaction --with airflow --with pipeline -E duckdb -E parquet --with sentry-sdk - run: | poetry run pytest tests/helpers/airflow_tests diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 23b6eb9fdd..24c8215c2b 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -55,40 +55,67 @@ jobs: virtualenvs-in-project: true installer-parallel: true - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v3 - with: - # path: ${{ steps.pip-cache.outputs.dir }} - path: .venv - key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} + # NOTE: do not cache. we want to have a clean state each run and we upgrade depdendencies later + # - name: Load cached venv + # id: cached-poetry-dependencies + # uses: actions/cache@v3 + # with: + # # path: ${{ steps.pip-cache.outputs.dir }} + # path: .venv + # key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + run: poetry install --no-interaction --with sentry-sdk + + - run: | + poetry run pytest tests/common tests/normalize tests/reflection tests/sources tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py + if: runner.os != 'Windows' + name: Run common tests with minimum dependencies Linux/MAC + - run: | + poetry run pytest tests/common tests/normalize tests/reflection tests/sources tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py -m "not forked" + if: runner.os == 'Windows' + name: Run common tests with minimum dependencies Windows + shell: cmd - - name: Install dependencies + sentry - run: poetry install --no-interaction -E parquet -E pydantic && pip install sentry-sdk + - name: Install duckdb dependencies + run: poetry install --no-interaction -E duckdb --with sentry-sdk - run: | - poetry run pytest tests/common tests/normalize tests/reflection tests/sources + poetry run pytest tests/pipeline/test_pipeline.py if: runner.os != 'Windows' - name: Run tests Linux/MAC + name: Run pipeline smoke tests with minimum deps Linux/MAC - run: | - poetry run pytest tests/common tests/normalize tests/reflection tests/sources -m "not forked" + poetry run pytest tests/pipeline/test_pipeline.py if: runner.os == 'Windows' - name: Run tests Windows + name: Run smoke tests with minimum deps Windows shell: cmd - - name: Install extra dependencies - run: poetry install --no-interaction -E duckdb -E cli -E parquet -E pydantic + - name: Install pipeline dependencies + run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk --with pipeline - run: | - poetry run pytest tests/extract tests/pipeline tests/cli/common + poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common if: runner.os != 'Windows' - name: Run extra tests Linux/MAC + name: Run extract and pipeline tests Linux/MAC - run: | - poetry run pytest tests/extract tests/pipeline tests/cli/common + poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common if: runner.os == 'Windows' - name: Run extra tests Windows + name: Run extract tests Windows shell: cmd + # - name: Install Pydantic 1.0 + # run: pip install "pydantic<2" + + # - run: | + # poetry run pytest tests/libs + # if: runner.os != 'Windows' + # name: Run extract and pipeline tests Linux/MAC + # - run: | + # poetry run pytest tests/libs + # if: runner.os == 'Windows' + # name: Run extract tests Windows + # shell: cmd + matrix_job_required_check: name: Common tests needs: run_common diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index db3b53e9fa..1803a53fc1 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -68,7 +68,7 @@ jobs: - name: Install dependencies # install dlt with postgres support - run: poetry install --no-interaction -E postgres -E dbt + run: poetry install --no-interaction -E postgres -E dbt --with sentry-sdk - run: | poetry run pytest tests/helpers/dbt_tests -k '(not venv)' diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 704e66522b..b849188ddd 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -70,7 +70,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 6892a96bf1..97544f24d1 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -70,7 +70,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena + run: poetry install --no-interaction -E --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index dcc7e7ba9b..e12d7bd0f0 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -79,7 +79,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index bba44e750d..6eb4427bbf 100644 --- a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py diff --git a/.github/workflows/test_destination_qdrant.yml b/.github/workflows/test_destination_qdrant.yml index 09ded40f59..0ce3e3a3f9 100644 --- a/.github/workflows/test_destination_qdrant.yml +++ b/.github/workflows/test_destination_qdrant.yml @@ -59,7 +59,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E qdrant -E parquet + run: poetry install --no-interaction -E qdrant -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load/ if: runner.os != 'Windows' diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index 4aae3ec62e..fe81c6121f 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -71,7 +71,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index e86e29ebf6..d0f364c382 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -5,9 +5,9 @@ on: branches: - master - devel - + workflow_dispatch: - + env: DESTINATION__SYNAPSE__CREDENTIALS: ${{ secrets.SYNAPSE_CREDENTIALS }} DESTINATION__SYNAPSE__CREDENTIALS__PASSWORD: ${{ secrets.SYNAPSE_PASSWORD }} @@ -42,7 +42,7 @@ jobs: runs-on: ${{ matrix.os }} steps: - + - name: Check out uses: actions/checkout@master @@ -70,7 +70,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E s3 -E gs -E az + run: poetry install --no-interaction -E synapse -E s3 -E gs -E az --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py diff --git a/.github/workflows/test_destination_weaviate.yml b/.github/workflows/test_destination_weaviate.yml index 6a7a2e95cd..c771a28204 100644 --- a/.github/workflows/test_destination_weaviate.yml +++ b/.github/workflows/test_destination_weaviate.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E weaviate -E parquet + run: poetry install --no-interaction -E weaviate -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load/ if: runner.os != 'Windows' diff --git a/.github/workflows/test_destinations.yml b/.github/workflows/test_destinations.yml index f3f6c492db..f37feb872f 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -87,7 +87,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli + run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index ad7d544219..004bafba05 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -59,7 +59,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet --with docs --without airflow + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet --with docs --without airflow --with sentry-sdk --with pipeline - name: Run linter and tests run: make test-and-lint-snippets diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 6c538d1968..42c3c2d13a 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -84,7 +84,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline - run: poetry run pytest tests/load && poetry run pytest tests/cli name: Run tests Linux diff --git a/Makefile b/Makefile index 85f67818ac..ba447be3b3 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with airflow --with docs --with providers + poetry install --all-extras --with airflow --with docs --with providers --with pipeline --with sentry-sdk lint: ./check-package.sh diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 5c93e22bc6..783a3501d2 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -68,7 +68,7 @@ def __init__( except TypeError: raise InvalidFileNameTemplateException(file_name_template) - def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> None: + def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> int: self._ensure_open() # rotate file if columns changed and writer does not allow for that # as the only allowed change is to add new column (no updates/deletes), we detect the change by comparing lengths @@ -78,21 +78,24 @@ def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> Non # until the first chunk is written we can change the columns schema freely if columns is not None: self._current_columns = dict(columns) + + new_rows_count: int if isinstance(item, List): # items coming in single list will be written together, not matter how many are there self._buffered_items.extend(item) # update row count, if item supports "num_rows" it will be used to count items if len(item) > 0 and hasattr(item[0], "num_rows"): - self._buffered_items_count += sum(tbl.num_rows for tbl in item) + new_rows_count = sum(tbl.num_rows for tbl in item) else: - self._buffered_items_count += len(item) + new_rows_count = len(item) else: self._buffered_items.append(item) # update row count, if item supports "num_rows" it will be used to count items if hasattr(item, "num_rows"): - self._buffered_items_count += item.num_rows + new_rows_count = item.num_rows else: - self._buffered_items_count += 1 + new_rows_count = 1 + self._buffered_items_count += new_rows_count # flush if max buffer exceeded if self._buffered_items_count >= self.buffer_max_items: self._flush_items() @@ -104,6 +107,7 @@ def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> Non # rotate on max items elif self.file_max_items and self._writer.items_count >= self.file_max_items: self._rotate_file() + return new_rows_count def write_empty_file(self, columns: TTableSchemaColumns) -> None: if columns is not None: diff --git a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 401f6aafd2..412e732e97 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -220,7 +220,7 @@ def __init__(self, self.parquet_row_group_size = row_group_size def _create_writer(self, schema: "pa.Schema") -> "pa.parquet.ParquetWriter": - from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype + from dlt.common.libs.pyarrow import pyarrow return pyarrow.parquet.ParquetWriter(self._f, schema, flavor=self.parquet_flavor, version=self.parquet_version, data_page_size=self.parquet_data_page_size) def write_header(self, columns_schema: TTableSchemaColumns) -> None: diff --git a/dlt/common/json/__init__.py b/dlt/common/json/__init__.py index edb48643ef..c4acf66c72 100644 --- a/dlt/common/json/__init__.py +++ b/dlt/common/json/__init__.py @@ -181,6 +181,11 @@ def custom_pua_remove(obj: Any) -> Any: return obj +def may_have_pua(line: bytes) -> bool: + """Checks if bytes string contains pua marker""" + return b'\xef\x80' in line + + # pick the right impl json: SupportsJson = None if os.environ.get("DLT_USE_JSON") == "simplejson": diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index fb2f5c2e72..585bee0d2f 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -1,12 +1,14 @@ from typing import Any, Tuple, Optional, Union, Callable, Iterable, Iterator, Sequence, Tuple +from copy import copy + from dlt import version from dlt.common.exceptions import MissingDependencyException -from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.schema.typing import DLT_NAME_PREFIX, TTableSchemaColumns from dlt.common.destination.capabilities import DestinationCapabilitiesContext -from dlt.common.schema.typing import TColumnType, TColumnSchemaBase -from dlt.common.data_types import TDataType -from dlt.common.typing import TFileOrPath +from dlt.common.schema.typing import TColumnType +from dlt.common.typing import StrStr, TFileOrPath +from dlt.common.normalizers.naming import NamingConvention try: import pyarrow @@ -140,23 +142,120 @@ def _get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: def remove_null_columns(item: TAnyArrowItem) -> TAnyArrowItem: - """Remove all columns of datatype pyarrow.null() from the table or record batch - """ + """Remove all columns of datatype pyarrow.null() from the table or record batch""" + return remove_columns(item, [field.name for field in item.schema if pyarrow.types.is_null(field.type)]) + + +def remove_columns(item: TAnyArrowItem, columns: Sequence[str]) -> TAnyArrowItem: + """Remove `columns` from Arrow `item`""" + if not columns: + return item + if isinstance(item, pyarrow.Table): - return item.drop([field.name for field in item.schema if pyarrow.types.is_null(field.type)]) + return item.drop(columns) elif isinstance(item, pyarrow.RecordBatch): - null_idx = [i for i, col in enumerate(item.columns) if pyarrow.types.is_null(col.type)] - new_schema = item.schema - for i in reversed(null_idx): - new_schema = new_schema.remove(i) - return pyarrow.RecordBatch.from_arrays( - [col for i, col in enumerate(item.columns) if i not in null_idx], - schema=new_schema - ) + # NOTE: select is available in pyarrow 12 an up + return item.select([n for n in item.schema.names if n not in columns]) # reverse selection else: raise ValueError(item) +def append_column(item: TAnyArrowItem, name: str, data: Any) -> TAnyArrowItem: + """Appends new column to Table or RecordBatch""" + if isinstance(item, pyarrow.Table): + return item.append_column(name, data) + elif isinstance(item, pyarrow.RecordBatch): + new_field = pyarrow.field(name, data.type) + return pyarrow.RecordBatch.from_arrays(item.columns + [data], schema=item.schema.append(new_field)) + else: + raise ValueError(item) + + +def rename_columns(item: TAnyArrowItem, new_column_names: Sequence[str]) -> TAnyArrowItem: + """Rename arrow columns on Table or RecordBatch, returns same data but with renamed schema""" + + if list(item.schema.names) == list(new_column_names): + # No need to rename + return item + + if isinstance(item, pyarrow.Table): + return item.rename_columns(new_column_names) + elif isinstance(item, pyarrow.RecordBatch): + new_fields = [field.with_name(new_name) for new_name, field in zip(new_column_names, item.schema)] + return pyarrow.RecordBatch.from_arrays(item.columns, schema=pyarrow.schema(new_fields)) + else: + raise TypeError(f"Unsupported data item type {type(item)}") + + +def normalize_py_arrow_schema( + item: TAnyArrowItem, + columns: TTableSchemaColumns, + naming: NamingConvention, + caps: DestinationCapabilitiesContext +) -> TAnyArrowItem: + """Normalize arrow `item` schema according to the `columns`. + + 1. arrow schema field names will be normalized according to `naming` + 2. arrows columns will be reordered according to `columns` + 3. empty columns will be inserted if they are missing, types will be generated using `caps` + """ + rename_mapping = get_normalized_arrow_fields_mapping(item, naming) + rev_mapping = {v: k for k, v in rename_mapping.items()} + dlt_table_prefix = naming.normalize_table_identifier(DLT_NAME_PREFIX) + + # remove all columns that are dlt columns but are not present in arrow schema. we do not want to add such columns + # that should happen in the normalizer + columns = {name:column for name, column in columns.items() if not name.startswith(dlt_table_prefix) or name in rev_mapping} + + # check if nothing to rename + if list(rename_mapping.keys()) == list(rename_mapping.values()): + # check if nothing to reorder + if list(rename_mapping.keys())[:len(columns)]== list(columns.keys()): + return item + + schema = item.schema + new_fields = [] + new_columns = [] + + for column_name, column in columns.items(): + # get original field name + field_name = rev_mapping.pop(column_name, column_name) + if field_name in rename_mapping: + idx = schema.get_field_index(field_name) + # use renamed field + new_fields.append(schema.field(idx).with_name(column_name)) + new_columns.append(item.column(idx)) + else: + # column does not exist in pyarrow. create empty field and column + new_field = pyarrow.field( + column_name, + get_py_arrow_datatype(column, caps, "UTC"), + nullable=column.get("nullable", True) + ) + new_fields.append(new_field) + new_columns.append(pyarrow.nulls(item.num_rows, type=new_field.type)) + + # add the remaining columns + for column_name, field_name in rev_mapping.items(): + idx = schema.get_field_index(field_name) + # use renamed field + new_fields.append(schema.field(idx).with_name(column_name)) + new_columns.append(item.column(idx)) + + # create desired type + return item.__class__.from_arrays(new_columns, schema=pyarrow.schema(new_fields)) + + +def get_normalized_arrow_fields_mapping(item: TAnyArrowItem, naming: NamingConvention) -> StrStr: + """Normalizes schema field names and returns mapping from original to normalized name. Raises on name clashes""" + norm_f = naming.normalize_identifier + name_mapping = {n.name: norm_f(n.name) for n in item.schema} + # verify if names uniquely normalize + normalized_names = set(name_mapping.values()) + if len(name_mapping) != len(normalized_names): + raise NameNormalizationClash(f"Arrow schema fields normalized from {list(name_mapping.keys())} to {list(normalized_names)}") + return name_mapping + def py_arrow_to_table_schema_columns(schema: pyarrow.Schema) -> TTableSchemaColumns: """Convert a PyArrow schema to a table schema columns dict. @@ -193,9 +292,8 @@ def get_row_count(parquet_file: TFileOrPath) -> int: def is_arrow_item(item: Any) -> bool: return isinstance(item, (pyarrow.Table, pyarrow.RecordBatch)) - -TNewColumns = Sequence[Tuple[pyarrow.Field, Callable[[pyarrow.Table], Iterable[Any]]]] - +TNewColumns = Sequence[Tuple[int, pyarrow.Field, Callable[[pyarrow.Table], Iterable[Any]]]] +"""Sequence of tuples: (field index, field, generating function)""" def pq_stream_with_new_columns( parquet_file: TFileOrPath, columns: TNewColumns, row_groups_per_read: int = 1 @@ -206,7 +304,7 @@ def pq_stream_with_new_columns( Args: parquet_file: path or file object to parquet file - columns: list of columns to add in the form of (`pyarrow.Field`, column_value_callback) + columns: list of columns to add in the form of (insertion index, `pyarrow.Field`, column_value_callback) The callback should accept a `pyarrow.Table` and return an array of values for the column. row_groups_per_read: number of row groups to read at a time. Defaults to 1. @@ -218,6 +316,15 @@ def pq_stream_with_new_columns( # Iterate through n row groups at a time for i in range(0, n_groups, row_groups_per_read): tbl: pyarrow.Table = reader.read_row_groups(range(i, min(i + row_groups_per_read, n_groups))) - for col in columns: - tbl = tbl.append_column(col[0], col[1](tbl)) + for idx, field, gen_ in columns: + if idx == -1: + tbl = tbl.append_column(field, gen_(tbl)) + else: + tbl = tbl.add_column(idx, field, gen_(tbl)) yield tbl + + +class NameNormalizationClash(ValueError): + def __init__(self, reason: str) -> None: + msg = f"Arrow column name clash after input data normalization. {reason}" + super().__init__(msg) diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index c66d67f1f7..1b65fa3a7e 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -1,30 +1,68 @@ -from typing import Type, Union, get_type_hints, get_args, Any +from __future__ import annotations +import inspect +from copy import copy +from typing import Dict, Generic, Set, TypedDict, List, Type, Union, TypeVar, get_origin, get_args, Any from dlt.common.exceptions import MissingDependencyException -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.common.data_types import py_type_to_sc_type, TDataType -from dlt.common.typing import is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union +from dlt.common.schema import DataValidationError +from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns +from dlt.common.data_types import py_type_to_sc_type +from dlt.common.typing import TDataItem, TDataItems, extract_union_types, is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union try: - from pydantic import BaseModel, Field, Json + from pydantic import BaseModel, ValidationError, Json, create_model except ImportError: - raise MissingDependencyException("DLT pydantic Helpers", ["pydantic"], "DLT Helpers for for pydantic.") + raise MissingDependencyException("dlt Pydantic helpers", ["pydantic"], "Both Pydantic 1.x and 2.x are supported") + +_PYDANTIC_2 = False +try: + from pydantic import PydanticDeprecatedSince20 + _PYDANTIC_2 = True + # hide deprecation warning + import warnings + warnings.simplefilter("ignore", category=PydanticDeprecatedSince20) +except ImportError: + pass + +_TPydanticModel = TypeVar("_TPydanticModel", bound=BaseModel) + + +class ListModel(BaseModel, Generic[_TPydanticModel]): + items: List[_TPydanticModel] -def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], skip_complex_types: bool = False) -> TTableSchemaColumns: +class DltConfig(TypedDict, total=False): + """dlt configuration that can be attached to Pydantic model + + Example below removes `nested` field from the resulting dlt schema. + >>> class ItemModel(BaseModel): + >>> b: bool + >>> nested: Dict[str, Any] + >>> dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + """ + skip_complex_types: bool + """If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from dlt schema generated from the model""" + + +def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]]) -> TTableSchemaColumns: """Convert a pydantic model to a table schema columns dict + See also DltConfig for more control over how the schema is created + Args: model: The pydantic model to convert. Can be a class or an instance. - skip_complex_types: If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from the result. + Returns: TTableSchemaColumns: table schema columns dict """ + skip_complex_types = False + if hasattr(model, "dlt_config"): + skip_complex_types = model.dlt_config.get("skip_complex_types", False) + result: TTableSchemaColumns = {} - fields = model.__fields__ - for field_name, field in fields.items(): + for field_name, field in model.__fields__.items(): # type: ignore[union-attr] annotation = field.annotation if inner_annotation := getattr(annotation, 'inner_type', None): # This applies to pydantic.Json fields, the inner type is the type after json parsing @@ -49,7 +87,12 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], s inner_type = dict name = field.alias or field_name - data_type = py_type_to_sc_type(inner_type) + try: + data_type = py_type_to_sc_type(inner_type) + except TypeError: + # try to coerce unknown type to text + data_type = "text" + if data_type == 'complex' and skip_complex_types: continue @@ -60,3 +103,195 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], s } return result + + +def column_mode_to_extra(column_mode: TSchemaEvolutionMode) -> str: + extra = "forbid" + if column_mode == "evolve": + extra = "allow" + elif column_mode == "discard_value": + extra = "ignore" + return extra + + +def extra_to_column_mode(extra: str) -> TSchemaEvolutionMode: + if extra == "forbid": + return "freeze" + if extra == "allow": + return "evolve" + return "discard_value" + + +def get_extra_from_model(model: Type[BaseModel]) -> str: + default_extra = "ignore" + if _PYDANTIC_2: + default_extra = model.model_config.get("extra", default_extra) + else: + default_extra = model.Config.extra or default_extra # type: ignore[attr-defined] + return default_extra + + +def apply_schema_contract_to_model( + model: Type[_TPydanticModel], + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode = "freeze" +) -> Type[_TPydanticModel]: + """Configures or re-creates `model` so it behaves according to `column_mode` and `data_mode` settings. + + `column_mode` sets the model behavior when unknown field is found. + `data_mode` sets model behavior when known field does not validate. currently `evolve` and `freeze` are supported here. + + `discard_row` is implemented in `validate_item`. + """ + if data_mode == "evolve": + # create a lenient model that accepts any data + model = create_model(model.__name__ + "Any", **{n:(Any, None) for n in model.__fields__}) # type: ignore[call-overload, attr-defined] + elif data_mode == "discard_value": + raise NotImplementedError("data_mode is discard_value. Cannot discard defined fields with validation errors using Pydantic models.") + + extra = column_mode_to_extra(column_mode) + + if extra == get_extra_from_model(model): + # no need to change the model + return model + + if _PYDANTIC_2: + config = copy(model.model_config) + config["extra"] = extra # type: ignore[typeddict-item] + else: + config = copy(model.Config) # type: ignore[attr-defined] + config.extra = extra # type: ignore[attr-defined] + + _child_models: Dict[int, Type[BaseModel]] = {} + + def _process_annotation(t_: Type[Any]) -> Type[Any]: + """Recursively recreates models with applied schema contract """ + if is_list_generic_type(t_): + l_t: Type[Any] = get_args(t_)[0] + try: + return get_origin(t_)[_process_annotation(l_t)] # type: ignore[no-any-return] + except TypeError: + # this is Python3.8 fallback. it does not support indexers on types + return List[_process_annotation(l_t)] # type: ignore + elif is_dict_generic_type(t_): + k_t: Type[Any] + v_t: Type[Any] + k_t, v_t = get_args(t_) + try: + return get_origin(t_)[k_t, _process_annotation(v_t)] # type: ignore[no-any-return] + except TypeError: + # this is Python3.8 fallback. it does not support indexers on types + return Dict[k_t, _process_annotation(v_t)] # type: ignore + elif is_union(t_): + u_t_s = tuple(_process_annotation(u_t) for u_t in extract_union_types(t_)) + return Union[u_t_s] # type: ignore[return-value] + elif inspect.isclass(t_) and issubclass(t_, BaseModel): + # types must be same before and after processing + if id(t_) in _child_models: + return _child_models[id(t_)] + else: + _child_models[id(t_)] = child_model = apply_schema_contract_to_model(t_, column_mode, data_mode) + return child_model + return t_ + + new_model: Type[_TPydanticModel] = create_model( # type: ignore[call-overload] + model.__name__ + "Extra" + extra.title(), + __config__ = config, + **{n:(_process_annotation(f.annotation), f) for n, f in model.__fields__.items()} # type: ignore[attr-defined] + ) + # pass dlt config along + dlt_config = getattr(model, "dlt_config", None) + if dlt_config: + new_model.dlt_config = dlt_config # type: ignore[attr-defined] + return new_model + + +def create_list_model(model: Type[_TPydanticModel], data_mode: TSchemaEvolutionMode = "freeze") -> Type[ListModel[_TPydanticModel]]: + """Creates a model from `model` for validating list of items in batch according to `data_mode` + + Currently only freeze is supported. See comments in the code + """ + # TODO: use LenientList to create list model that automatically discards invalid items + # https://github.com/pydantic/pydantic/issues/2274 and https://gist.github.com/dmontagu/7f0cef76e5e0e04198dd608ad7219573 + return create_model( + "List" + __name__, + items=(List[model], ...) # type: ignore[return-value,valid-type] + ) + + +def validate_items( + table_name: str, + list_model: Type[ListModel[_TPydanticModel]], + items: List[TDataItem], + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode +) -> List[_TPydanticModel]: + """Validates list of `item` with `list_model` and returns parsed Pydantic models + + `list_model` should be created with `create_list_model` and have `items` field which this function returns. + """ + try: + return list_model(items=items).items + except ValidationError as e: + deleted: Set[int] = set() + for err in e.errors(): + # TODO: we can get rid of most of the code if we use LenientList as explained above + if len(err["loc"]) >= 2: + err_idx = int(err["loc"][1]) + if err_idx in deleted: + # already dropped + continue + err_item = items[err_idx - len(deleted)] + else: + # top level error which means misalignment of list model and items + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, items) from e + # raise on freeze + if err["type"] == 'extra_forbidden': + if column_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, err_item) from e + elif column_mode == "discard_row": + # pop at the right index + items.pop(err_idx - len(deleted)) + # store original index so we do not pop again + deleted.add(err_idx) + else: + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + else: + if data_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", list_model, {"data_type": "freeze"}, err_item) from e + elif data_mode == "discard_row": + items.pop(err_idx - len(deleted)) + deleted.add(err_idx) + else: + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + + # validate again with error items removed + return validate_items(table_name, list_model, items, column_mode, data_mode) + + +def validate_item( + table_name: str, + model: Type[_TPydanticModel], + item: TDataItems, + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode +) -> _TPydanticModel: + """Validates `item` against model `model` and returns an instance of it""" + try: + return model.parse_obj(item) + except ValidationError as e: + for err in e.errors(): + # raise on freeze + if err["type"] == 'extra_forbidden': + if column_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", model, {"columns": "freeze"}, item) from e + elif column_mode == "discard_row": + return None + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + else: + if data_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", model, {"data_type": "freeze"}, item) from e + elif data_mode == "discard_row": + return None + raise NotImplementedError(f"{data_mode} data mode not implemented for Pydantic validation") + raise AssertionError("unreachable") \ No newline at end of file diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index e1c5c3b846..ab133b36c9 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -1,5 +1,5 @@ import abc -from typing import Any, Generic, Type, Iterator, Tuple, Protocol, TYPE_CHECKING, TypeVar +from typing import Any, Generic, Type, Generator, Tuple, Protocol, TYPE_CHECKING, TypeVar from dlt.common.typing import DictStrAny, TDataItem, StrAny if TYPE_CHECKING: @@ -10,7 +10,7 @@ # type definitions for json normalization function # iterator of form ((table_name, parent_table), dict) must be returned from normalization function -TNormalizedRowIterator = Iterator[Tuple[Tuple[str, str], StrAny]] +TNormalizedRowIterator = Generator[Tuple[Tuple[str, str], StrAny], bool, None] # type var for data item normalizer config TNormalizerConfig = TypeVar("TNormalizerConfig", bound=Any) diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 98b34e298d..c9ce5a9d25 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -48,6 +48,8 @@ class DataItemNormalizer(DataItemNormalizerBase[RelationalNormalizerConfig]): _skip_primary_key: Dict[str, bool] def __init__(self, schema: Schema) -> None: + """This item normalizer works with nested dictionaries. It flattens dictionaries and descends into lists. + It yields row dictionaries at each nesting level.""" self.schema = schema self._reset() @@ -230,7 +232,9 @@ def _normalize_row( extend.update(self._get_propagated_values(table, flattened_row, _r_lvl )) # yield parent table first - yield (table, schema.naming.shorten_fragments(*parent_path)), flattened_row + should_descend = yield (table, schema.naming.shorten_fragments(*parent_path)), flattened_row + if should_descend is False: + return # normalize and yield lists for list_path, list_content in lists.items(): @@ -264,7 +268,7 @@ def extend_schema(self) -> None: def extend_table(self, table_name: str) -> None: # if the table has a merge w_d, add propagation info to normalizer table = self.schema.tables.get(table_name) - if not table.get("parent") and table["write_disposition"] == "merge": + if not table.get("parent") and table.get("write_disposition") == "merge": DataItemNormalizer.update_normalizer_config(self.schema, {"propagation": { "tables": { table_name: { diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index ddd9003799..973abb2451 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -17,7 +17,7 @@ from dlt.common.destination import Destination, TDestinationReferenceArg, TDestination from dlt.common.exceptions import DestinationHasFailedJobs, PipelineStateNotAvailable, ResourceNameNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.typing import DictStrAny, REPattern @@ -212,7 +212,8 @@ def run( columns: Sequence[TColumnSchema] = None, primary_key: TColumnNames = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None + loader_file_format: TLoaderFileFormat = None, + schema_contract: TSchemaContract = None, ) -> LoadInfo: ... diff --git a/dlt/common/schema/__init__.py b/dlt/common/schema/__init__.py index 1a3b4db223..ac320bef0a 100644 --- a/dlt/common/schema/__init__.py +++ b/dlt/common/schema/__init__.py @@ -1,9 +1,11 @@ -from dlt.common.schema.typing import TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase +from dlt.common.schema.typing import TSchemaContractDict, TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase from dlt.common.schema.typing import COLUMN_HINTS -from dlt.common.schema.schema import Schema +from dlt.common.schema.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE +from dlt.common.schema.exceptions import DataValidationError from dlt.common.schema.utils import verify_schema_hash __all__ = [ "TSchemaUpdate", "TSchemaTables", "TTableSchema", "TStoredSchema", "TTableSchemaColumns", "TColumnHint", - "TColumnSchema", "TColumnSchemaBase", "COLUMN_HINTS", "Schema", "verify_schema_hash" + "TColumnSchema", "TColumnSchemaBase", "COLUMN_HINTS", "Schema", "verify_schema_hash", "TSchemaContractDict", + "DEFAULT_SCHEMA_CONTRACT_MODE", "DataValidationError" ] diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 5f638a111d..96df6b7418 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -2,6 +2,7 @@ from dlt.common.exceptions import DltException from dlt.common.data_types import TDataType +from dlt.common.schema.typing import TSchemaContractDict, TSchemaContractEntities, TSchemaEvolutionMode class SchemaException(DltException): @@ -16,11 +17,6 @@ def __init__(self, name: str) -> None: super().__init__(f"{name} is an invalid schema/source name. The source or schema name must be a valid Python identifier ie. a snake case function name and have maximum {self.MAXIMUM_SCHEMA_NAME_LENGTH} characters. Ideally should contain only small letters, numbers and underscores.") -# class InvalidDatasetName(ValueError, SchemaException): -# def __init__(self, name: str, normalized_name: str) -> None: -# self.name = name -# super().__init__(f"{name} is an invalid dataset name. The dataset name must conform to wide range of destinations and ideally should contain only small letters, numbers and underscores. Try {normalized_name} instead as suggested by current naming module.") - class InvalidDatasetName(ValueError, SchemaException): def __init__(self, destination_name: str) -> None: self.destination_name = destination_name @@ -70,7 +66,47 @@ def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engi self.to_engine = to_engine super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}") + +class DataValidationError(SchemaException): + def __init__( + self, + schema_name: str, + table_name: str, + column_name: str, + contract_entity: TSchemaContractEntities, + contract_mode: TSchemaEvolutionMode, + table_schema: Any, + schema_contract: TSchemaContractDict, + data_item: Any = None, + extended_info: str = None + ) -> None: + """Raised when `data_item` violates `contract_mode` on a `contract_entity` as defined by `table_schema` + + Schema, table and column names are given as a context and full `schema_contract` and causing `data_item` as an evidence. + """ + msg = "" + if schema_name: + msg = f"Schema: {schema_name} " + msg += f"Table: {table_name} " + if column_name: + msg += f"Column: {column_name}" + msg = "In " + msg + f" . Contract on {contract_entity} with mode {contract_mode} is violated. " + (extended_info or "") + super().__init__(msg) + self.schema_name = schema_name + self.table_name = table_name + self.column_name = column_name + + # violated contract + self.contract_entity = contract_entity + self.contract_mode = contract_mode + + # some evidence + self.table_schema = table_schema + self.schema_contract = schema_contract + self.data_item = data_item + + class UnknownTableException(SchemaException): def __init__(self, table_name: str) -> None: self.table_name = table_name - super().__init__(f"Trying to access unknown table {table_name}.") \ No newline at end of file + super().__init__(f"Trying to access unknown table {table_name}.") diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 77a5ae8e8e..67ae345845 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -1,6 +1,6 @@ import yaml from copy import copy, deepcopy -from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast +from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast, Literal from dlt.common import json from dlt.common.utils import extend_list_deduplicated @@ -10,13 +10,20 @@ from dlt.common.normalizers.json import DataItemNormalizer, TNormalizedRowIterator from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType -from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections) +from dlt.common.schema.typing import (COLUMN_HINTS, DLT_NAME_PREFIX, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaContractEntities, TSchemaEvolutionMode, TSchemaSettings, TSimpleRegex, TStoredSchema, + TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractDict, TSchemaContract) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict +from dlt.common.schema.exceptions import DataValidationError +DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractDict = { + "tables": "evolve", + "columns": "evolve", + "data_type": "evolve" +} + class Schema: ENGINE_VERSION: ClassVar[int] = SCHEMA_ENGINE_VERSION @@ -60,7 +67,8 @@ def __init__(self, name: str, normalizers: TNormalizersConfig = None) -> None: self._reset_schema(name, normalizers) @classmethod - def from_dict(cls, d: DictStrAny) -> "Schema": + def from_dict(cls, d: DictStrAny, bump_version: bool = True) -> "Schema": + # upgrade engine if needed stored_schema = utils.migrate_schema(d, d["engine_version"], cls.ENGINE_VERSION) # verify schema @@ -69,7 +77,8 @@ def from_dict(cls, d: DictStrAny) -> "Schema": stored_schema = utils.apply_defaults(stored_schema) # bump version if modified - utils.bump_version_if_modified(stored_schema) + if bump_version: + utils.bump_version_if_modified(stored_schema) return cls.from_stored_schema(stored_schema) @classmethod @@ -81,9 +90,10 @@ def from_stored_schema(cls, stored_schema: TStoredSchema) -> "Schema": def replace_schema_content(self, schema: "Schema") -> None: self._reset_schema(schema.name, schema._normalizers_config) - self._from_stored_schema(schema.to_dict()) + # do not bump version so hash from `schema` is preserved + self._from_stored_schema(schema.to_dict(bump_version=False)) - def to_dict(self, remove_defaults: bool = False) -> TStoredSchema: + def to_dict(self, remove_defaults: bool = False, bump_version: bool = True) -> TStoredSchema: stored_schema: TStoredSchema = { "version": self._stored_version, "version_hash": self._stored_version_hash, @@ -99,7 +109,8 @@ def to_dict(self, remove_defaults: bool = False) -> TStoredSchema: stored_schema["description"] = self._schema_description # bump version if modified - utils.bump_version_if_modified(stored_schema) + if bump_version: + utils.bump_version_if_modified(stored_schema) # remove defaults after bumping version if remove_defaults: utils.remove_defaults(stored_schema) @@ -187,8 +198,120 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial + def apply_schema_contract( + self, + schema_contract: TSchemaContractDict, + partial_table: TPartialTableSchema, + data_item: TDataItem = None, + raise_on_freeze: bool = True + ) -> Tuple[TPartialTableSchema, List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]]]: + """ + Checks if `schema_contract` allows for the `partial_table` to update the schema. It applies the contract dropping + the affected columns or the whole `partial_table`. It generates and returns a set of filters that should be applied to incoming data in order to modify it + so it conforms to the contract. `data_item` is provided only as evidence in case DataValidationError is raised. + + Example `schema_contract`: + { + "tables": "freeze", + "columns": "evolve", + "data_type": "discard_row" + } + + Settings for table affects new tables, settings for column affects new columns and settings for data_type affects new variant columns. Each setting can be set to one of: + * evolve: allow all changes + * freeze: allow no change and fail the load + * discard_row: allow no schema change and filter out the row + * discard_value: allow no schema change and filter out the value but load the rest of the row + + Returns a tuple where a first element is modified partial table and the second is a list of filters. The modified partial may be None in case the + whole table is not allowed. + Each filter is a tuple of (table|columns, entity name, freeze | discard_row | discard_value). + Note: by default `freeze` immediately raises DataValidationError which is convenient in most use cases + + """ + # default settings allow all evolutions, skip all else + if schema_contract == DEFAULT_SCHEMA_CONTRACT_MODE: + return partial_table, [] + + assert partial_table + table_name = partial_table["name"] + existing_table: TTableSchema = self._schema_tables.get(table_name, None) + + # table is new when not yet exist or + is_new_table = not existing_table or self.is_new_table(table_name) + # check case where we have a new table + if is_new_table and schema_contract["tables"] != "evolve": + if raise_on_freeze and schema_contract["tables"] == "freeze": + raise DataValidationError( + self.name, table_name, None, "tables", "freeze", None, schema_contract, data_item, f"Trying to add table {table_name} but new tables are frozen." + ) + # filter tables with name below + return None, [("tables", table_name, schema_contract["tables"])] + + column_mode, data_mode = schema_contract["columns"], schema_contract["data_type"] + # allow to add new columns when table is new or if columns are allowed to evolve once + if is_new_table or existing_table.get("x-normalizer", {}).get("evolve-columns-once", False): # type: ignore[attr-defined] + column_mode = "evolve" + + # check if we should filter any columns, partial table below contains only new columns + filters: List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]] = [] + for column_name, column in list(partial_table["columns"].items()): + # dlt cols may always be added + if column_name.startswith(self._dlt_tables_prefix): + continue + is_variant = column.get("variant", False) + # new column and contract prohibits that + if column_mode != "evolve" and not is_variant: + if raise_on_freeze and column_mode == "freeze": + raise DataValidationError( + self.name, table_name, column_name, "columns", "freeze", existing_table, schema_contract, data_item, f"Trying to add column {column_name} to table {table_name} but columns are frozen." + ) + # filter column with name below + filters.append(("columns", column_name, column_mode)) + # pop the column + partial_table["columns"].pop(column_name) + + # variant (data type evolution) and contract prohibits that + if data_mode != "evolve" and is_variant: + if raise_on_freeze and data_mode == "freeze": + raise DataValidationError( + self.name, table_name, column_name, "data_type", "freeze", existing_table, schema_contract, data_item, f"Trying to create new variant column {column_name} to table {table_name} but data_types are frozen." + ) + # filter column with name below + filters.append(("columns", column_name, data_mode)) + # pop the column + partial_table["columns"].pop(column_name) + + return partial_table, filters + + @staticmethod + def expand_schema_contract_settings(settings: TSchemaContract, default: TSchemaContractDict = None) -> TSchemaContractDict: + """Expand partial or shorthand settings into full settings dictionary using `default` for unset entities""" + if isinstance(settings, str): + settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) + return cast(TSchemaContractDict, {**(default or DEFAULT_SCHEMA_CONTRACT_MODE), **(settings or {})}) + + def resolve_contract_settings_for_table(self, table_name: str, new_table_schema: TTableSchema = None) -> TSchemaContractDict: + """Resolve the exact applicable schema contract settings for the table `table_name`. `new_table_schema` is added to the tree during the resolution.""" + + settings: TSchemaContract = {} + if not table_name.startswith(self._dlt_tables_prefix): + if new_table_schema: + tables = copy(self._schema_tables) + tables[table_name] = new_table_schema + else: + tables = self._schema_tables + # find root table + try: + table = utils.get_top_level_table(tables, table_name) + settings = table["schema_contract"] + except KeyError: + settings = self._settings.get("schema_contract", {}) + + # expand settings, empty settings will expand into default settings + return Schema.expand_schema_contract_settings(settings) + def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: - """Update table in this schema""" table_name = partial_table["name"] parent_table_name = partial_table.get("parent") # check if parent table present @@ -215,9 +338,8 @@ def update_schema(self, schema: "Schema") -> None: # update all tables for table in schema.tables.values(): self.update_table(table) - # update normalizer config nondestructively - self.data_item_normalizer.update_normalizer_config(self, self.data_item_normalizer.get_normalizer_config(schema)) - self.update_normalizers() + # pass normalizer config + self._configure_normalizers(schema._normalizers_config) # update and compile settings self._settings = deepcopy(schema.settings) self._compile_settings() @@ -231,9 +353,8 @@ def bump_version(self) -> Tuple[int, str]: Returns: Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple """ - version = utils.bump_version_if_modified(self.to_dict()) - self._stored_version, self._stored_version_hash = version - return version + self._stored_version, self._stored_version_hash, _ = utils.bump_version_if_modified(self.to_dict(bump_version=False)) + return self._stored_version, self._stored_version_hash def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: rv_row: DictStrAny = {} @@ -326,6 +447,10 @@ def dlt_tables(self) -> List[TTableSchema]: def get_preferred_type(self, col_name: str) -> Optional[TDataType]: return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None) + def is_new_table(self, table_name: str) -> bool: + """Returns true if this table does not exist OR is incomplete (has only incomplete columns) and therefore new""" + return (table_name not in self.tables) or (not [c for c in self.tables[table_name]["columns"].values() if utils.is_complete_column(c)]) + @property def version(self) -> int: """Version of the schema content that takes into account changes from the time of schema loading/creation. @@ -393,6 +518,12 @@ def update_normalizers(self) -> None: normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) + def set_schema_contract(self, settings: TSchemaContract) -> None: + if not settings: + self._settings.pop("schema_contract", None) + else: + self._settings["schema_contract"] = settings + def add_type_detection(self, detection: TTypeDetections) -> None: """Add type auto detection to the schema.""" if detection not in self.settings["detections"]: @@ -517,7 +648,7 @@ def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: # name normalization functions self.naming = naming_module - self._dlt_tables_prefix = self.naming.normalize_table_identifier("_dlt") + self._dlt_tables_prefix = self.naming.normalize_table_identifier(DLT_NAME_PREFIX) self.version_table_name = self.naming.normalize_table_identifier(VERSION_TABLE_NAME) self.loads_table_name = self.naming.normalize_table_identifier(LOADS_TABLE_NAME) self.state_table_name = self.naming.normalize_table_identifier(STATE_TABLE_NAME) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index ac17f0ae9f..720313b57b 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -11,12 +11,13 @@ # current version of schema engine -SCHEMA_ENGINE_VERSION = 6 +SCHEMA_ENGINE_VERSION = 7 # dlt tables VERSION_TABLE_NAME = "_dlt_version" LOADS_TABLE_NAME = "_dlt_loads" STATE_TABLE_NAME = "_dlt_pipeline_state" +DLT_NAME_PREFIX = "_dlt" TColumnProp = Literal["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique", "merge_key", "root_key"] """Known properties and hints of the column""" @@ -71,17 +72,32 @@ class TColumnSchema(TColumnSchemaBase, total=False): TColumnName = NewType("TColumnName", str) SIMPLE_REGEX_PREFIX = "re:" +TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] +TSchemaContractEntities = Literal["tables", "columns", "data_type"] + +class TSchemaContractDict(TypedDict, total=False): + """TypedDict defining the schema update settings""" + tables: Optional[TSchemaEvolutionMode] + columns: Optional[TSchemaEvolutionMode] + data_type: Optional[TSchemaEvolutionMode] + +TSchemaContract = Union[TSchemaEvolutionMode, TSchemaContractDict] class TRowFilters(TypedDict, total=True): excludes: Optional[List[TSimpleRegex]] includes: Optional[List[TSimpleRegex]] +class NormalizerInfo(TypedDict, total=True): + new_table: bool + +# TypedDict that defines properties of a table class TTableSchema(TypedDict, total=False): """TypedDict that defines properties of a table""" name: Optional[str] description: Optional[str] write_disposition: Optional[TWriteDisposition] + schema_contract: Optional[TSchemaContract] table_sealed: Optional[bool] parent: Optional[str] filters: Optional[TRowFilters] @@ -89,16 +105,15 @@ class TTableSchema(TypedDict, total=False): resource: Optional[str] table_format: Optional[TTableFormat] - class TPartialTableSchema(TTableSchema): pass - TSchemaTables = Dict[str, TTableSchema] TSchemaUpdate = Dict[str, List[TPartialTableSchema]] + class TSchemaSettings(TypedDict, total=False): - schema_sealed: Optional[bool] + schema_contract: Optional[TSchemaContract] detections: Optional[List[TTypeDetections]] default_hints: Optional[Dict[TColumnHint, List[TSimpleRegex]]] preferred_types: Optional[Dict[TSimpleRegex, TDataType]] @@ -115,3 +130,4 @@ class TStoredSchema(TypedDict, total=False): settings: Optional[TSchemaSettings] tables: TSchemaTables normalizers: TNormalizersConfig + diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index f2075ce85d..9b4e8fb047 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -16,7 +16,7 @@ from dlt.common.schema import detections from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, TTableFormat, - TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition) + TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaContract, TSchemaContractDict) from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, TablePropertiesConflictException, InvalidSchemaName, UnknownTableException) @@ -134,8 +134,8 @@ def add_column_defaults(column: TColumnSchemaBase) -> TColumnSchema: # return copy(column) # type: ignore -def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str]: - # if any change to schema document is detected then bump version and write new hash +def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, str]: + """Bumps the `stored_schema` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" hash_ = generate_version_hash(stored_schema) previous_hash = stored_schema.get("version_hash") if not previous_hash: @@ -144,7 +144,7 @@ def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str]: elif hash_ != previous_hash: stored_schema["version"] += 1 stored_schema["version_hash"] = hash_ - return stored_schema["version"], hash_ + return stored_schema["version"], hash_, previous_hash def generate_version_hash(stored_schema: TStoredSchema) -> str: @@ -340,6 +340,15 @@ def migrate_filters(group: str, filters: List[str]) -> None: # replace loads table schema_dict["tables"][LOADS_TABLE_NAME] = load_table() from_engine = 6 + if from_engine == 6 and to_engine > 6: + # migrate from sealed properties to schema evolution settings + schema_dict["settings"].pop("schema_sealed", None) + schema_dict["settings"]["schema_contract"] = {} + for table in schema_dict["tables"].values(): + table.pop("table_sealed", None) + if not table.get("parent"): + table["schema_contract"] = {} + from_engine = 7 schema_dict["engine_version"] = from_engine if from_engine != to_engine: @@ -426,7 +435,6 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl continue existing_v = tab_a.get(k) if existing_v != v: - # print(f"{k} ==? {v} ==? {existing_v}") partial_table[k] = v # type: ignore # this should not really happen @@ -649,6 +657,7 @@ def new_table( columns: Sequence[TColumnSchema] = None, validate_schema: bool = False, resource: str = None, + schema_contract: TSchemaContract = None, table_format: TTableFormat = None ) -> TTableSchema: @@ -660,10 +669,13 @@ def new_table( table["parent"] = parent_table_name assert write_disposition is None assert resource is None + assert schema_contract is None else: # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name + if schema_contract is not None: + table["schema_contract"] = schema_contract if table_format: table["table_format"] = table_format if validate_schema: @@ -692,7 +704,6 @@ def new_column(column_name: str, data_type: TDataType = None, nullable: bool = T return column - def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: return None diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 8de95a6f60..6621f07e26 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -24,10 +24,10 @@ def get_writer(self, load_id: str, schema_name: str, table_name: str) -> Buffere self.buffered_writers[writer_id] = writer return writer - def write_data_item(self, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: + def write_data_item(self, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> int: writer = self.get_writer(load_id, schema_name, table_name) # write item(s) - writer.write_data_item(item, columns) + return writer.write_data_item(item, columns) def write_empty_file(self, load_id: str, schema_name: str, table_name: str, columns: TTableSchemaColumns) -> None: writer = self.get_writer(load_id, schema_name, table_name) diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index c482d5e7ea..79aeb22e61 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, List from dlt.common.schema.schema import Schema from dlt.common.configuration.accessors import config @@ -18,7 +18,7 @@ def __getitem__(self, name: str) -> Schema: else: # return new schema instance schema = super().load_schema(name) - self._update_live_schema(schema) + self.update_live_schema(schema) return schema @@ -30,7 +30,7 @@ def load_schema(self, name: str) -> Schema: def save_schema(self, schema: Schema) -> str: rv = super().save_schema(schema) # update the live schema with schema being saved, if no live schema exist, create one to be available for a getter - self._update_live_schema(schema) + self.update_live_schema(schema) return rv def remove_schema(self, name: str) -> None: @@ -54,12 +54,18 @@ def commit_live_schema(self, name: str) -> Schema: self._save_schema(live_schema) return live_schema - def _update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: + def update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: + """Will update live schema content without writing to storage. Optionally allows to create a new live schema""" live_schema = self.live_schemas.get(schema.name) if live_schema: - # replace content without replacing instance - # print(f"live schema {live_schema} updated in place") - live_schema.replace_schema_content(schema) + if id(live_schema) != id(schema): + # replace content without replacing instance + # print(f"live schema {live_schema} updated in place") + live_schema.replace_schema_content(schema) elif can_create_new: # print(f"live schema {schema.name} created from schema") self.live_schemas[schema.name] = schema + + def list_schemas(self) -> List[str]: + names = list(set(super().list_schemas()) | set(self.live_schemas.keys())) + return names diff --git a/dlt/common/typing.py b/dlt/common/typing.py index b2bd03f7e6..3b3a0d3353 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -68,41 +68,38 @@ def asstr(self, verbosity: int = 0) -> str: ... +def is_union_type(t: Type[Any]) -> bool: + return get_origin(t) is Union + def is_optional_type(t: Type[Any]) -> bool: return get_origin(t) is Union and type(None) in get_args(t) - def is_final_type(t: Type[Any]) -> bool: return get_origin(t) is Final - -def extract_optional_type(t: Type[Any]) -> Any: - return get_args(t)[0] - +def extract_union_types(t: Type[Any], no_none: bool = False) -> List[Any]: + if no_none: + return [arg for arg in get_args(t) if arg is not type(None)] # noqa: E721 + return list(get_args(t)) def is_literal_type(hint: Type[Any]) -> bool: return get_origin(hint) is Literal - def is_union(hint: Type[Any]) -> bool: return get_origin(hint) is Union - def is_newtype_type(t: Type[Any]) -> bool: return hasattr(t, "__supertype__") - def is_typeddict(t: Type[Any]) -> bool: return isinstance(t, _TypedDict) - def is_list_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Sequence) except TypeError: return False - def is_dict_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Mapping) diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 0214bc037a..94c9144086 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -245,6 +245,7 @@ def update_dict_with_prune(dest: DictStrAny, update: StrAny) -> None: def update_dict_nested(dst: TDict, src: StrAny) -> TDict: + """Merges `src` into `dst` key wise. Does not recur into lists. Values in `src` overwrite `dst` if both keys exit.""" # based on https://github.com/clarketm/mergedeep/blob/master/mergedeep/mergedeep.py def _is_recursive_merge(a: StrAny, b: StrAny) -> bool: diff --git a/dlt/common/validation.py b/dlt/common/validation.py index f1900c1b0e..b746fda361 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -2,7 +2,7 @@ from typing import Callable, Any, Type, get_type_hints, get_args from dlt.common.exceptions import DictValidationException -from dlt.common.typing import StrAny, extract_optional_type, is_literal_type, is_optional_type, is_typeddict, is_list_generic_type, is_dict_generic_type, _TypedDict +from dlt.common.typing import StrAny, is_literal_type, is_optional_type, extract_union_types, is_union_type, is_typeddict, is_list_generic_type, is_dict_generic_type, _TypedDict, is_union TFilterFunc = Callable[[str], bool] @@ -49,10 +49,27 @@ def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFil raise DictValidationException(f"In {path}: following fields are unexpected {unexpected}", path) def verify_prop(pk: str, pv: Any, t: Any) -> None: - if is_optional_type(t): - t = extract_optional_type(t) - - if is_literal_type(t): + # covers none in optional and union types + if is_optional_type(t) and pv is None: + pass + elif is_union_type(t): + # pass if value actually is none + union_types = extract_union_types(t, no_none=True) + # this is the case for optional fields + if len(union_types) == 1: + verify_prop(pk, pv, union_types[0]) + else: + has_passed = False + for ut in union_types: + try: + verify_prop(pk, pv, ut) + has_passed = True + except DictValidationException: + pass + if not has_passed: + type_names = [str(get_args(ut)) if is_literal_type(ut) else ut.__name__ for ut in union_types] + raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__}. One of these types expected: {', '.join(type_names)}.", path, pk, pv) + elif is_literal_type(t): a_l = get_args(t) if pv not in a_l: raise DictValidationException(f"In {path}: field {pk} value {pv} not in allowed {a_l}", path, pk, pv) diff --git a/dlt/destinations/impl/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py index ac51bd5f42..f37a1f6cd8 100644 --- a/dlt/destinations/impl/qdrant/qdrant_adapter.py +++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py @@ -1,8 +1,7 @@ from typing import Any from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns -from dlt.extract.decorators import resource as make_resource -from dlt.extract.source import DltResource +from dlt.extract import DltResource, resource as make_resource VECTORIZE_HINT = "x-qdrant-embed" diff --git a/dlt/destinations/impl/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py index 6829197273..bbb3f1c9da 100644 --- a/dlt/destinations/impl/weaviate/weaviate_adapter.py +++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py @@ -1,8 +1,7 @@ from typing import Dict, Any, Literal, Set, get_args from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns -from dlt.extract.decorators import resource as make_resource -from dlt.extract.source import DltResource +from dlt.extract import DltResource, resource as make_resource TTokenizationTMethod = Literal["word", "lowercase", "whitespace", "field"] TOKENIZATION_METHODS: Set[TTokenizationTMethod] = set(get_args(TTokenizationTMethod)) diff --git a/dlt/extract/__init__.py b/dlt/extract/__init__.py index e69de29bb2..cc6ff15759 100644 --- a/dlt/extract/__init__.py +++ b/dlt/extract/__init__.py @@ -0,0 +1,7 @@ +from dlt.extract.resource import DltResource, with_table_name +from dlt.extract.source import DltSource +from dlt.extract.decorators import source, resource, transformer, defer +from dlt.extract.incremental import Incremental +from dlt.extract.wrappers import wrap_additional_type + +__all__ = ["DltResource", "DltSource", "with_table_name", "source", "resource", "transformer", "defer", "Incremental", "wrap_additional_type"] diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index dbc5f2fa82..b8abbc1d57 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -15,7 +15,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContract, TTableFormat from dlt.extract.utils import ensure_table_schema_columns_hint, simulate_func_call, wrap_compat_transformer, wrap_resource_gen from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage @@ -25,7 +25,8 @@ from dlt.extract.incremental import IncrementalResourceWrapper from dlt.extract.typing import TTableHintTemplate -from dlt.extract.source import DltResource, DltSource, TUnboundDltResource +from dlt.extract.source import DltSource +from dlt.extract.resource import DltResource, TUnboundDltResource @configspec @@ -53,9 +54,10 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] -) -> Callable[TSourceFunParams, TDltSourceImpl]: +) -> Callable[TSourceFunParams, DltSource]: ... @overload @@ -67,6 +69,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] ) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: @@ -80,6 +83,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] ) -> Any: @@ -115,6 +119,8 @@ def source( schema (Schema, optional): An explicit `Schema` instance to be associated with the source. If not present, `dlt` creates a new `Schema` object with provided `name`. If such `Schema` already exists in the same folder as the module containing the decorated function, such schema will be loaded from file. + schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to this resource. + spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. _impl_cls (Type[TDltSourceImpl], optional): A custom implementation of DltSource, may be also used to providing just a typing stub @@ -122,7 +128,6 @@ def source( Returns: `DltSource` instance """ - if name and schema: raise ArgumentsOverloadException("'name' has no effect when `schema` argument is present", source.__name__) @@ -175,6 +180,7 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: # apply hints if max_table_nesting is not None: s.max_table_nesting = max_table_nesting + s.schema_contract = schema_contract # enable root propagation s.root_key = root_key return s @@ -206,6 +212,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None @@ -222,6 +229,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None @@ -238,6 +246,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, @@ -256,6 +265,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None @@ -272,6 +282,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, @@ -322,6 +333,7 @@ def resource( merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. + schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to all resources of this source (if not overridden in the resource itself) table_format (Literal["iceberg"], optional): Defines the storage format of the table. Currently only "iceberg" is supported on Athena, other destinations ignore this hint. selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. @@ -346,6 +358,7 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa columns=columns, primary_key=primary_key, merge_key=merge_key, + schema_contract=schema_contract, table_format=table_format ) return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, data_from), incremental=incremental) diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index e540a2468f..351b85a9d8 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -264,11 +264,3 @@ def __init__(self, source_name: str, schema_name: str) -> None: class IncrementalUnboundError(DltResourceException): def __init__(self, cursor_path: str) -> None: super().__init__("", f"The incremental definition with cursor path {cursor_path} is used without being bound to the resource. This most often happens when you create dynamic resource from a generator function that uses incremental. See https://dlthub.com/docs/general-usage/incremental-loading#incremental-loading-with-last-value for an example.") - - -class ValidationError(ValueError, DltException): - def __init__(self, validator: ValidateItem, data_item: TDataItems, original_exception: Exception) ->None: - self.original_exception = original_exception - self.validator = validator - self.data_item = data_item - super().__init__(f"Extracted data item could not be validated with {validator}. Original message: {original_exception}") diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 3f71943579..1276f1b1f5 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -1,258 +1,22 @@ import contextlib -import os -from typing import ClassVar, List, Set, Dict, Type, Any, Sequence, Optional -from collections import defaultdict +from typing import Set, Dict, Optional, Set from dlt.common.configuration.container import Container from dlt.common.configuration.resolve import inject_section -from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.configuration.specs import ConfigSectionContext, known_sections from dlt.common.pipeline import reset_resource_state from dlt.common.data_writers import TLoaderFileFormat -from dlt.common.exceptions import MissingDependencyException from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.utils import uniq_id -from dlt.common.typing import TDataItems, TDataItem -from dlt.common.schema import Schema, utils, TSchemaUpdate -from dlt.common.schema.typing import TColumnSchema, TTableSchemaColumns -from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage -from dlt.common.configuration.specs import known_sections +from dlt.common.schema import utils from dlt.extract.decorators import SourceSchemaInjectableContext from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints from dlt.extract.pipe import PipeIterator -from dlt.extract.source import DltResource, DltSource -from dlt.extract.typing import TableNameMeta -try: - from dlt.common.libs import pyarrow - from dlt.common.libs.pyarrow import pyarrow as pa -except MissingDependencyException: - pyarrow = None -try: - import pandas as pd -except ModuleNotFoundError: - pd = None - - -class ExtractorItemStorage(DataItemStorage): - load_file_type: TLoaderFileFormat - - def __init__(self, storage: FileStorage, extract_folder: str="extract") -> None: - # data item storage with jsonl with pua encoding - super().__init__(self.load_file_type) - self.extract_folder = extract_folder - self.storage = storage - - - def _get_data_item_path_template(self, load_id: str, schema_name: str, table_name: str) -> str: - template = NormalizeStorage.build_extracted_file_stem(schema_name, table_name, "%s") - return self.storage.make_full_path(os.path.join(self._get_extract_path(load_id), template)) - - def _get_extract_path(self, extract_id: str) -> str: - return os.path.join(self.extract_folder, extract_id) - - -class JsonLExtractorStorage(ExtractorItemStorage): - load_file_type: TLoaderFileFormat = "puae-jsonl" - - -class ArrowExtractorStorage(ExtractorItemStorage): - load_file_type: TLoaderFileFormat = "arrow" - - -class ExtractorStorage(NormalizeStorage): - EXTRACT_FOLDER: ClassVar[str] = "extract" - - """Wrapper around multiple extractor storages with different file formats""" - def __init__(self, C: NormalizeStorageConfiguration) -> None: - super().__init__(True, C) - self._item_storages: Dict[TLoaderFileFormat, ExtractorItemStorage] = { - "puae-jsonl": JsonLExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), - "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER) - } - - def _get_extract_path(self, extract_id: str) -> str: - return os.path.join(self.EXTRACT_FOLDER, extract_id) - - def create_extract_id(self) -> str: - extract_id = uniq_id() - self.storage.create_folder(self._get_extract_path(extract_id)) - return extract_id - - def get_storage(self, loader_file_format: TLoaderFileFormat) -> ExtractorItemStorage: - return self._item_storages[loader_file_format] - - def close_writers(self, extract_id: str) -> None: - for storage in self._item_storages.values(): - storage.close_writers(extract_id) - - def commit_extract_files(self, extract_id: str, with_delete: bool = True) -> None: - extract_path = self._get_extract_path(extract_id) - for file in self.storage.list_folder_files(extract_path, to_root=False): - from_file = os.path.join(extract_path, file) - to_file = os.path.join(NormalizeStorage.EXTRACTED_FOLDER, file) - if with_delete: - self.storage.atomic_rename(from_file, to_file) - else: - # create hardlink which will act as a copy - self.storage.link_hard(from_file, to_file) - if with_delete: - self.storage.delete_folder(extract_path, recursively=True) - - def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: - self.get_storage(file_format).write_data_item(load_id, schema_name, table_name, item, columns) - - - -class Extractor: - file_format: TLoaderFileFormat - dynamic_tables: TSchemaUpdate - def __init__( - self, - extract_id: str, - storage: ExtractorStorage, - schema: Schema, - resources_with_items: Set[str], - dynamic_tables: TSchemaUpdate, - collector: Collector = NULL_COLLECTOR - ) -> None: - self._storage = storage - self.schema = schema - self.dynamic_tables = dynamic_tables - self.collector = collector - self.resources_with_items = resources_with_items - self.extract_id = extract_id - - @property - def storage(self) -> ExtractorItemStorage: - return self._storage.get_storage(self.file_format) - - @staticmethod - def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: - """Detect the loader file format of the data items based on type. - Currently this is either 'arrow' or 'puae-jsonl' - - Returns: - The loader file format or `None` if if can't be detected. - """ - for item in items if isinstance(items, list) else [items]: - # Assume all items in list are the same type - if (pyarrow and pyarrow.is_arrow_item(item)) or (pd and isinstance(item, pd.DataFrame)): - return "arrow" - return "puae-jsonl" - return None # Empty list is unknown format - - def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: - if isinstance(meta, TableNameMeta): - table_name = meta.table_name - self._write_static_table(resource, table_name, items) - self._write_item(table_name, resource.name, items) - else: - if resource._table_name_hint_fun: - if isinstance(items, list): - for item in items: - self._write_dynamic_table(resource, item) - else: - self._write_dynamic_table(resource, items) - else: - # write item belonging to table with static name - table_name = resource.table_name # type: ignore[assignment] - self._write_static_table(resource, table_name, items) - self._write_item(table_name, resource.name, items) - - def write_empty_file(self, table_name: str) -> None: - table_name = self.schema.naming.normalize_table_identifier(table_name) - self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) - - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - # normalize table name before writing so the name match the name in schema - # note: normalize function should be cached so there's almost no penalty on frequent calling - # note: column schema is not required for jsonl writer used here - table_name = self.schema.naming.normalize_identifier(table_name) - self.collector.update(table_name) - self.resources_with_items.add(resource_name) - self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) - - def _write_dynamic_table(self, resource: DltResource, item: TDataItem) -> None: - table_name = resource._table_name_hint_fun(item) - existing_table = self.dynamic_tables.get(table_name) - if existing_table is None: - self.dynamic_tables[table_name] = [resource.compute_table_schema(item)] - else: - # quick check if deep table merge is required - if resource._table_has_other_dynamic_hints: - new_table = resource.compute_table_schema(item) - # this merges into existing table in place - utils.merge_tables(existing_table[0], new_table) - else: - # if there are no other dynamic hints besides name then we just leave the existing partial table - pass - # write to storage with inferred table name - self._write_item(table_name, resource.name, item) - - def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: - existing_table = self.dynamic_tables.get(table_name) - if existing_table is None: - static_table = resource.compute_table_schema() - static_table["name"] = table_name - self.dynamic_tables[table_name] = [static_table] - - -class JsonLExtractor(Extractor): - file_format = "puae-jsonl" - - -class ArrowExtractor(Extractor): - file_format = "arrow" - - def _rename_columns(self, items: List[TDataItem], new_column_names: List[str]) -> List[TDataItem]: - """Rename arrow columns to normalized schema column names""" - if not items: - return items - if items[0].schema.names == new_column_names: - # No need to rename - return items - if isinstance(items[0], pyarrow.pyarrow.Table): - return [item.rename_columns(new_column_names) for item in items] - elif isinstance(items[0], pyarrow.pyarrow.RecordBatch): - # Convert the batches to table -> rename -> then back to batches - return pa.Table.from_batches(items).rename_columns(new_column_names).to_batches() # type: ignore[no-any-return] - else: - raise TypeError(f"Unsupported data item type {type(items[0])}") - - def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: - items = [ - # 2. Remove null-type columns from the table(s) as they can't be loaded - pyarrow.remove_null_columns(tbl) for tbl in ( - # 1. Convert pandas frame(s) to arrow Table - pyarrow.pyarrow.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item - for item in (items if isinstance(items, list) else [items]) - ) - ] - super().write_table(resource, items, meta) - - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - # Note: `items` is always a list here due to the conversion in `write_table` - new_columns = list(self.dynamic_tables[table_name][0]["columns"].keys()) - super()._write_item(table_name, resource_name, self._rename_columns(items, new_columns), self.dynamic_tables[table_name][0]["columns"]) - - def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: - existing_table = self.dynamic_tables.get(table_name) - if existing_table is not None: - return - static_table = resource.compute_table_schema() - if isinstance(items, list): - item = items[0] - else: - item = items - # Merge the columns to include primary_key and other hints that may be set on the resource - arrow_columns = pyarrow.py_arrow_to_table_schema_columns(item.schema) - for key, value in static_table["columns"].items(): - arrow_columns[key] = utils.merge_columns(value, arrow_columns.get(key, {})) - static_table["columns"] = arrow_columns - static_table["name"] = table_name - self.dynamic_tables[table_name] = [self.schema.normalize_table_identifiers(static_table)] +from dlt.extract.source import DltSource +from dlt.extract.storage import ExtractorStorage +from dlt.extract.extractors import JsonLExtractor, ArrowExtractor, Extractor def extract( @@ -264,16 +28,15 @@ def extract( max_parallel_items: int = None, workers: int = None, futures_poll_interval: float = None -) -> TSchemaUpdate: - dynamic_tables: TSchemaUpdate = {} +) -> None: schema = source.schema resources_with_items: Set[str] = set() extractors: Dict[TLoaderFileFormat, Extractor] = { "puae-jsonl": JsonLExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, collector=collector ), "arrow": ArrowExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, collector=collector ) } last_item_format: Optional[TLoaderFileFormat] = None @@ -296,7 +59,7 @@ def extract( resource = source.resources[pipe_item.pipe.name] # Fallback to last item's format or default (puae-jsonl) if the current item is an empty list item_format = Extractor.item_format(pipe_item.item) or last_item_format or "puae-jsonl" - extractors[item_format].write_table(resource, pipe_item.item, pipe_item.meta) + extractors[item_format].write_items(resource, pipe_item.item, pipe_item.meta) last_item_format = item_format # find defined resources that did not yield any pipeitems and create empty jobs for them @@ -310,7 +73,7 @@ def extract( for table in tables_by_resources[resource.name]: # we only need to write empty files for the top tables if not table.get("parent", None): - extractors[last_item_format or "puae-jsonl"].write_empty_file(table["name"]) + extractors["puae-jsonl"].write_empty_file(table["name"]) if left_gens > 0: # go to 100% @@ -319,21 +82,17 @@ def extract( # flush all buffered writers storage.close_writers(extract_id) - # returns set of partial tables - return dynamic_tables - def extract_with_schema( storage: ExtractorStorage, source: DltSource, - schema: Schema, collector: Collector, max_parallel_items: int, - workers: int + workers: int, ) -> str: # generate extract_id to be able to commit all the sources together later extract_id = storage.create_extract_id() - with Container().injectable_context(SourceSchemaInjectableContext(schema)): + with Container().injectable_context(SourceSchemaInjectableContext(source.schema)): # inject the config section with the current source name with inject_section(ConfigSectionContext(sections=(known_sections.SOURCES, source.section, source.name), source_state_key=source.name)): # reset resource states, the `extracted` list contains all the explicit resources and all their parents @@ -341,11 +100,6 @@ def extract_with_schema( with contextlib.suppress(DataItemRequiredForDynamicTableHints): if resource.write_disposition == "replace": reset_resource_state(resource.name) - - extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) - # iterate over all items in the pipeline and update the schema if dynamic table hints were present - for _, partials in extractor.items(): - for partial in partials: - schema.update_table(schema.normalize_table_identifiers(partial)) + extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) return extract_id diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py new file mode 100644 index 0000000000..0ec8aed968 --- /dev/null +++ b/dlt/extract/extractors.py @@ -0,0 +1,246 @@ +from copy import copy +from typing import Set, Dict, Any, Optional, Set + +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import BaseConfiguration, configspec +from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.data_writers import TLoaderFileFormat +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.runtime.collector import Collector, NULL_COLLECTOR +from dlt.common.utils import update_dict_nested +from dlt.common.typing import TDataItems, TDataItem +from dlt.common.schema import Schema, utils +from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns, TPartialTableSchema + +from dlt.extract.resource import DltResource +from dlt.extract.typing import TableNameMeta +from dlt.extract.storage import ExtractorStorage, ExtractorItemStorage +try: + from dlt.common.libs import pyarrow + from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem +except MissingDependencyException: + pyarrow = None + +try: + import pandas as pd +except ModuleNotFoundError: + pd = None + + +class Extractor: + file_format: TLoaderFileFormat + + @configspec + class ExtractorConfiguration(BaseConfiguration): + _caps: Optional[DestinationCapabilitiesContext] = None + + @with_config(spec=ExtractorConfiguration) + def __init__( + self, + extract_id: str, + storage: ExtractorStorage, + schema: Schema, + resources_with_items: Set[str], + collector: Collector = NULL_COLLECTOR, + *, + _caps: DestinationCapabilitiesContext = None + ) -> None: + self.schema = schema + self.naming = schema.naming + self.collector = collector + self.resources_with_items = resources_with_items + self.extract_id = extract_id + self._table_contracts: Dict[str, TSchemaContractDict] = {} + self._filtered_tables: Set[str] = set() + self._filtered_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] = {} + self._storage = storage + self._caps = _caps or DestinationCapabilitiesContext.generic_capabilities() + + @property + def storage(self) -> ExtractorItemStorage: + return self._storage.get_storage(self.file_format) + + @staticmethod + def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: + """Detect the loader file format of the data items based on type. + Currently this is either 'arrow' or 'puae-jsonl' + + Returns: + The loader file format or `None` if if can't be detected. + """ + for item in items if isinstance(items, list) else [items]: + # Assume all items in list are the same type + if (pyarrow and pyarrow.is_arrow_item(item)) or (pd and isinstance(item, pd.DataFrame)): + return "arrow" + return "puae-jsonl" + return None # Empty list is unknown format + + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + """Write `items` to `resource` optionally computing table schemas and revalidating/filtering data""" + if table_name := self._get_static_table_name(resource, meta): + # write item belonging to table with static name + self._write_to_static_table(resource, table_name, items) + else: + # table has name or other hints depending on data items + self._write_to_dynamic_table(resource, items) + + def write_empty_file(self, table_name: str) -> None: + table_name = self.naming.normalize_table_identifier(table_name) + self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) + + def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[str]: + if resource._table_name_hint_fun: + return None + if isinstance(meta, TableNameMeta): + table_name = meta.table_name + else: + table_name = resource.table_name # type: ignore[assignment] + return self.naming.normalize_table_identifier(table_name) + + def _get_dynamic_table_name(self, resource: DltResource, item: TDataItem) -> str: + return self.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) + + def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: + new_rows_count = self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) + self.collector.update(table_name, inc=new_rows_count) + self.resources_with_items.add(resource_name) + + def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> None: + if not isinstance(items, list): + items = [items] + + for item in items: + table_name = self._get_dynamic_table_name(resource, item) + if table_name in self._filtered_tables: + continue + if table_name not in self._table_contracts or resource._table_has_other_dynamic_hints: + item = self._compute_and_update_table(resource, table_name, item) + # write to storage with inferred table name + if table_name not in self._filtered_tables: + self._write_item(table_name, resource.name, item) + + def _write_to_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: + if table_name not in self._table_contracts: + items = self._compute_and_update_table(resource, table_name, items) + if table_name not in self._filtered_tables: + self._write_item(table_name, resource.name, items) + + def _compute_table(self, resource: DltResource, items: TDataItems) -> TTableSchema: + """Computes a schema for a new or dynamic table and normalizes identifiers""" + return self.schema.normalize_table_identifiers( + resource.compute_table_schema(items) + ) + + def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems: + """ + Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written + """ + computed_table = self._compute_table(resource, items) + # overwrite table name (if coming from meta) + computed_table["name"] = table_name + # get or compute contract + schema_contract = self._table_contracts.setdefault( + table_name, + self.schema.resolve_contract_settings_for_table(table_name, computed_table) + ) + + # this is a new table so allow evolve once + if schema_contract["columns"] != "evolve" and self.schema.is_new_table(table_name): + computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: ignore[typeddict-unknown-key] + existing_table = self.schema._schema_tables.get(table_name, None) + if existing_table: + diff_table = utils.diff_tables(existing_table, computed_table) + else: + diff_table = computed_table + + # apply contracts + diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table, data_item=items) + + # merge with schema table + if diff_table: + self.schema.update_table(diff_table) + + # process filters + if filters: + for entity, name, mode in filters: + if entity == "tables": + self._filtered_tables.add(name) + elif entity == "columns": + filtered_columns = self._filtered_columns.setdefault(table_name, {}) + filtered_columns[name] = mode + return items + + +class JsonLExtractor(Extractor): + file_format = "puae-jsonl" + + +class ArrowExtractor(Extractor): + file_format = "arrow" + + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + static_table_name = self._get_static_table_name(resource, meta) + items = [ + # 3. remove columns and rows in data contract filters + # 2. Remove null-type columns from the table(s) as they can't be loaded + self._apply_contract_filters(pyarrow.remove_null_columns(tbl), resource, static_table_name) for tbl in ( + # 1. Convert pandas frame(s) to arrow Table + pa.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item + for item in (items if isinstance(items, list) else [items]) + ) + ] + super().write_items(resource, items, meta) + + def _apply_contract_filters(self, item: "TAnyArrowItem", resource: DltResource, static_table_name: Optional[str]) -> "TAnyArrowItem": + """Removes the columns (discard value) or rows (discard rows) as indicated by contract filters.""" + # convert arrow schema names into normalized names + rename_mapping = pyarrow.get_normalized_arrow_fields_mapping(item, self.naming) + # find matching columns and delete by original name + table_name = static_table_name or self._get_dynamic_table_name(resource, item) + filtered_columns = self._filtered_columns.get(table_name) + if filtered_columns: + # remove rows where columns have non null values + # create a mask where rows will be False if any of the specified columns are non-null + mask = None + rev_mapping = {v: k for k, v in rename_mapping.items()} + for column in [name for name, mode in filtered_columns.items() if mode == "discard_row"]: + is_null = pyarrow.pyarrow.compute.is_null(item[rev_mapping[column]]) + mask = is_null if mask is None else pyarrow.pyarrow.compute.and_(mask, is_null) + # filter the table using the mask + if mask is not None: + item = item.filter(mask) + + # remove value actually removes the whole columns from the table + # NOTE: filtered columns has normalized column names so we need to go through mapping + removed_columns = [name for name in rename_mapping if filtered_columns.get(rename_mapping[name]) is not None] + if removed_columns: + item = pyarrow.remove_columns(item, removed_columns) + + return item + + def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: + columns = columns or self.schema.tables[table_name]["columns"] + # Note: `items` is always a list here due to the conversion in `write_table` + items = [pyarrow.normalize_py_arrow_schema(item, columns, self.naming, self._caps) for item in items] + super()._write_item(table_name, resource_name, items, columns) + + def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTableSchema: + items = items[0] + computed_table = super()._compute_table(resource, items) + + # Merge the columns to include primary_key and other hints that may be set on the resource + arrow_table = copy(computed_table) + arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(items.schema) + # normalize arrow table before merging + arrow_table = self.schema.normalize_table_identifiers(arrow_table) + # we must override the columns to preserve the order in arrow table + arrow_table["columns"] = update_dict_nested(arrow_table["columns"], computed_table["columns"]) + + return arrow_table + + def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems: + items = super()._compute_and_update_table(resource, table_name, items) + # filter data item as filters could be updated in compute table + items = [self._apply_contract_filters(item, resource, table_name) for item in items] + return items diff --git a/dlt/extract/schema.py b/dlt/extract/hints.py similarity index 74% rename from dlt/extract/schema.py rename to dlt/extract/hints.py index c1dfd1f7f5..19d503f970 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/hints.py @@ -1,9 +1,8 @@ from copy import copy, deepcopy -from collections.abc import Mapping as C_Mapping from typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat +from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat, TSchemaContract from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -12,7 +11,7 @@ from dlt.extract.typing import TFunHintTemplate, TTableHintTemplate, ValidateItem from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, TableNameMissing from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint -from dlt.extract.validation import get_column_validator +from dlt.extract.validation import create_item_validator class TTableSchemaTemplate(TypedDict, total=False): @@ -25,10 +24,12 @@ class TTableSchemaTemplate(TypedDict, total=False): primary_key: TTableHintTemplate[TColumnNames] merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] + schema_contract: TTableHintTemplate[TSchemaContract] validator: ValidateItem + original_columns: TTableHintTemplate[TAnySchemaColumns] -class DltResourceSchema: +class DltResourceHints: def __init__(self, table_schema_template: TTableSchemaTemplate = None): self.__qualname__ = self.__name__ = self.name self._table_name_hint_fun: TFunHintTemplate[str] = None @@ -70,7 +71,11 @@ def columns(self) -> TTableHintTemplate[TTableSchemaColumns]: return None return self._table_schema_template.get("columns") - def compute_table_schema(self, item: TDataItem = None) -> TPartialTableSchema: + @property + def schema_contract(self) -> TTableHintTemplate[TSchemaContract]: + return self._table_schema_template.get("schema_contract") + + def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: """Computes the table schema based on hints and column definitions passed during resource creation. `item` parameter is used to resolve table hints based on data""" if not self._table_schema_template: return new_table(self.name, resource=self.name) @@ -85,13 +90,11 @@ def compute_table_schema(self, item: TDataItem = None) -> TPartialTableSchema: if self._table_name_hint_fun and item is None: raise DataItemRequiredForDynamicTableHints(self.name) # resolve - resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items()} # type: ignore - resolved_template.pop("incremental", None) - resolved_template.pop("validator", None) + resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items() if k not in ["incremental", "validator", "original_columns"]} # type: ignore table_schema = self._merge_keys(resolved_template) table_schema["resource"] = self.name validate_dict_ignoring_xkeys( - spec=TPartialTableSchema, + spec=TTableSchema, doc=table_schema, path=f"new_table/{self.name}", ) @@ -105,7 +108,8 @@ def apply_hints( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - incremental: Incremental[Any] = None + incremental: Incremental[Any] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. @@ -122,10 +126,10 @@ def apply_hints( t = None if not self._table_schema_template: # if there's no template yet, create and set new one - t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key) + t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract) else: # set single hints - t = deepcopy(self._table_schema_template) + t = self._clone_table_template(self._table_schema_template) if table_name is not None: if table_name: t["name"] = table_name @@ -139,7 +143,8 @@ def apply_hints( if write_disposition: t["write_disposition"] = write_disposition if columns is not None: - t['validator'] = get_column_validator(columns) + # keep original columns: ie in case it is a Pydantic model + t["original_columns"] = columns # if callable then override existing if callable(columns) or callable(t["columns"]): t["columns"] = ensure_table_schema_columns_hint(columns) @@ -151,7 +156,6 @@ def apply_hints( else: # set to empty columns t["columns"] = ensure_table_schema_columns(columns) - if primary_key is not None: if primary_key: t["primary_key"] = primary_key @@ -162,13 +166,27 @@ def apply_hints( t["merge_key"] = merge_key else: t.pop("merge_key", None) + if schema_contract is not None: + if schema_contract: + t["schema_contract"] = schema_contract + else: + t.pop("schema_contract", None) + # recreate validator if columns definition or contract changed + if schema_contract is not None or columns is not None: + t["validator"], schema_contract = create_item_validator(t.get("original_columns"), t.get("schema_contract")) + if schema_contract is not None: + t["schema_contract"] = schema_contract # set properties that cannot be passed to new_table_template - t["incremental"] = incremental + if incremental is not None: + if incremental is Incremental.EMPTY: + t["incremental"] = None + else: + t["incremental"] = incremental self.set_template(t) def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: - DltResourceSchema.validate_dynamic_hints(table_schema_template) + DltResourceHints.validate_dynamic_hints(table_schema_template) # if "name" is callable in the template then the table schema requires actual data item to be inferred name_hint = table_schema_template.get("name") if callable(name_hint): @@ -179,13 +197,21 @@ def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: self._table_has_other_dynamic_hints = any(callable(v) for k, v in table_schema_template.items() if k != "name") self._table_schema_template = table_schema_template + @staticmethod + def _clone_table_template(template: TTableSchemaTemplate) -> TTableSchemaTemplate: + t_ = copy(template) + t_["columns"] = deepcopy(template["columns"]) + if "schema_contract" in template: + t_["schema_contract"] = deepcopy(template["schema_contract"]) + return t_ + @staticmethod def _resolve_hint(item: TDataItem, hint: TTableHintTemplate[Any]) -> Any: - """Calls each dynamic hint passing a data item""" - if callable(hint): - return hint(item) - else: - return hint + """Calls each dynamic hint passing a data item""" + if callable(hint): + return hint(item) + else: + return hint @staticmethod def _merge_key(hint: TColumnProp, keys: TColumnNames, partial: TPartialTableSchema) -> None: @@ -205,9 +231,9 @@ def _merge_keys(t_: TTableSchemaTemplate) -> TPartialTableSchema: # assert not callable(t_["merge_key"]) # assert not callable(t_["primary_key"]) if "primary_key" in t_: - DltResourceSchema._merge_key("primary_key", t_.pop("primary_key"), partial) # type: ignore + DltResourceHints._merge_key("primary_key", t_.pop("primary_key"), partial) # type: ignore if "merge_key" in t_: - DltResourceSchema._merge_key("merge_key", t_.pop("merge_key"), partial) # type: ignore + DltResourceHints._merge_key("merge_key", t_.pop("merge_key"), partial) # type: ignore return partial @@ -219,21 +245,29 @@ def new_table_template( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None - ) -> TTableSchemaTemplate: + ) -> TTableSchemaTemplate: + validator, schema_contract = create_item_validator(columns, schema_contract) + clean_columns = columns if columns is not None: - validator = get_column_validator(columns) - columns = ensure_table_schema_columns_hint(columns) - if not callable(columns): - columns = columns.values() # type: ignore - else: - validator = None + clean_columns = ensure_table_schema_columns_hint(columns) + if not callable(clean_columns): + clean_columns = clean_columns.values() # type: ignore # create a table schema template where hints can be functions taking TDataItem new_template: TTableSchemaTemplate = new_table( - table_name, parent_table_name, write_disposition=write_disposition, columns=columns, table_format=table_format # type: ignore + table_name, # type: ignore + parent_table_name, # type: ignore + write_disposition=write_disposition, # type: ignore + columns=clean_columns, # type: ignore + schema_contract=schema_contract, # type: ignore + table_format=table_format # type: ignore ) if not table_name: new_template.pop("name") + # remember original columns + if columns is not None: + new_template["original_columns"] = columns # always remove resource new_template.pop("resource", None) # type: ignore if primary_key: @@ -242,12 +276,12 @@ def new_table_template( new_template["merge_key"] = merge_key if validator: new_template["validator"] = validator - DltResourceSchema.validate_dynamic_hints(new_template) + DltResourceHints.validate_dynamic_hints(new_template) return new_template @staticmethod def validate_dynamic_hints(template: TTableSchemaTemplate) -> None: table_name = template.get("name") # if any of the hints is a function then name must be as well - if any(callable(v) for k, v in template.items() if k not in ["name", "incremental", "validator"]) and not callable(table_name): + if any(callable(v) for k, v in template.items() if k not in ["name", "incremental", "validator", "original_columns"]) and not callable(table_name): raise InconsistentTableTemplate(f"Table name {table_name} must be a function if any other table hint is a function") diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 6d042aa15d..1c5fa7ab38 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -1,5 +1,5 @@ import os -from typing import Generic, Any, Optional, get_args, get_origin, Type, Dict +from typing import Generic, ClassVar, Any, Optional, get_args, get_origin, Type, Dict import inspect from functools import wraps @@ -69,11 +69,15 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa The values passed explicitly to Incremental will be ignored. Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded """ + # this is config/dataclass so declare members cursor_path: str = None # TODO: Support typevar here initial_value: Optional[Any] = None end_value: Optional[Any] = None + # incremental acting as empty + EMPTY: ClassVar["Incremental[Any]"] = None + def __init__( self, cursor_path: str = dlt.config.value, @@ -336,6 +340,8 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: return [item for item in (self._transform_item(transformer, row) for row in rows) if item is not None] return self._transform_item(transformer, rows) +Incremental.EMPTY = Incremental[Any]("") + class IncrementalResourceWrapper(ItemTransform[TDataItem]): _incremental: Optional[Incremental[Any]] = None diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index af45736da4..44538aa3f5 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -23,9 +23,11 @@ from dlt.extract.typing import TTableHintTemplate from dlt.common.schema.typing import TColumnNames try: + from dlt.common.libs import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem except MissingDependencyException: pa = None + pyarrow = None class IncrementalTransform: @@ -182,24 +184,7 @@ def _deduplicate(self, tbl: "pa.Table", unique_columns: Optional[List[str]], agg """Creates unique index if necessary.""" # create unique index if necessary if self._dlt_index not in tbl.schema.names: - tbl = tbl.append_column(self._dlt_index, pa.array(np.arange(tbl.num_rows))) - # code below deduplicates groups that include the cursor column in the group id. that was just artifact of - # json incremental and there's no need to duplicate it here - - # if unique_columns is None: - # return tbl - # group_cols = unique_columns + [cursor_path] - # try: - # tbl = tbl.filter( - # pa.compute.is_in( - # tbl[self._dlt_index], - # tbl.group_by(group_cols).aggregate( - # [(self._dlt_index, "one"), (cursor_path, aggregate)] - # )[f'{self._dlt_index}_one'] - # ) - # ) - # except KeyError as e: - # raise IncrementalPrimaryKeyMissing(self.resource_name, unique_columns[0], tbl) from e + tbl = pyarrow.append_column(tbl, self._dlt_index, pa.array(np.arange(tbl.num_rows))) return tbl def __call__( @@ -225,7 +210,7 @@ def __call__( if isinstance(primary_key, str): self._dlt_index = primary_key elif primary_key is None: - unique_columns = tbl.column_names + unique_columns = tbl.schema.names else: # deduplicating is disabled unique_columns = None @@ -312,7 +297,7 @@ def __call__( if len(tbl) == 0: return None, start_out_of_range, end_out_of_range try: - tbl = tbl.drop(["_dlt_index"]) + tbl = pyarrow.remove_columns(tbl, ["_dlt_index"]) except KeyError: pass if is_pandas: diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py new file mode 100644 index 0000000000..2c3018e77d --- /dev/null +++ b/dlt/extract/resource.py @@ -0,0 +1,494 @@ +from copy import deepcopy +import inspect +from typing import AsyncIterable, AsyncIterator, ClassVar, Callable, Iterable, Iterator, Union, Any, Optional + +from dlt.common.configuration.resolve import inject_section +from dlt.common.configuration.specs import known_sections +from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.typing import AnyFun, DictStrAny, StrAny, TDataItem, TDataItems, NoneType +from dlt.common.configuration.container import Container +from dlt.common.pipeline import PipelineContext, StateInjectableContext, resource_state, pipeline_state +from dlt.common.utils import flatten_list_or_items, get_callable_name, uniq_id + +from dlt.extract.typing import (DataItemWithMeta, ItemTransformFunc, ItemTransformFunctionWithMeta, TableNameMeta, + FilterItem, MapItem, YieldMapItem, ValidateItem) +from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep +from dlt.extract.hints import DltResourceHints, TTableSchemaTemplate +from dlt.extract.incremental import Incremental, IncrementalResourceWrapper +from dlt.extract.exceptions import ( + InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, + InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, + InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, + InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer +) +from dlt.extract.wrappers import wrap_additional_type + + +def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: + """Marks `item` to be dispatched to table `table_name` when yielded from resource function.""" + return DataItemWithMeta(TableNameMeta(table_name), item) + + +class DltResource(Iterable[TDataItem], DltResourceHints): + """Implements dlt resource. Contains a data pipe that wraps a generating item and table schema that can be adjusted""" + Empty: ClassVar["DltResource"] = None + source_name: str + """Name of the source that contains this instance of the source, set when added to DltResourcesDict""" + section: str + """A config section name""" + + def __init__( + self, + pipe: Pipe, + table_schema_template: TTableSchemaTemplate, + selected: bool, + incremental: IncrementalResourceWrapper = None, + section: str = None, + args_bound: bool = False + ) -> None: + self.section = section + self.selected = selected + self._pipe = pipe + self._args_bound = args_bound + self._explicit_args: DictStrAny = None + if incremental and not self.incremental: + self.add_step(incremental) + self.source_name = None + super().__init__(table_schema_template) + + @classmethod + def from_data( + cls, + data: Any, + name: str = None, + section: str = None, + table_schema_template: TTableSchemaTemplate = None, + selected: bool = True, + data_from: Union["DltResource", Pipe] = None, + incremental: IncrementalResourceWrapper = None + ) -> "DltResource": + if data is None: + raise InvalidResourceDataTypeIsNone(name, data, NoneType) # type: ignore + + if isinstance(data, DltResource): + return data + + if isinstance(data, Pipe): + return cls(data, table_schema_template, selected, incremental=incremental, section=section) + + if callable(data): + name = name or get_callable_name(data) + + # if generator, take name from it + if inspect.isgenerator(data): + name = name or get_callable_name(data) # type: ignore + + # name is mandatory + if not name: + raise ResourceNameMissing() + + # wrap additional types + data = wrap_additional_type(data) + + # several iterable types are not allowed and must be excluded right away + if isinstance(data, (AsyncIterator, AsyncIterable)): + raise InvalidResourceDataTypeAsync(name, data, type(data)) + if isinstance(data, (str, dict)): + raise InvalidResourceDataTypeBasic(name, data, type(data)) + + # check if depends_on is a valid resource + parent_pipe: Pipe = None + if data_from is not None: + DltResource._ensure_valid_transformer_resource(name, data) + parent_pipe = DltResource._get_parent_pipe(name, data_from) + + # create resource from iterator, iterable or generator function + if isinstance(data, (Iterable, Iterator)) or callable(data): + pipe = Pipe.from_data(name, data, parent=parent_pipe) + return cls(pipe, table_schema_template, selected, incremental=incremental, section=section, args_bound=not callable(data)) + else: + # some other data type that is not supported + raise InvalidResourceDataType(name, data, type(data), f"The data type of supplied type is {type(data).__name__}") + + @property + def name(self) -> str: + """Resource name inherited from the pipe""" + return self._pipe.name + + def with_name(self, new_name: str) -> "DltResource": + """Clones the resource with a new name. Such resource keeps separate state and loads data to `new_name` table by default.""" + return self._clone(new_name=new_name, with_parent=True) + + @property + def is_transformer(self) -> bool: + """Checks if the resource is a transformer that takes data from another resource""" + return self._pipe.has_parent + + @property + def requires_args(self) -> bool: + """Checks if resource has unbound arguments""" + try: + self._pipe.ensure_gen_bound() + return False + except (TypeError, ParametrizedResourceUnbound): + return True + + @property + def incremental(self) -> IncrementalResourceWrapper: + """Gets incremental transform if it is in the pipe""" + incremental: IncrementalResourceWrapper = None + step_no = self._pipe.find(IncrementalResourceWrapper, Incremental) + if step_no >= 0: + incremental = self._pipe.steps[step_no] # type: ignore + return incremental + + @property + def validator(self) -> Optional[ValidateItem]: + """Gets validator transform if it is in the pipe""" + validator: ValidateItem = None + step_no = self._pipe.find(ValidateItem) + if step_no >= 0: + validator = self._pipe.steps[step_no] # type: ignore[assignment] + return validator + + @validator.setter + def validator(self, validator: Optional[ValidateItem]) -> None: + """Add/remove or replace the validator in pipe""" + step_no = self._pipe.find(ValidateItem) + if step_no >= 0: + self._pipe.remove_step(step_no) + if validator: + self.add_step(validator, insert_at=step_no if step_no >= 0 else None) + + def pipe_data_from(self, data_from: Union["DltResource", Pipe]) -> None: + """Replaces the parent in the transformer resource pipe from which the data is piped.""" + if self.is_transformer: + DltResource._ensure_valid_transformer_resource(self.name, self._pipe.gen) + else: + raise ResourceNotATransformer(self.name, "Cannot pipe data into resource that is not a transformer.") + parent_pipe = self._get_parent_pipe(self.name, data_from) + self._pipe.parent = parent_pipe + + def add_pipe(self, data: Any) -> None: + """Creates additional pipe for the resource from the specified data""" + # TODO: (1) self resource cannot be a transformer (2) if data is resource both self must and it must be selected/unselected + cannot be tranformer + raise InvalidResourceDataTypeMultiplePipes(self.name, data, type(data)) + + def select_tables(self, *table_names: Iterable[str]) -> "DltResource": + """For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. + + Both `with_table_name` marker and data-based (function) table name hints are supported. + """ + def _filter(item: TDataItem, meta: Any = None) -> bool: + is_in_meta = isinstance(meta, TableNameMeta) and meta.table_name in table_names + is_in_dyn = self._table_name_hint_fun and self._table_name_hint_fun(item) in table_names + return is_in_meta or is_in_dyn + + # add filtering function at the end of pipe + self.add_filter(_filter) + return self + + def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` + + `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically + + Args: + item_map (ItemTransformFunc[TDataItem]): A function taking a single data item and optional meta argument. Returns transformed data item. + insert_at (int, optional): At which step in pipe to insert the mapping. Defaults to None which inserts after last step + + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(MapItem(item_map)) + else: + self._pipe.insert_step(MapItem(item_map), insert_at) + return self + + def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` + + `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. It may yield 0 or more data items and be used to + ie. pivot an item into sequence of rows. + + Args: + item_map (ItemTransformFunc[Iterator[TDataItem]]): A function taking a single data item and optional meta argument. Yields 0 or more data items. + insert_at (int, optional): At which step in pipe to insert the generator. Defaults to None which inserts after last step + + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(YieldMapItem(item_map)) + else: + self._pipe.insert_step(YieldMapItem(item_map), insert_at) + return self + + def add_filter(self, item_filter: ItemTransformFunc[bool], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` + + `item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically + + Args: + item_filter (ItemTransformFunc[bool]): A function taking a single data item and optional meta argument. Returns bool. If True, item is kept + insert_at (int, optional): At which step in pipe to insert the filter. Defaults to None which inserts after last step + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(FilterItem(item_filter)) + else: + self._pipe.insert_step(FilterItem(item_filter), insert_at) + return self + + def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 + """Adds a limit `max_items` to the resource pipe + + This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. It is + a no-op for transformers. Those should be limited by their input data. + + Args: + max_items (int): The maximum number of items to yield + Returns: + "DltResource": returns self + """ + def _gen_wrap(gen: TPipeStep) -> TPipeStep: + """Wrap a generator to take the first `max_items` records""" + nonlocal max_items + count = 0 + if inspect.isfunction(gen): + gen = gen() + try: + for i in gen: # type: ignore # TODO: help me fix this later + yield i + count += 1 + if count == max_items: + return + finally: + if inspect.isgenerator(gen): + gen.close() + return + # transformers should be limited by their input, so we only limit non-transformers + if not self.is_transformer: + self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) + return self + + def add_step(self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None) -> "DltResource": # noqa: A003 + if insert_at is None: + self._pipe.append_step(item_transform) + else: + self._pipe.insert_step(item_transform, insert_at) + return self + + def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: + super().set_template(table_schema_template) + incremental = self.incremental + # try to late assign incremental + if table_schema_template.get("incremental") is not None: + if incremental: + incremental._incremental = table_schema_template["incremental"] + else: + # if there's no wrapper add incremental as a transform + incremental = table_schema_template["incremental"] # type: ignore + self.add_step(incremental) + + if incremental: + primary_key = table_schema_template.get("primary_key", incremental.primary_key) + if primary_key is not None: + incremental.primary_key = primary_key + + if table_schema_template.get('validator') is not None: + self.validator = table_schema_template['validator'] + + def bind(self, *args: Any, **kwargs: Any) -> "DltResource": + """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" + if self._args_bound: + raise TypeError(f"Parametrized resource {self.name} is not callable") + orig_gen = self._pipe.gen + gen = self._pipe.bind_gen(*args, **kwargs) + if isinstance(gen, DltResource): + # the resource returned resource: update in place + old_pipe = self._pipe + self.__dict__.clear() + self.__dict__.update(gen.__dict__) + # keep old pipe instance + self._pipe = old_pipe + self._pipe.__dict__.clear() + # write props from new pipe instance + self._pipe.__dict__.update(gen._pipe.__dict__) + elif isinstance(gen, Pipe): + # the resource returned pipe: just replace pipe + self._pipe.__dict__.clear() + # write props from new pipe instance + self._pipe.__dict__.update(gen.__dict__) + else: + self._args_bound = True + self._set_explicit_args(orig_gen, None, *args, **kwargs) # type: ignore + return self + + @property + def explicit_args(self) -> StrAny: + """Returns a dictionary of arguments used to parametrize the resource. Does not include defaults and injected args.""" + if not self._args_bound: + raise TypeError(f"Resource {self.name} is not yet parametrized") + return self._explicit_args + + @property + def state(self) -> StrAny: + """Gets resource-scoped state from the active pipeline. PipelineStateNotAvailable is raised if pipeline context is not available""" + with inject_section(self._get_config_section_context()): + return resource_state(self.name) + + def __call__(self, *args: Any, **kwargs: Any) -> "DltResource": + """Binds the parametrized resources to passed arguments. Creates and returns a bound resource. Generators and iterators are not evaluated.""" + if self._args_bound: + raise TypeError(f"Parametrized resource {self.name} is not callable") + r = self._clone() + return r.bind(*args, **kwargs) + + def __or__(self, transform: Union["DltResource", AnyFun]) -> "DltResource": + """Allows to pipe data from across resources and transform functions with | operator""" + # print(f"{resource.name} | {self.name} -> {resource.name}[{resource.is_transformer}]") + if isinstance(transform, DltResource): + transform.pipe_data_from(self) + # return transformed resource for chaining + return transform + else: + # map or yield map + if inspect.isgeneratorfunction(inspect.unwrap(transform)): + return self.add_yield_map(transform) + else: + return self.add_map(transform) + + def __iter__(self) -> Iterator[TDataItem]: + """Opens iterator that yields the data items from the resources in the same order as in Pipeline class. + + A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. + """ + # use the same state dict when opening iterator and when iterator is iterated + container = Container() + state, _ = pipeline_state(container, {}) + state_context = StateInjectableContext(state=state) + section_context = self._get_config_section_context() + + # managed pipe iterator will set the context on each call to __next__ + with inject_section(section_context), Container().injectable_context(state_context): + pipe_iterator: ManagedPipeIterator = ManagedPipeIterator.from_pipes([self._pipe]) # type: ignore + + pipe_iterator.set_context([state_context, section_context]) + _iter = map(lambda item: item.item, pipe_iterator) + return flatten_list_or_items(_iter) + + def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any) -> None: + try: + sig = sig or inspect.signature(f) + self._explicit_args = sig.bind_partial(*args, **kwargs).arguments + except Exception: + pass + + def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResource": + """Creates a deep copy of a current resource, optionally renaming the resource. The clone will not be part of the source + """ + pipe = self._pipe + if self._pipe and not self._pipe.is_empty: + pipe = pipe._clone(new_name=new_name, with_parent=with_parent) + # incremental and parent are already in the pipe (if any) + return DltResource( + pipe, + deepcopy(self._table_schema_template), + selected=self.selected, + section=self.section + ) + + def _get_config_section_context(self) -> ConfigSectionContext: + container = Container() + proxy = container[PipelineContext] + pipeline = None if not proxy.is_active() else proxy.pipeline() + if pipeline: + pipeline_name = pipeline.pipeline_name + else: + pipeline_name = None + if pipeline: + default_schema_name = pipeline.default_schema_name + else: + default_schema_name = None + if not default_schema_name and pipeline_name: + default_schema_name = pipeline._make_schema_with_default_name().name + return ConfigSectionContext( + pipeline_name=pipeline_name, + # do not emit middle config section to not overwrite the resource section + # only sources emit middle config section + sections=(known_sections.SOURCES, "", self.source_name or default_schema_name or self.name), + source_state_key=self.source_name or default_schema_name or self.section or uniq_id() + ) + + def __str__(self) -> str: + info = f"DltResource [{self.name}]" + if self.section: + info += f" in section [{self.section}]" + if self.source_name: + info += f" added to source [{self.source_name}]:" + else: + info += ":" + + if self.is_transformer: + info += f"\nThis resource is a transformer and takes data items from {self._pipe.parent.name}" + else: + if self._pipe.is_data_bound: + if self.requires_args: + head_sig = inspect.signature(self._pipe.gen) # type: ignore + info += f"\nThis resource is parametrized and takes the following arguments {head_sig}. You must call this resource before loading." + else: + info += "\nIf you want to see the data items in the resource you must iterate it or convert to list ie. list(resource). Note that, like any iterator, you can iterate the resource only once." + else: + info += "\nThis resource is not bound to the data" + info += f"\nInstance: info: (data pipe id:{id(self._pipe)}) at {id(self)}" + return info + + @staticmethod + def _ensure_valid_transformer_resource(name: str, data: Any) -> None: + # resource must be a callable with single argument + if callable(data): + valid_code = DltResource.validate_transformer_generator_function(data) + if valid_code != 0: + raise InvalidTransformerGeneratorFunction(name, get_callable_name(data), inspect.signature(data), valid_code) + else: + raise InvalidTransformerDataTypeGeneratorFunctionRequired(name, data, type(data)) + + @staticmethod + def _get_parent_pipe(name: str, data_from: Union["DltResource", Pipe]) -> Pipe: + # parent resource + if isinstance(data_from, Pipe): + return data_from + elif isinstance(data_from, DltResource): + return data_from._pipe + else: + # if this is generator function provide nicer exception + if callable(data_from): + raise InvalidParentResourceIsAFunction(name, get_callable_name(data_from)) + else: + raise InvalidParentResourceDataType(name, data_from, type(data_from)) + + @staticmethod + def validate_transformer_generator_function(f: AnyFun) -> int: + sig = inspect.signature(f) + if len(sig.parameters) == 0: + return 1 + # transformer may take only one positional only argument + pos_only_len = sum(1 for p in sig.parameters.values() if p.kind == p.POSITIONAL_ONLY) + if pos_only_len > 1: + return 2 + first_ar = next(iter(sig.parameters.values())) + # and pos only must be first + if pos_only_len == 1 and first_ar.kind != first_ar.POSITIONAL_ONLY: + return 2 + # first arg must be positional or kw_pos + if first_ar.kind not in (first_ar.POSITIONAL_ONLY, first_ar.POSITIONAL_OR_KEYWORD): + return 3 + return 0 + + +# produce Empty resource singleton +DltResource.Empty = DltResource(Pipe(None), None, False) +TUnboundDltResource = Callable[..., DltResource] diff --git a/dlt/extract/source.py b/dlt/extract/source.py index d36cb4b121..771e8ca0cc 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -1,502 +1,27 @@ import warnings import contextlib -from copy import copy, deepcopy +from copy import copy import makefun import inspect -from typing import AsyncIterable, AsyncIterator, ClassVar, Callable, Dict, Iterable, Iterator, List, Sequence, Tuple, Union, Any, Optional +from typing import Dict, Iterable, Iterator, List, Sequence, Tuple, Any from typing_extensions import Self from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs import known_sections from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer, RelationalNormalizerConfigPropagation +from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnName -from dlt.common.typing import AnyFun, DictStrAny, StrAny, TDataItem, TDataItems, NoneType +from dlt.common.schema.typing import TColumnName, TSchemaContract +from dlt.common.typing import StrAny, TDataItem from dlt.common.configuration.container import Container -from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, resource_state, source_state, pipeline_state -from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, get_callable_name, graph_edges_to_nodes, multi_context_manager, uniq_id - -from dlt.extract.typing import (DataItemWithMeta, ItemTransformFunc, ItemTransformFunctionWithMeta, TDecompositionStrategy, TableNameMeta, - FilterItem, MapItem, YieldMapItem, ValidateItem) -from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep -from dlt.extract.schema import DltResourceSchema, TTableSchemaTemplate -from dlt.extract.incremental import Incremental, IncrementalResourceWrapper -from dlt.extract.exceptions import ( - InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, - DataItemRequiredForDynamicTableHints, InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, - InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer, ResourcesNotFoundError, DeletingResourcesNotSupported) -from dlt.extract.wrappers import wrap_additional_type - - -def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: - """Marks `item` to be dispatched to table `table_name` when yielded from resource function.""" - return DataItemWithMeta(TableNameMeta(table_name), item) - - -class DltResource(Iterable[TDataItem], DltResourceSchema): - """Implements dlt resource. Contains a data pipe that wraps a generating item and table schema that can be adjusted""" - Empty: ClassVar["DltResource"] = None - source_name: str - """Name of the source that contains this instance of the source, set when added to DltResourcesDict""" - section: str - """A config section name""" - - def __init__( - self, - pipe: Pipe, - table_schema_template: TTableSchemaTemplate, - selected: bool, - incremental: IncrementalResourceWrapper = None, - section: str = None, - args_bound: bool = False - ) -> None: - self.section = section - self.selected = selected - self._pipe = pipe - self._args_bound = args_bound - self._explicit_args: DictStrAny = None - if incremental and not self.incremental: - self.add_step(incremental) - self.source_name = None - super().__init__(table_schema_template) - - @classmethod - def from_data( - cls, - data: Any, - name: str = None, - section: str = None, - table_schema_template: TTableSchemaTemplate = None, - selected: bool = True, - data_from: Union["DltResource", Pipe] = None, - incremental: IncrementalResourceWrapper = None - ) -> "DltResource": - if data is None: - raise InvalidResourceDataTypeIsNone(name, data, NoneType) # type: ignore - - if isinstance(data, DltResource): - return data - - if isinstance(data, Pipe): - return cls(data, table_schema_template, selected, incremental=incremental, section=section) - - if callable(data): - name = name or get_callable_name(data) - - # if generator, take name from it - if inspect.isgenerator(data): - name = name or get_callable_name(data) # type: ignore - - # name is mandatory - if not name: - raise ResourceNameMissing() - - # wrap additional types - data = wrap_additional_type(data) - - # several iterable types are not allowed and must be excluded right away - if isinstance(data, (AsyncIterator, AsyncIterable)): - raise InvalidResourceDataTypeAsync(name, data, type(data)) - if isinstance(data, (str, dict)): - raise InvalidResourceDataTypeBasic(name, data, type(data)) - - # check if depends_on is a valid resource - parent_pipe: Pipe = None - if data_from is not None: - DltResource._ensure_valid_transformer_resource(name, data) - parent_pipe = DltResource._get_parent_pipe(name, data_from) - - # create resource from iterator, iterable or generator function - if isinstance(data, (Iterable, Iterator)) or callable(data): - pipe = Pipe.from_data(name, data, parent=parent_pipe) - return cls(pipe, table_schema_template, selected, incremental=incremental, section=section, args_bound=not callable(data)) - else: - # some other data type that is not supported - raise InvalidResourceDataType(name, data, type(data), f"The data type of supplied type is {type(data).__name__}") - - @property - def name(self) -> str: - """Resource name inherited from the pipe""" - return self._pipe.name - - def with_name(self, new_name: str) -> "DltResource": - """Clones the resource with a new name. Such resource keeps separate state and loads data to `new_name` table by default.""" - return self._clone(new_name=new_name, with_parent=True) - - @property - def is_transformer(self) -> bool: - """Checks if the resource is a transformer that takes data from another resource""" - return self._pipe.has_parent - - @property - def requires_args(self) -> bool: - """Checks if resource has unbound arguments""" - try: - self._pipe.ensure_gen_bound() - return False - except (TypeError, ParametrizedResourceUnbound): - return True - - @property - def incremental(self) -> IncrementalResourceWrapper: - """Gets incremental transform if it is in the pipe""" - incremental: IncrementalResourceWrapper = None - step_no = self._pipe.find(IncrementalResourceWrapper, Incremental) - if step_no >= 0: - incremental = self._pipe.steps[step_no] # type: ignore - return incremental - - @property - def validator(self) -> Optional[ValidateItem]: - """Gets validator transform if it is in the pipe""" - validator: ValidateItem = None - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - validator = self._pipe.steps[step_no] # type: ignore[assignment] - return validator - - @validator.setter - def validator(self, validator: Optional[ValidateItem]) -> None: - """Add/remove or replace the validator in pipe""" - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - self._pipe.remove_step(step_no) - if validator: - self.add_step(validator, insert_at=step_no if step_no >= 0 else None) - - def pipe_data_from(self, data_from: Union["DltResource", Pipe]) -> None: - """Replaces the parent in the transformer resource pipe from which the data is piped.""" - if self.is_transformer: - DltResource._ensure_valid_transformer_resource(self.name, self._pipe.gen) - else: - raise ResourceNotATransformer(self.name, "Cannot pipe data into resource that is not a transformer.") - parent_pipe = self._get_parent_pipe(self.name, data_from) - self._pipe.parent = parent_pipe - - def add_pipe(self, data: Any) -> None: - """Creates additional pipe for the resource from the specified data""" - # TODO: (1) self resource cannot be a transformer (2) if data is resource both self must and it must be selected/unselected + cannot be tranformer - raise InvalidResourceDataTypeMultiplePipes(self.name, data, type(data)) - - def select_tables(self, *table_names: Iterable[str]) -> "DltResource": - """For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. - - Both `with_table_name` marker and data-based (function) table name hints are supported. - """ - def _filter(item: TDataItem, meta: Any = None) -> bool: - is_in_meta = isinstance(meta, TableNameMeta) and meta.table_name in table_names - is_in_dyn = self._table_name_hint_fun and self._table_name_hint_fun(item) in table_names - return is_in_meta or is_in_dyn - - # add filtering function at the end of pipe - self.add_filter(_filter) - return self - - def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` - - `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically - - Args: - item_map (ItemTransformFunc[TDataItem]): A function taking a single data item and optional meta argument. Returns transformed data item. - insert_at (int, optional): At which step in pipe to insert the mapping. Defaults to None which inserts after last step - - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(MapItem(item_map)) - else: - self._pipe.insert_step(MapItem(item_map), insert_at) - return self - - def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` - - `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. It may yield 0 or more data items and be used to - ie. pivot an item into sequence of rows. - - Args: - item_map (ItemTransformFunc[Iterator[TDataItem]]): A function taking a single data item and optional meta argument. Yields 0 or more data items. - insert_at (int, optional): At which step in pipe to insert the generator. Defaults to None which inserts after last step - - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(YieldMapItem(item_map)) - else: - self._pipe.insert_step(YieldMapItem(item_map), insert_at) - return self - - def add_filter(self, item_filter: ItemTransformFunc[bool], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` +from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, source_state, pipeline_state +from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, graph_edges_to_nodes - `item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically - - Args: - item_filter (ItemTransformFunc[bool]): A function taking a single data item and optional meta argument. Returns bool. If True, item is kept - insert_at (int, optional): At which step in pipe to insert the filter. Defaults to None which inserts after last step - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(FilterItem(item_filter)) - else: - self._pipe.insert_step(FilterItem(item_filter), insert_at) - return self - - def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 - """Adds a limit `max_items` to the resource pipe - - This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. It is - a no-op for transformers. Those should be limited by their input data. - - Args: - max_items (int): The maximum number of items to yield - Returns: - "DltResource": returns self - """ - def _gen_wrap(gen: TPipeStep) -> TPipeStep: - """Wrap a generator to take the first `max_items` records""" - nonlocal max_items - count = 0 - if inspect.isfunction(gen): - gen = gen() - try: - for i in gen: # type: ignore # TODO: help me fix this later - yield i - count += 1 - if count == max_items: - return - finally: - if inspect.isgenerator(gen): - gen.close() - return - # transformers should be limited by their input, so we only limit non-transformers - if not self.is_transformer: - self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) - return self - - def add_step(self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None) -> "DltResource": # noqa: A003 - if insert_at is None: - self._pipe.append_step(item_transform) - else: - self._pipe.insert_step(item_transform, insert_at) - return self - - def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: - super().set_template(table_schema_template) - incremental = self.incremental - # try to late assign incremental - if table_schema_template.get("incremental") is not None: - if incremental: - incremental._incremental = table_schema_template["incremental"] - else: - # if there's no wrapper add incremental as a transform - incremental = table_schema_template["incremental"] # type: ignore - self.add_step(incremental) - - if incremental: - primary_key = table_schema_template.get("primary_key", incremental.primary_key) - if primary_key is not None: - incremental.primary_key = primary_key - - if table_schema_template.get('validator') is not None: - self.validator = table_schema_template['validator'] - - def bind(self, *args: Any, **kwargs: Any) -> "DltResource": - """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" - if self._args_bound: - raise TypeError(f"Parametrized resource {self.name} is not callable") - orig_gen = self._pipe.gen - gen = self._pipe.bind_gen(*args, **kwargs) - if isinstance(gen, DltResource): - # the resource returned resource: update in place - old_pipe = self._pipe - self.__dict__.clear() - self.__dict__.update(gen.__dict__) - # keep old pipe instance - self._pipe = old_pipe - self._pipe.__dict__.clear() - # write props from new pipe instance - self._pipe.__dict__.update(gen._pipe.__dict__) - elif isinstance(gen, Pipe): - # the resource returned pipe: just replace pipe - self._pipe.__dict__.clear() - # write props from new pipe instance - self._pipe.__dict__.update(gen.__dict__) - else: - self._args_bound = True - self._set_explicit_args(orig_gen, None, *args, **kwargs) # type: ignore - return self - - @property - def explicit_args(self) -> StrAny: - """Returns a dictionary of arguments used to parametrize the resource. Does not include defaults and injected args.""" - if not self._args_bound: - raise TypeError(f"Resource {self.name} is not yet parametrized") - return self._explicit_args - - @property - def state(self) -> StrAny: - """Gets resource-scoped state from the active pipeline. PipelineStateNotAvailable is raised if pipeline context is not available""" - with inject_section(self._get_config_section_context()): - return resource_state(self.name) - - def __call__(self, *args: Any, **kwargs: Any) -> "DltResource": - """Binds the parametrized resources to passed arguments. Creates and returns a bound resource. Generators and iterators are not evaluated.""" - if self._args_bound: - raise TypeError(f"Parametrized resource {self.name} is not callable") - r = self._clone() - return r.bind(*args, **kwargs) - - def __or__(self, transform: Union["DltResource", AnyFun]) -> "DltResource": - """Allows to pipe data from across resources and transform functions with | operator""" - # print(f"{resource.name} | {self.name} -> {resource.name}[{resource.is_transformer}]") - if isinstance(transform, DltResource): - transform.pipe_data_from(self) - # return transformed resource for chaining - return transform - else: - # map or yield map - if inspect.isgeneratorfunction(inspect.unwrap(transform)): - return self.add_yield_map(transform) - else: - return self.add_map(transform) - - def __iter__(self) -> Iterator[TDataItem]: - """Opens iterator that yields the data items from the resources in the same order as in Pipeline class. - - A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. - """ - # use the same state dict when opening iterator and when iterator is iterated - container = Container() - state, _ = pipeline_state(container, {}) - state_context = StateInjectableContext(state=state) - section_context = self._get_config_section_context() - - # managed pipe iterator will set the context on each call to __next__ - with inject_section(section_context), Container().injectable_context(state_context): - pipe_iterator: ManagedPipeIterator = ManagedPipeIterator.from_pipes([self._pipe]) # type: ignore - - pipe_iterator.set_context([state_context, section_context]) - _iter = map(lambda item: item.item, pipe_iterator) - return flatten_list_or_items(_iter) - - def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any) -> None: - try: - sig = sig or inspect.signature(f) - self._explicit_args = sig.bind_partial(*args, **kwargs).arguments - except Exception: - pass - - def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResource": - """Creates a deep copy of a current resource, optionally renaming the resource. The clone will not be part of the source - """ - pipe = self._pipe - if self._pipe and not self._pipe.is_empty: - pipe = pipe._clone(new_name=new_name, with_parent=with_parent) - # incremental and parent are already in the pipe (if any) - return DltResource( - pipe, - deepcopy(self._table_schema_template), - selected=self.selected, - section=self.section - ) - - def _get_config_section_context(self) -> ConfigSectionContext: - container = Container() - proxy = container[PipelineContext] - pipeline = None if not proxy.is_active() else proxy.pipeline() - if pipeline: - pipeline_name = pipeline.pipeline_name - else: - pipeline_name = None - if pipeline: - default_schema_name = pipeline.default_schema_name - else: - default_schema_name = None - if not default_schema_name and pipeline_name: - default_schema_name = pipeline._make_schema_with_default_name().name - return ConfigSectionContext( - pipeline_name=pipeline_name, - # do not emit middle config section to not overwrite the resource section - # only sources emit middle config section - sections=(known_sections.SOURCES, "", self.source_name or default_schema_name or self.name), - source_state_key=self.source_name or default_schema_name or self.section or uniq_id() - ) - - def __str__(self) -> str: - info = f"DltResource [{self.name}]" - if self.section: - info += f" in section [{self.section}]" - if self.source_name: - info += f" added to source [{self.source_name}]:" - else: - info += ":" - - if self.is_transformer: - info += f"\nThis resource is a transformer and takes data items from {self._pipe.parent.name}" - else: - if self._pipe.is_data_bound: - if self.requires_args: - head_sig = inspect.signature(self._pipe.gen) # type: ignore - info += f"\nThis resource is parametrized and takes the following arguments {head_sig}. You must call this resource before loading." - else: - info += "\nIf you want to see the data items in the resource you must iterate it or convert to list ie. list(resource). Note that, like any iterator, you can iterate the resource only once." - else: - info += "\nThis resource is not bound to the data" - info += f"\nInstance: info: (data pipe id:{id(self._pipe)}) at {id(self)}" - return info - - @staticmethod - def _ensure_valid_transformer_resource(name: str, data: Any) -> None: - # resource must be a callable with single argument - if callable(data): - valid_code = DltResource.validate_transformer_generator_function(data) - if valid_code != 0: - raise InvalidTransformerGeneratorFunction(name, get_callable_name(data), inspect.signature(data), valid_code) - else: - raise InvalidTransformerDataTypeGeneratorFunctionRequired(name, data, type(data)) - - @staticmethod - def _get_parent_pipe(name: str, data_from: Union["DltResource", Pipe]) -> Pipe: - # parent resource - if isinstance(data_from, Pipe): - return data_from - elif isinstance(data_from, DltResource): - return data_from._pipe - else: - # if this is generator function provide nicer exception - if callable(data_from): - raise InvalidParentResourceIsAFunction(name, get_callable_name(data_from)) - else: - raise InvalidParentResourceDataType(name, data_from, type(data_from)) - - @staticmethod - def validate_transformer_generator_function(f: AnyFun) -> int: - sig = inspect.signature(f) - if len(sig.parameters) == 0: - return 1 - # transformer may take only one positional only argument - pos_only_len = sum(1 for p in sig.parameters.values() if p.kind == p.POSITIONAL_ONLY) - if pos_only_len > 1: - return 2 - first_ar = next(iter(sig.parameters.values())) - # and pos only must be first - if pos_only_len == 1 and first_ar.kind != first_ar.POSITIONAL_ONLY: - return 2 - # first arg must be positional or kw_pos - if first_ar.kind not in (first_ar.POSITIONAL_ONLY, first_ar.POSITIONAL_OR_KEYWORD): - return 3 - return 0 - - -# produce Empty resource singleton -DltResource.Empty = DltResource(Pipe(None), None, False) -TUnboundDltResource = Callable[..., DltResource] +from dlt.extract.typing import TDecompositionStrategy +from dlt.extract.pipe import Pipe, ManagedPipeIterator +from dlt.extract.hints import DltResourceHints +from dlt.extract.resource import DltResource +from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, ResourcesNotFoundError, DeletingResourcesNotSupported class DltResourceDict(Dict[str, DltResource]): @@ -529,7 +54,7 @@ def extracted(self) -> Dict[str, DltResource]: resource = self[pipe.name] except KeyError: # resource for pipe not found: return mock resource - mock_template = DltResourceSchema.new_table_template( + mock_template = DltResourceHints.new_table_template( pipe.name, write_disposition=resource.write_disposition ) @@ -682,6 +207,14 @@ def max_table_nesting(self) -> int: def max_table_nesting(self, value: int) -> None: RelationalNormalizer.update_normalizer_config(self._schema, {"max_nesting": value}) + @property + def schema_contract(self) -> TSchemaContract: + return self.schema.settings["schema_contract"] + + @schema_contract.setter + def schema_contract(self, settings: TSchemaContract) -> None: + self.schema.set_schema_contract(settings) + @property def exhausted(self) -> bool: """check all selected pipes wether one of them has started. if so, the source is exhausted.""" diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py new file mode 100644 index 0000000000..ddda064aa4 --- /dev/null +++ b/dlt/extract/storage.py @@ -0,0 +1,78 @@ +import os +from typing import ClassVar, Dict + +from dlt.common.data_writers import TLoaderFileFormat + +from dlt.common.utils import uniq_id +from dlt.common.typing import TDataItems +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage + + +class ExtractorItemStorage(DataItemStorage): + load_file_type: TLoaderFileFormat + + def __init__(self, storage: FileStorage, extract_folder: str="extract") -> None: + # data item storage with jsonl with pua encoding + super().__init__(self.load_file_type) + self.extract_folder = extract_folder + self.storage = storage + + + def _get_data_item_path_template(self, load_id: str, schema_name: str, table_name: str) -> str: + template = NormalizeStorage.build_extracted_file_stem(schema_name, table_name, "%s") + return self.storage.make_full_path(os.path.join(self._get_extract_path(load_id), template)) + + def _get_extract_path(self, extract_id: str) -> str: + return os.path.join(self.extract_folder, extract_id) + + +class JsonLExtractorStorage(ExtractorItemStorage): + load_file_type: TLoaderFileFormat = "puae-jsonl" + + +class ArrowExtractorStorage(ExtractorItemStorage): + load_file_type: TLoaderFileFormat = "arrow" + + +class ExtractorStorage(NormalizeStorage): + EXTRACT_FOLDER: ClassVar[str] = "extract" + + """Wrapper around multiple extractor storages with different file formats""" + def __init__(self, C: NormalizeStorageConfiguration) -> None: + super().__init__(True, C) + self._item_storages: Dict[TLoaderFileFormat, ExtractorItemStorage] = { + "puae-jsonl": JsonLExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), + "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER) + } + + def _get_extract_path(self, extract_id: str) -> str: + return os.path.join(self.EXTRACT_FOLDER, extract_id) + + def create_extract_id(self) -> str: + extract_id = uniq_id() + self.storage.create_folder(self._get_extract_path(extract_id)) + return extract_id + + def get_storage(self, loader_file_format: TLoaderFileFormat) -> ExtractorItemStorage: + return self._item_storages[loader_file_format] + + def close_writers(self, extract_id: str) -> None: + for storage in self._item_storages.values(): + storage.close_writers(extract_id) + + def commit_extract_files(self, extract_id: str, with_delete: bool = True) -> None: + extract_path = self._get_extract_path(extract_id) + for file in self.storage.list_folder_files(extract_path, to_root=False): + from_file = os.path.join(extract_path, file) + to_file = os.path.join(NormalizeStorage.EXTRACTED_FOLDER, file) + if with_delete: + self.storage.atomic_rename(from_file, to_file) + else: + # create hardlink which will act as a copy + self.storage.link_hard(from_file, to_file) + if with_delete: + self.storage.delete_folder(extract_path, recursively=True) + + def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: + self.get_storage(file_format).write_data_item(load_id, schema_name, table_name, item, columns) diff --git a/dlt/extract/typing.py b/dlt/extract/typing.py index ad4e23b84f..646267c539 100644 --- a/dlt/extract/typing.py +++ b/dlt/extract/typing.py @@ -138,3 +138,8 @@ class ValidateItem(ItemTransform[TDataItem]): Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`. See `PydanticValidator` for possible implementation. """ + table_name: str + + def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]: + self.table_name = pipe.name + return self diff --git a/dlt/extract/validation.py b/dlt/extract/validation.py index c8e30d0eb2..8bd6c7afb9 100644 --- a/dlt/extract/validation.py +++ b/dlt/extract/validation.py @@ -1,13 +1,13 @@ -from typing import Optional, Protocol, TypeVar, Generic, Type, Union, Any, List +from typing import Optional, Tuple, TypeVar, Generic, Type, Union, Any, List +from dlt.common.schema.schema import Schema try: - from pydantic import BaseModel as PydanticBaseModel, ValidationError as PydanticValidationError, create_model + from pydantic import BaseModel as PydanticBaseModel except ModuleNotFoundError: - PydanticBaseModel = None # type: ignore[misc] + PydanticBaseModel = Any # type: ignore[misc, assignment] -from dlt.extract.exceptions import ValidationError from dlt.common.typing import TDataItems -from dlt.common.schema.typing import TAnySchemaColumns +from dlt.common.schema.typing import TAnySchemaColumns, TSchemaContract, TSchemaEvolutionMode from dlt.extract.typing import TTableHintTemplate, ValidateItem @@ -16,31 +16,54 @@ class PydanticValidator(ValidateItem, Generic[_TPydanticModel]): model: Type[_TPydanticModel] - def __init__(self, model: Type[_TPydanticModel]) -> None: - self.model = model - # Create a model for validating list of items in batch - self.list_model = create_model( - "List" + model.__name__, - items=(List[model], ...) # type: ignore[valid-type] - ) + def __init__(self, model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMode, data_mode: TSchemaEvolutionMode) -> None: + from dlt.common.libs.pydantic import apply_schema_contract_to_model, create_list_model + + self.column_mode: TSchemaEvolutionMode = column_mode + self.data_mode: TSchemaEvolutionMode = data_mode + self.model = apply_schema_contract_to_model(model, column_mode, data_mode) + self.list_model = create_list_model(self.model, data_mode) def __call__(self, item: TDataItems, meta: Any = None) -> Union[_TPydanticModel, List[_TPydanticModel]]: """Validate a data item against the pydantic model""" if item is None: return None - try: - if isinstance(item, list): - return self.list_model(items=item).items # type: ignore[attr-defined, no-any-return] - return self.model.parse_obj(item) - except PydanticValidationError as e: - raise ValidationError(self, item, e) from e + + from dlt.common.libs.pydantic import validate_item, validate_items + + if isinstance(item, list): + return validate_items(self.table_name, self.list_model, item, self.column_mode, self.data_mode) + return validate_item(self.table_name, self.model, item, self.column_mode, self.data_mode) def __str__(self, *args: Any, **kwargs: Any) -> str: return f"PydanticValidator(model={self.model.__qualname__})" -def get_column_validator(columns: TTableHintTemplate[TAnySchemaColumns]) -> Optional[ValidateItem]: +def create_item_validator( + columns: TTableHintTemplate[TAnySchemaColumns], + schema_contract: TTableHintTemplate[TSchemaContract] = None +) -> Tuple[Optional[ValidateItem], TTableHintTemplate[TSchemaContract]]: + """Creates item validator for a `columns` definition and a `schema_contract` + + Returns a tuple (validator, schema contract). If validator could not be created, returns None at first position. + If schema_contract was not specified a default schema contract for given validator will be returned + """ if PydanticBaseModel is not None and isinstance(columns, type) and issubclass(columns, PydanticBaseModel): - return PydanticValidator(columns) - return None + assert not callable(schema_contract), "schema_contract cannot be dynamic for Pydantic item validator" + + from dlt.common.libs.pydantic import extra_to_column_mode, get_extra_from_model + # freeze the columns if we have a fully defined table and no other explicit contract + expanded_schema_contract = Schema.expand_schema_contract_settings( + schema_contract, + # corresponds to default Pydantic behavior + default={"tables": "evolve", "columns": extra_to_column_mode(get_extra_from_model(columns)), "data_type": "freeze"} + ) + return (PydanticValidator( + columns, + expanded_schema_contract["columns"], + expanded_schema_contract["data_type"] + ), + schema_contract or expanded_schema_contract + ) + return None, schema_contract diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 2a9c76cc76..e0329d583c 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -25,7 +25,7 @@ from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.runtime.collector import NULL_COLLECTOR -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.pipeline.helpers import retry_load from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.progress import log diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index f34f8b6fdc..13b408945c 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration @@ -6,6 +6,7 @@ from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType from dlt.common.storages import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration + @configspec class ItemsNormalizerConfiguration(BaseConfiguration): add_dlt_id: bool = False diff --git a/dlt/normalize/exceptions.py b/dlt/normalize/exceptions.py index e69de29bb2..79da16b925 100644 --- a/dlt/normalize/exceptions.py +++ b/dlt/normalize/exceptions.py @@ -0,0 +1,5 @@ +from dlt.common.exceptions import DltException + +class NormalizeException(DltException): + def __init__(self, msg: str) -> None: + super().__init__(msg) diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index b9bd5468dc..6146d864b6 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -1,20 +1,20 @@ import os -from typing import List, Dict, Tuple, Protocol, Any -from pathlib import Path +from typing import List, Dict, Set, Tuple, Any from abc import abstractmethod from dlt.common import json, logger -from dlt.common.json import custom_pua_decode +from dlt.common.json import custom_pua_decode, may_have_pua from dlt.common.runtime import signals -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.common.storages import NormalizeStorage, LoadStorage, NormalizeStorageConfiguration, FileStorage -from dlt.common.typing import TDataItem +from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict +from dlt.common.storages import NormalizeStorage, LoadStorage, FileStorage +from dlt.common.typing import DictStrAny, TDataItem from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.utils import TRowCount, merge_row_count, increase_row_count -from dlt.normalize.configuration import NormalizeConfiguration from dlt.common.exceptions import MissingDependencyException from dlt.common.normalizers.utils import generate_dlt_ids +from dlt.normalize.configuration import NormalizeConfiguration + try: from dlt.common.libs import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa @@ -44,54 +44,136 @@ def __call__(self, extracted_items_file: str, root_table_name: str) -> Tuple[Lis class JsonLItemsNormalizer(ItemsNormalizer): - def _normalize_chunk(self, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: - column_schemas: Dict[ - str, TTableSchemaColumns - ] = {} # quick access to column schema for writers below + def __init__( + self, + load_storage: LoadStorage, + normalize_storage: NormalizeStorage, + schema: Schema, + load_id: str, + config: NormalizeConfiguration + ) -> None: + super().__init__(load_storage, normalize_storage, schema, load_id, config) + self._table_contracts: Dict[str, TSchemaContractDict] = {} + self._filtered_tables: Set[str] = set() + self._filtered_tables_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] = {} + # quick access to column schema for writers below + self._column_schemas: Dict[str, TTableSchemaColumns] = {} + + def _filter_columns(self, filtered_columns: Dict[str, TSchemaEvolutionMode], row: DictStrAny) -> DictStrAny: + for name, mode in filtered_columns.items(): + if name in row: + if mode == "discard_row": + return None + elif mode == "discard_value": + row.pop(name) + return row + + def _normalize_chunk(self, root_table_name: str, items: List[TDataItem], may_have_pua: bool) -> Tuple[TSchemaUpdate, int, TRowCount]: + column_schemas = self._column_schemas schema_update: TSchemaUpdate = {} schema = self.schema schema_name = schema.name items_count = 0 row_counts: TRowCount = {} + normalize_data_fun = self.schema.normalize_data_item for item in items: - for (table_name, parent_table), row in self.schema.normalize_data_item( - item, self.load_id, root_table_name - ): - # filter row, may eliminate some or all fields - row = schema.filter_row(table_name, row) - # do not process empty rows - if row: + items_gen = normalize_data_fun(item, self.load_id, root_table_name) + try: + should_descend: bool = None + # use send to prevent descending into child rows when row was discarded + while row_info := items_gen.send(should_descend): + should_descend = True + (table_name, parent_table), row = row_info + + # rows belonging to filtered out tables are skipped + if table_name in self._filtered_tables: + # stop descending into further rows + should_descend = False + continue + + # filter row, may eliminate some or all fields + row = schema.filter_row(table_name, row) + # do not process empty rows + if not row: + should_descend = False + continue + + # filter columns or full rows if schema contract said so + # do it before schema inference in `coerce_row` to not trigger costly migration code + filtered_columns = self._filtered_tables_columns.get(table_name, None) + if filtered_columns: + row = self._filter_columns(filtered_columns, row) # type: ignore[arg-type] + # if whole row got dropped + if not row: + should_descend = False + continue + # decode pua types - for k, v in row.items(): - row[k] = custom_pua_decode(v) # type: ignore + if may_have_pua: + for k, v in row.items(): + row[k] = custom_pua_decode(v) # type: ignore + # coerce row of values into schema table, generating partial table with new columns if any row, partial_table = schema.coerce_row( table_name, parent_table, row ) - # theres a new table or new columns in existing table + + # if we detect a migration, check schema contract if partial_table: + schema_contract = self._table_contracts.setdefault( + table_name, + schema.resolve_contract_settings_for_table(parent_table or table_name) # parent_table, if present, exists in the schema + ) + partial_table, filters = schema.apply_schema_contract(schema_contract, partial_table, data_item=row) + if filters: + for entity, name, mode in filters: + if entity == "tables": + self._filtered_tables.add(name) + elif entity == "columns": + filtered_columns = self._filtered_tables_columns.setdefault(table_name, {}) + filtered_columns[name] = mode + + if partial_table is None: + # discard migration and row + should_descend = False + continue + # theres a new table or new columns in existing table # update schema and save the change schema.update_table(partial_table) table_updates = schema_update.setdefault(table_name, []) table_updates.append(partial_table) + # update our columns column_schemas[table_name] = schema.get_table_columns( table_name ) + + # apply new filters + if filtered_columns and filters: + row = self._filter_columns(filtered_columns, row) + # do not continue if new filters skipped the full row + if not row: + should_descend = False + continue + # get current columns schema columns = column_schemas.get(table_name) if not columns: columns = schema.get_table_columns(table_name) column_schemas[table_name] = columns # store row - # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock + # TODO: store all rows for particular items all together after item is fully completed + # will be useful if we implement bad data sending to a table self.load_storage.write_data_item( self.load_id, schema_name, table_name, row, columns ) # count total items + # TODO: take counts and bytes from buffered file writers instead of taking those here items_count += 1 increase_row_count(row_counts, table_name, 1) + except StopIteration: + pass signals.raise_if_signalled() return schema_update, items_count, row_counts @@ -102,12 +184,13 @@ def __call__( ) -> Tuple[List[TSchemaUpdate], int, TRowCount]: schema_updates: List[TSchemaUpdate] = [] row_counts: TRowCount = {} - with self.normalize_storage.storage.open_file(extracted_items_file) as f: + with self.normalize_storage.storage.open_file(extracted_items_file, "rb") as f: # enumerate jsonl file line by line items_count = 0 + line: bytes for line_no, line in enumerate(f): - items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = self._normalize_chunk(root_table_name, items) + items: List[TDataItem] = json.loadb(line) + partial_update, items_count, r_counts = self._normalize_chunk(root_table_name, items, may_have_pua(line)) schema_updates.append(partial_update) merge_row_count(row_counts, r_counts) logger.debug( @@ -134,6 +217,7 @@ def _write_with_dlt_columns( table_updates.append(table_update) load_id_type = pa.dictionary(pa.int8(), pa.string()) new_columns.append(( + -1, pa.field("_dlt_load_id", load_id_type, nullable=False), lambda batch: pa.array([load_id] * batch.num_rows, type=load_id_type) )) @@ -143,6 +227,7 @@ def _write_with_dlt_columns( table_updates = schema_update.setdefault(root_table_name, []) table_updates.append(table_update) new_columns.append(( + -1, pa.field("_dlt_id", pyarrow.pyarrow.string(), nullable=False), lambda batch: pa.array(generate_dlt_ids(batch.num_rows)) )) @@ -186,7 +271,6 @@ def __call__( self, extracted_items_file: str, root_table_name: str ) -> Tuple[List[TSchemaUpdate], int, TRowCount]: base_schema_update = self._fix_schema_precisions(root_table_name) - import pyarrow as pa add_dlt_id = self.config.parquet_normalizer.add_dlt_id add_dlt_load_id = self.config.parquet_normalizer.add_dlt_load_id diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index d60ea05965..ab87a5a2a1 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -14,7 +14,6 @@ from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration -from dlt.common.typing import TDataItem from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import NormalizeInfo @@ -52,7 +51,12 @@ def create_storages(self) -> None: # pass initial normalize storage config embedded in normalize config self.normalize_storage = NormalizeStorage(True, config=self.config._normalize_storage_config) # normalize saves in preferred format but can read all supported formats - self.load_storage = LoadStorage(True, self.config.destination_capabilities.preferred_loader_file_format, LoadStorage.ALL_SUPPORTED_FILE_FORMATS, config=self.config._load_storage_config) + self.load_storage = LoadStorage( + True, + self.config.destination_capabilities.preferred_loader_file_format, + LoadStorage.ALL_SUPPORTED_FILE_FORMATS, + config=self.config._load_storage_config + ) @staticmethod def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Schema: @@ -237,7 +241,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMap self.load_storage.config, schema.to_dict(), load_id, - files, + files ) self.update_table(schema, result[0]) self.collector.update("Files", len(result[2])) @@ -246,14 +250,14 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMap def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files: Sequence[str]) -> None: schema = Normalize.load_or_create_schema(self.schema_storage, schema_name) - # process files in parallel or in single thread, depending on map_f schema_updates, row_counts = map_f(schema, load_id, files) - # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) - if len(schema_updates) > 0: - logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") - # schema is updated, save it to schema volume - self.schema_storage.save_schema(schema) + # remove normalizer specific info + for table in schema.tables.values(): + table.pop("x-normalizer", None) # type: ignore[typeddict-item] + logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") + # schema is updated, save it to schema volume + self.schema_storage.save_schema(schema) # save schema to temp load folder self.load_storage.save_temp_schema(schema, load_id) # save schema updates even if empty diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index af7dd12294..0f173307a0 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -1,13 +1,13 @@ from typing import Sequence, cast, overload from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnSchema, TWriteDisposition +from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.typing import TSecretValue, Any from dlt.common.configuration import with_config from dlt.common.configuration.container import Container from dlt.common.configuration.inject import get_orig_args, last_config -from dlt.common.destination import Destination, TDestinationReferenceArg +from dlt.common.destination import TLoaderFileFormat, Destination, TDestinationReferenceArg from dlt.common.pipeline import LoadInfo, PipelineContext, get_dlt_pipelines_dir from dlt.pipeline.configuration import PipelineConfiguration, ensure_correct_pipeline_kwargs @@ -177,7 +177,9 @@ def run( table_name: str = None, write_disposition: TWriteDisposition = None, columns: Sequence[TColumnSchema] = None, - schema: Schema = None + schema: Schema = None, + loader_file_format: TLoaderFileFormat = None, + schema_contract: TSchemaContract = None, ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -237,7 +239,9 @@ def run( table_name=table_name, write_disposition=write_disposition, columns=columns, - schema=schema + schema=schema, + loader_file_format=loader_file_format, + schema_contract=schema_contract ) # plug default tracking module diff --git a/dlt/pipeline/mark.py b/dlt/pipeline/mark.py index 5f880d8711..14a7108683 100644 --- a/dlt/pipeline/mark.py +++ b/dlt/pipeline/mark.py @@ -1,2 +1,2 @@ """Module with market functions that make data to be specially processed""" -from dlt.extract.source import with_table_name \ No newline at end of file +from dlt.extract import with_table_name \ No newline at end of file diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index 465eccfdb6..b9eb958027 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -5,7 +5,6 @@ from functools import wraps from collections.abc import Sequence as C_Sequence from typing import Any, Callable, ClassVar, List, Iterator, Optional, Sequence, Tuple, cast, get_type_hints, ContextManager -from concurrent.futures import Executor from dlt import version from dlt.common import json, logger, pendulum @@ -19,7 +18,8 @@ MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException) from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract +from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner @@ -34,9 +34,9 @@ from dlt.common.utils import is_interactive from dlt.common.data_writers import TLoaderFileFormat +from dlt.extract import DltResource, DltSource from dlt.extract.exceptions import SourceExhausted from dlt.extract.extract import ExtractorStorage, extract_with_schema -from dlt.extract.source import DltResource, DltSource from dlt.normalize import Normalize from dlt.normalize.configuration import NormalizeConfiguration from dlt.destinations.sql_client import SqlClientBase @@ -50,8 +50,6 @@ from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, load_trace, merge_traces, start_trace, start_trace_step, end_trace_step, end_trace, describe_extract_data from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.state_sync import STATE_ENGINE_VERSION, load_state_from_destination, merge_state_if_changed, migrate_state, state_resource, json_encode_state, json_decode_state - -from dlt.common.schema.utils import normalize_schema_name from dlt.pipeline.deprecations import credentials_argument_deprecated @@ -82,8 +80,11 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # refresh live schemas in storage or import schema path self._schema_storage.commit_live_schema(name) rv = f(self, *args, **kwargs) + # save modified live schemas + for name in self._schema_storage.live_schemas: + self._schema_storage.commit_live_schema(name) # refresh list of schemas if any new schemas are added - self.schema_names = self._schema_storage.list_schemas() + self.schema_names = self._list_schemas_sorted() return rv return _wrap # type: ignore @@ -268,7 +269,8 @@ def extract( primary_key: TColumnNames = None, schema: Schema = None, max_parallel_items: int = None, - workers: int = None + workers: int = None, + schema_contract: TSchemaContract = None ) -> ExtractInfo: """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. See `run` method for the arguments' description.""" # create extract storage to which all the sources will be extracted @@ -277,7 +279,7 @@ def extract( try: with self._maybe_destination_capabilities(): # extract all sources - for source in self._data_to_sources(data, schema, table_name, parent_table_name, write_disposition, columns, primary_key): + for source in self._data_to_sources(data, schema, table_name, parent_table_name, write_disposition, columns, primary_key, schema_contract): if source.exhausted: raise SourceExhausted(source.name) # TODO: merge infos for all the sources @@ -288,6 +290,7 @@ def extract( # TODO: if we fail here we should probably wipe out the whole extract folder for extract_id in extract_ids: storage.commit_extract_files(extract_id) + return ExtractInfo(describe_extract_data(data)) except Exception as exc: # TODO: provide metrics from extractor @@ -394,7 +397,8 @@ def run( columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None + loader_file_format: TLoaderFileFormat = None, + schema_contract: TSchemaContract = None ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -444,6 +448,8 @@ def run( loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional). The file format the loader will use to create the load package. Not all file_formats are compatible with all destinations. Defaults to the preferred file format of the selected destination. + schema_contract (TSchemaContract, optional): On override for the schema contract settings, this will replace the schema contract settings for all tables in the schema. Defaults to None. + Raises: PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. Returns: @@ -470,9 +476,10 @@ def run( logger.warn("The pipeline `run` method will now load the pending load packages. The data you passed to the run function will not be loaded. In order to do that you must run the pipeline again") return self.load(destination, dataset_name, credentials=credentials) + # extract from the source if data is not None: - self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema) + self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema, schema_contract=schema_contract) self.normalize(loader_file_format=loader_file_format) return self.load(destination, dataset_name, credentials=credentials) else: @@ -808,21 +815,34 @@ def _data_to_sources(self, parent_table_name: str = None, write_disposition: TWriteDisposition = None, columns: TAnySchemaColumns = None, - primary_key: TColumnNames = None + primary_key: TColumnNames = None, + schema_contract: TSchemaContract = None ) -> List[DltSource]: def apply_hint_args(resource: DltResource) -> None: - # apply hints only if any of the hints is present, table_name must be always present - if table_name or parent_table_name or write_disposition or columns or primary_key: - resource.apply_hints(table_name or resource.table_name or resource.name, parent_table_name, write_disposition, columns, primary_key) + resource.apply_hints( + table_name, + parent_table_name, + write_disposition, + columns, + primary_key, + schema_contract=schema_contract + ) + + def apply_settings(source_: DltSource) -> None: + # apply schema contract settings + if schema_contract: + source_.schema_contract = schema_contract def choose_schema() -> Schema: """Except of explicitly passed schema, use a clone that will get discarded if extraction fails""" if schema: - return schema - if self.default_schema_name: - return self.default_schema.clone() - return self._make_schema_with_default_name() + schema_ = schema + elif self.default_schema_name: + schema_ = self.default_schema.clone() + else: + schema_ = self._make_schema_with_default_name() + return schema_ effective_schema = choose_schema() @@ -835,14 +855,8 @@ def append_data(data_item: Any) -> None: # if schema is explicit then override source schema if schema: data_item.schema = schema - # try to apply hints to resources - _resources = data_item.resources.values() - for r in _resources: - apply_hint_args(r) sources.append(data_item) elif isinstance(data_item, DltResource): - # apply hints - apply_hint_args(data_item) # do not set section to prevent source that represent a standalone resource # to overwrite other standalone resources (ie. parents) in that source sources.append( @@ -851,10 +865,9 @@ def append_data(data_item: Any) -> None: else: # iterator/iterable/generator # create resource first without table template - resource = DltResource.from_data(data_item, name=table_name, section=self.pipeline_name) - # apply hints - apply_hint_args(resource) - resources.append(resource) + resources.append( + DltResource.from_data(data_item, name=table_name, section=self.pipeline_name) + ) if isinstance(data, C_Sequence) and len(data) > 0: # if first element is source or resource @@ -866,36 +879,42 @@ def append_data(data_item: Any) -> None: else: append_data(data) + # add all the appended resources in one source if resources: - # add all the appended resources in one source sources.append(DltSource(effective_schema.name, self.pipeline_name, effective_schema, resources)) + # apply hints and settings + for source in sources: + apply_settings(source) + for resource in source.selected_resources.values(): + apply_hint_args(resource) + return sources def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int) -> str: - # discover the schema from source - source_schema = source.schema - source_schema.update_normalizers() + # discover the existing pipeline schema + if source.schema.name in self.schemas: + # use clone until extraction complete + pipeline_schema = self.schemas[source.schema.name].clone() + # apply all changes in the source schema to pipeline schema + # NOTE: we do not apply contracts to changes done programmatically + pipeline_schema.update_schema(source.schema) + # replace schema in the source + source.schema = pipeline_schema # extract into pipeline schema - extract_id = extract_with_schema(storage, source, source_schema, self.collector, max_parallel_items, workers) + extract_id = extract_with_schema(storage, source, self.collector, max_parallel_items, workers) # save import with fully discovered schema - self._schema_storage.save_import_schema_if_not_exists(source_schema) + self._schema_storage.save_import_schema_if_not_exists(source.schema) - # if source schema does not exist in the pipeline - if source_schema.name not in self._schema_storage: - # create new schema - self._schema_storage.save_schema(source_schema) - - # update pipeline schema (do contract checks here) - pipeline_schema = self._schema_storage[source_schema.name] - pipeline_schema.update_schema(source_schema) + # update live schema but not update the store yet + self._schema_storage.update_live_schema(source.schema) # set as default if this is first schema in pipeline if not self.default_schema_name: # this performs additional validations as schema contains the naming module - self._set_default_schema_name(pipeline_schema) + self._set_default_schema_name(source.schema) return extract_id @@ -1212,12 +1231,6 @@ def managed_state(self, *, extract_state: bool = False) -> Iterator[TPipelineSta backup_state = self._get_state() # restore original pipeline props self._state_to_props(backup_state) - # synchronize schema storage with initial list of schemas, note that we'll not be able to synchronize the schema content - if self._schema_storage: - # TODO: we should restore schemas backup here - for existing_schema_name in self._schema_storage.list_schemas(): - if existing_schema_name not in self.schema_names: - self._schema_storage.remove_schema(existing_schema_name) # raise original exception raise else: @@ -1268,7 +1281,11 @@ def _props_to_state(self, state: TPipelineState) -> None: state["destination"] = self.destination.name if self.staging: state["staging"] = self.staging.name - state["schema_names"] = self._schema_storage.list_schemas() + state["schema_names"] = self._list_schemas_sorted() + + def _list_schemas_sorted(self) -> List[str]: + """Lists schema names sorted to have deterministic state""" + return sorted(self._schema_storage.list_schemas()) def _save_state(self, state: TPipelineState) -> None: self._pipeline_storage.save(Pipeline.STATE_FILE, json_encode_state(state)) @@ -1278,7 +1295,7 @@ def _extract_state(self, state: TPipelineState) -> TPipelineState: # note: the schema will be persisted because the schema saving decorator is over the state manager decorator for extract state_source = DltSource(self.default_schema.name, self.pipeline_name, self.default_schema, [state_resource(state)]) storage = ExtractorStorage(self._normalize_storage_config) - extract_id = extract_with_schema(storage, state_source, self.default_schema, _NULL_COLLECTOR, 1, 1) + extract_id = extract_with_schema(storage, state_source, _NULL_COLLECTOR, 1, 1) storage.commit_extract_files(extract_id) return state diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index 581ed4c2bd..a9603b8f66 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -12,7 +12,7 @@ from dlt.common.schema.typing import STATE_TABLE_NAME, TTableSchemaColumns from dlt.common.destination.reference import JobClientBase, WithStateSync -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException from dlt.common.utils import compressed_b64decode, compressed_b64encode diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 2ba71396f6..46ab524aa1 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -14,7 +14,7 @@ from dlt.common.typing import DictStrAny, StrAny from dlt.common.utils import uniq_id -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.exceptions import PipelineStepFailed diff --git a/dlt/reflection/script_inspector.py b/dlt/reflection/script_inspector.py index 204135dcd7..9899e2b157 100644 --- a/dlt/reflection/script_inspector.py +++ b/dlt/reflection/script_inspector.py @@ -12,7 +12,8 @@ from dlt.common.typing import DictStrAny from dlt.pipeline import Pipeline -from dlt.extract.source import DltSource, ManagedPipeIterator +from dlt.extract import DltSource +from dlt.extract.pipe import ManagedPipeIterator def patch__init__(self: Any, *args: Any, **kwargs: Any) -> None: diff --git a/dlt/sources/__init__.py b/dlt/sources/__init__.py index 6e418a3cb2..465467db67 100644 --- a/dlt/sources/__init__.py +++ b/dlt/sources/__init__.py @@ -1,7 +1,6 @@ """Module with built in sources and source building blocks""" -from dlt.extract.incremental import Incremental as incremental -from dlt.extract.source import DltSource, DltResource from dlt.common.typing import TDataItem, TDataItems +from dlt.extract import DltSource, DltResource, Incremental as incremental from . import credentials from . import config from . import filesystem diff --git a/docs/examples/archive/sources/rasa/rasa.py b/docs/examples/archive/sources/rasa/rasa.py index aa31b3c482..b498f9c3de 100644 --- a/docs/examples/archive/sources/rasa/rasa.py +++ b/docs/examples/archive/sources/rasa/rasa.py @@ -3,7 +3,7 @@ import dlt from dlt.common.typing import StrAny, TDataItem, TDataItems from dlt.common.time import timestamp_within -from dlt.extract.source import DltResource +from dlt.extract.resource import DltResource @dlt.source diff --git a/docs/examples/incremental_loading/zendesk.py b/docs/examples/incremental_loading/zendesk.py index 6370f29811..3f433e3fef 100644 --- a/docs/examples/incremental_loading/zendesk.py +++ b/docs/examples/incremental_loading/zendesk.py @@ -1,10 +1,9 @@ -from typing import Iterator, Optional, Dict, Any, Tuple +from typing import Optional, Dict, Any, Tuple import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime -from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime -from dlt.extract.source import DltResource +from dlt.common.typing import TAnyDateTime from dlt.sources.helpers.requests import client diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index 2d674407bc..4c3d3f0b3a 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -10,13 +10,12 @@ def incremental_snippet() -> None: # @@@DLT_SNIPPET_START example # @@@DLT_SNIPPET_START markdown_source - from typing import Iterator, Optional, Dict, Any, Tuple + from typing import Optional, Dict, Any, Tuple import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime - from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime - from dlt.extract.source import DltResource + from dlt.common.typing import TAnyDateTime from dlt.sources.helpers.requests import client diff --git a/docs/website/docs/general-usage/data-contracts.md b/docs/website/docs/general-usage/data-contracts.md new file mode 100644 index 0000000000..543edf2502 --- /dev/null +++ b/docs/website/docs/general-usage/data-contracts.md @@ -0,0 +1,81 @@ +--- +title: Data Contracts +description: Data contracts and controlling schema evolution +keywords: [data contracts, schema, dlt schema, pydantic] +--- + +## Data contracts and controlling schema evolution + +`dlt` will evolve the schema of the destination to accomodate the structure and data types of the extracted data. There are several settings +that you can use to control this automatic schema evolution, from the default settings where all changes to the schema are accepted to +a frozen schema that does not change at all. + +Consider this example: + +```py +@dlt.resource(schema_contract={"tables": "evolve", "columns": "freeze"}) +def items(): + ... +``` + +This resource will allow new subtables to be created, but will throw an exception if data is extracted for an existing table which +contains a new column. + +### Possible settings + +The `schema_contract` exists on the `source` decorator as a directive for all resources of that source and on the +`resource` decorator as a directive for the individual resource. Additionally it exists on the `pipeline.run()` method, which will override all existing settings. +The `schema_contract` is a dictionary with keys that control the following: + +* `table` creating of new tables and subtables +* `columns` creating of new columns on an existing table +* `data_type` creating of new variant columns, which happens if a different datatype is discovered in the extracted data than exists in the schema + +Each property can be set to one of three values: +* `freeze`: This will raise an exception if data is encountered that does not fit the existing schema, so no data will be loaded to the destination +* `discard_row`: This will discard any extracted row if it does not adhere to the existing schema, and this row will not be loaded to the destination. All other rows will be. +* `discard_value`: This will discard data in an extracted row that does not adhere to the existing schema and the row will be loaded without this data. + +If a table is a new table that has not been created on the destination yet, dlt will allow the creation of all columns and variants on the first run + +### Code Examples + +The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. + +```py +@dlt.resource(schema_contract={"tables": "discard_row", "columns": "evolve", "data_type": "freeze"}) +def items(): + ... +``` + +The below Code will raise on any encountered schema change. Note: You can always set a string which will be interpreted as though all keys are set to these values. + +```py +pipeline.run(my_source(), schema_contract="freeze") +``` + +The below code defines some settings on the source which can be overwritten on the resource which in turn can be overwritten by the global override on the `run` method. +Here for all resources variant columns are frozen and raise an error if encountered, on `items` new columns are allowed but `other_items` inherits the `freeze` setting from +the source, thus new columns are frozen there. New tables are allowed. + +```py +@dlt.resource(schema_contract={"columns": "evolve"}) +def items(): + ... + +@dlt.resource() +def other_items(): + ... + +@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"}): +def source(): + return [items(), other_items()] + + +# this will use the settings defined by the decorators +pipeline.run(source()) + +# this will freeze the whole schema, regardless of the decorator settings +pipeline.run(source(), schema_contract="freeze") + +``` \ No newline at end of file diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index e203b3d93a..a7f68fadd1 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -110,21 +110,27 @@ Things to note: - Fields with an `Optional` type are marked as `nullable` - Fields with a `Union` type are converted to the first (not `None`) type listed in the union. E.g. `status: Union[int, str]` results in a `bigint` column. -- `list`, `dict` and nested pydantic model fields will use the `complex` type which means they'll be stored as a JSON object in the database instead of creating child tables. You can override this by manually calling the pydantic helper with `skip_complex_types=True`, see below: +- `list`, `dict` and nested pydantic model fields will use the `complex` type which means they'll be stored as a JSON object in the database instead of creating child tables. + +You can override this by configuring the Pydantic model ```python -from dlt.common.lib.pydantic import pydantic_to_table_schema_columns +from typing import ClassVar +from dlt.common.libs.pydantic import DltConfig -... +class UserWithNesting(User): + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} -@dlt.resource(name="user", columns=pydantic_to_table_schema_columns(User, skip_complex_types=True)) +@dlt.resource(name="user", columns=UserWithNesting) def get_users(): ... ``` -This omits any `dict`/`list`/`BaseModel` type fields from the schema, so dlt will fall back on the default +`"skip_complex_types"` omits any `dict`/`list`/`BaseModel` type fields from the schema, so dlt will fall back on the default behaviour of creating child tables for these fields. +We do not support `RootModel` that validate simple types. You can add such validator yourself, see [data filtering section](#filter-transform-and-pivot-data). + ### Dispatch data to many tables You can load data to many tables from a single resource. The most common case is a stream of events diff --git a/docs/website/docs/reference/tracing.md b/docs/website/docs/reference/tracing.md new file mode 100644 index 0000000000..0ad0a59912 --- /dev/null +++ b/docs/website/docs/reference/tracing.md @@ -0,0 +1,6 @@ +1. Identifiers + +2. Data Lineage + +3. Schema Lineage + diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 0dc7416caa..9ae94a8514 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -106,6 +106,7 @@ const sidebars = { 'general-usage/incremental-loading', 'general-usage/full-loading', 'general-usage/schema', + 'general-usage/data-contracts', { type: 'category', label: 'Configuration', diff --git a/poetry.lock b/poetry.lock index e925740fb7..018c1357fe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -136,6 +136,17 @@ python-versions = ">=3.7, <4" about-time = "4.2.1" grapheme = "0.6.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} + [[package]] name = "ansicon" version = "1.89.0" @@ -164,7 +175,7 @@ trio = ["trio (>=0.22)"] [[package]] name = "apache-airflow" -version = "2.7.0" +version = "2.7.2" description = "Programmatically author, schedule and monitor data pipelines" category = "dev" optional = false @@ -191,7 +202,7 @@ cryptography = ">=0.9.3" deprecated = ">=1.2.13" dill = ">=0.2.2" flask = ">=2.2,<2.3" -flask-appbuilder = "4.3.3" +flask-appbuilder = "4.3.6" flask-caching = ">=1.5.0" flask-login = ">=0.6.2" flask-session = ">=0.4.0" @@ -200,7 +211,7 @@ google-re2 = ">=1.0" graphviz = ">=0.12" gunicorn = ">=20.1.0" httpx = "*" -importlib-metadata = {version = ">=1.7,<5.0.0", markers = "python_version < \"3.9\""} +importlib-metadata = {version = ">=1.7", markers = "python_version < \"3.9\""} importlib-resources = {version = ">=5.2", markers = "python_version < \"3.9\""} itsdangerous = ">=2.0" jinja2 = ">=3.0.0" @@ -213,14 +224,14 @@ markdown-it-py = ">=2.1.0" markupsafe = ">=1.1.1" marshmallow-oneofschema = ">=2.0.1" mdit-py-plugins = ">=0.3.0" -opentelemetry-api = "1.15.0" +opentelemetry-api = ">=1.15.0" opentelemetry-exporter-otlp = "*" packaging = ">=14.0" pathspec = ">=0.9.0" pendulum = ">=2.0" pluggy = ">=1.0" psutil = ">=4.2.0" -pydantic = ">=1.10.0,<2.0.0" +pydantic = ">=1.10.0" pygments = ">=2.0.1" pyjwt = ">=2.0.0" python-daemon = ">=3.0.0" @@ -231,7 +242,7 @@ rfc3339-validator = ">=0.1.4" rich = ">=12.4.4" rich-argparse = ">=1.0.0" setproctitle = ">=1.1.8" -sqlalchemy = ">=1.4,<2.0" +sqlalchemy = ">=1.4.28,<2.0" sqlalchemy-jsonfield = ">=1.0" tabulate = ">=0.7.5" tenacity = ">=6.2.0,<8.2.0 || >8.2.0" @@ -244,8 +255,8 @@ werkzeug = ">=2.0" aiobotocore = ["aiobotocore (>=2.1.1)"] airbyte = ["apache-airflow-providers-airbyte"] alibaba = ["apache-airflow-providers-alibaba"] -all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.24.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>7,<7.15.0)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "sasl (>=0.3.1)", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "zenpy (>=2.0.24)"] -all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.5.0)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.2)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive] (>=0.6.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sasl (>=0.3.1)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] +all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>8,<9)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "zenpy (>=2.0.24)"] +all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.5.0)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.2)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] amazon = ["apache-airflow-providers-amazon"] apache-atlas = ["atlasclient (>=0.1.2)"] apache-beam = ["apache-airflow-providers-apache-beam"] @@ -273,7 +284,7 @@ atlassian-jira = ["apache-airflow-providers-atlassian-jira"] aws = ["apache-airflow-providers-amazon"] azure = ["apache-airflow-providers-microsoft-azure"] cassandra = ["apache-airflow-providers-apache-cassandra"] -celery = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-celery", "celery (>=5.2.3,<6)", "flower (>=1.0.0)"] +celery = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-celery", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "flower (>=1.0.0)"] cgroups = ["cgroupspy (>=0.2.2)"] cloudant = ["apache-airflow-providers-cloudant"] cncf-kubernetes = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-cncf-kubernetes", "asgiref (>=3.5.2)", "cryptography (>=2.0.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)"] @@ -284,13 +295,13 @@ databricks = ["apache-airflow-providers-databricks"] datadog = ["apache-airflow-providers-datadog"] dbt-cloud = ["apache-airflow-providers-dbt-cloud"] deprecated-api = ["requests (>=2.26.0)"] -devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3)", "aws-xray-sdk", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "bowler", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "ipdb", "jira", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "paramiko", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pygithub", "pypsrp", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-jose", "pywinrm", "qds-sdk (>=1.9.6)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] -devel-all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.24.0)", "bowler", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>7,<7.15.0)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "jira", "json-merge-patch (>=0.2)", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-jose", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "qds-sdk (>=1.9.6)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.24.0)", "bowler", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>7,<7.15.0)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "jira", "json-merge-patch (>=0.2)", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-jose", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "qds-sdk (>=1.9.6)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3)", "aws-xray-sdk", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "bowler", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "jira", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "paramiko", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.2)", "pyarrow (>=9.0.0)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pypsrp", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-jose", "pywinrm", "qds-sdk (>=1.9.6)", "requests-kerberos (>=0.10.0)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "ipdb", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "openapi-spec-validator (>=0.2.8)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pygithub", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel-all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openapi-spec-validator (>=0.2.8)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] +devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openapi-spec-validator (>=0.2.8)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] +devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "openapi-spec-validator (>=0.2.8)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.2)", "pyarrow (>=9.0.0)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-kerberos (>=0.10.0)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] dingding = ["apache-airflow-providers-dingding"] discord = ["apache-airflow-providers-discord"] -doc = ["astroid (>=2.12.3)", "checksumdir", "click (>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] +doc = ["astroid (>=2.12.3,<3.0)", "checksumdir", "click (>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] doc-gen = ["eralchemy2"] docker = ["apache-airflow-providers-docker"] druid = ["apache-airflow-providers-apache-druid"] @@ -301,9 +312,9 @@ ftp = ["apache-airflow-providers-ftp"] gcp = ["apache-airflow-providers-google"] gcp-api = ["apache-airflow-providers-google"] github = ["apache-airflow-providers-github"] -github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.3)"] +github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] google = ["apache-airflow-providers-google"] -google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.3)"] +google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] grpc = ["apache-airflow-providers-grpc"] hashicorp = ["apache-airflow-providers-hashicorp"] hdfs = ["apache-airflow-providers-apache-hdfs"] @@ -340,7 +351,6 @@ plexus = ["apache-airflow-providers-plexus"] postgres = ["apache-airflow-providers-postgres"] presto = ["apache-airflow-providers-presto"] qds = ["apache-airflow-providers-qubole"] -qubole = ["apache-airflow-providers-qubole"] rabbitmq = ["amqp"] redis = ["apache-airflow-providers-redis"] s3 = ["apache-airflow-providers-amazon"] @@ -1846,7 +1856,7 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-appbuilder" -version = "4.3.3" +version = "4.3.6" description = "Simple and rapid application development framework, built on top of Flask. includes detailed security, auto CRUD generation for your models, google charts and much more." category = "dev" optional = false @@ -1878,6 +1888,7 @@ WTForms = "<4" jmespath = ["jmespath (>=0.9.5)"] oauth = ["Authlib (>=0.14,<2.0.0)"] openid = ["Flask-OpenID (>=1.2.5,<2)"] +talisman = ["flask-talisman (>=1.0.0,<2.0)"] [[package]] name = "flask-babel" @@ -3415,17 +3426,6 @@ category = "main" optional = false python-versions = ">=3.7" -[[package]] -name = "oscrypto" -version = "1.3.0" -description = "TLS (SSL) sockets, key generation, encryption, decryption, signing, verification and KDFs using the OS crypto libraries. Does not require a compiler, and relies on the OS for patching. Works on Windows, OS X and Linux/BSD." -category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -asn1crypto = ">=1.5.1" - [[package]] name = "packaging" version = "23.1" @@ -3436,7 +3436,7 @@ python-versions = ">=3.7" [[package]] name = "pandas" -version = "1.5.3" +version = "2.0.3" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -3448,11 +3448,32 @@ numpy = [ {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] -python-dateutil = ">=2.8.1" +python-dateutil = ">=2.8.2" pytz = ">=2020.1" - -[package.extras] -test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] [[package]] name = "parsedatetime" @@ -3658,11 +3679,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "pyarrow" -version = "10.0.1" +version = "14.0.1" description = "Python library for Apache Arrow" category = "main" optional = true -python-versions = ">=3.7" +python-versions = ">=3.8" [package.dependencies] numpy = ">=1.16.6" @@ -3722,28 +3743,32 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -[[package]] -name = "pycryptodomex" -version = "3.18.0" -description = "Cryptographic library for Python" -category = "main" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - [[package]] name = "pydantic" -version = "1.10.12" -description = "Data validation and settings management using python type hints" +version = "2.5.0" +description = "Data validation using Python type hints" category = "main" optional = false python-versions = ">=3.7" [package.dependencies] -typing-extensions = ">=4.2.0" +annotated-types = ">=0.4.0" +pydantic-core = "2.14.1" +typing-extensions = ">=4.6.1" [package.extras] -dotenv = ["python-dotenv (>=0.10.4)"] -email = ["email-validator (>=1.0.3)"] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.14.1" +description = "" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydoc-markdown" @@ -4435,7 +4460,7 @@ python-versions = ">=3.7" [[package]] name = "snowflake-connector-python" -version = "3.1.1" +version = "3.5.0" description = "Snowflake Connector for Python" category = "main" optional = true @@ -4450,12 +4475,10 @@ cryptography = ">=3.1.0,<42.0.0" filelock = ">=3.5,<4" idna = ">=2.5,<4" keyring = {version = "<16.1.0 || >16.1.0,<25.0.0", optional = true, markers = "extra == \"secure-local-storage\""} -oscrypto = "<2.0.0" packaging = "*" pandas = {version = ">=1.0.0,<2.1.0", optional = true, markers = "extra == \"pandas\""} -platformdirs = ">=2.6.0,<3.9.0" -pyarrow = {version = ">=10.0.1,<10.1.0", optional = true, markers = "extra == \"pandas\""} -pycryptodomex = ">=3.2,<3.5.0 || >3.5.0,<4.0.0" +platformdirs = ">=2.6.0,<4.0.0" +pyarrow = {version = "*", optional = true, markers = "extra == \"pandas\""} pyjwt = "<3.0.0" pyOpenSSL = ">=16.2.0,<24.0.0" pytz = "*" @@ -4463,11 +4486,11 @@ requests = "<3.0.0" sortedcontainers = ">=2.4.0" tomlkit = "*" typing-extensions = ">=4.3,<5" -urllib3 = ">=1.21.1,<1.27" +urllib3 = ">=1.21.1,<2.0.0" [package.extras] -development = ["Cython", "coverage", "more-itertools", "numpy (<1.26.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] -pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow (>=10.0.1,<10.1.0)"] +development = ["Cython", "coverage", "more-itertools", "numpy (<1.27.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] +pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow"] secure-local-storage = ["keyring (!=16.1.0,<25.0.0)"] [[package]] @@ -5063,7 +5086,6 @@ motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] -pydantic = ["pydantic"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["s3fs", "botocore"] @@ -5073,7 +5095,7 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "1.1" python-versions = ">=3.8.1,<3.13" -content-hash = "f857c300f44dadb0cf25af0016c5baf5318097a2d6b0d7035f6aaa2e7fb592b2" +content-hash = "bbfaab078877deaa60ecf6bc95c0374e1967268ca24594a99b792b88c4ef270b" [metadata.files] about-time = [ @@ -5197,6 +5219,10 @@ alive-progress = [ {file = "alive-progress-3.1.4.tar.gz", hash = "sha256:74a95d8d0d42bc99d3a3725dbd06ebb852245f1b64e301a7c375b92b22663f7b"}, {file = "alive_progress-3.1.4-py3-none-any.whl", hash = "sha256:c80ad87ce9c1054b01135a87fae69ecebbfc2107497ae87cbe6aec7e534903db"}, ] +annotated-types = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] ansicon = [ {file = "ansicon-1.89.0-py2.py3-none-any.whl", hash = "sha256:f1def52d17f65c2c9682cf8370c03f541f410c1752d6a14029f97318e4b9dfec"}, {file = "ansicon-1.89.0.tar.gz", hash = "sha256:e4d039def5768a47e4afec8e89e83ec3ae5a26bf00ad851f914d1240b444d2b1"}, @@ -5206,8 +5232,8 @@ anyio = [ {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, ] apache-airflow = [ - {file = "apache-airflow-2.7.0.tar.gz", hash = "sha256:06fba3df5943b6eda5e2f033e7e45b6ea557d89909ca36e61614ea61075f9722"}, - {file = "apache_airflow-2.7.0-py3-none-any.whl", hash = "sha256:8e3cf4b3cd8583a2e76bd04827af8d34747e0cf30a28cf0e70f4f4f39ce61f6d"}, + {file = "apache-airflow-2.7.2.tar.gz", hash = "sha256:c6fab3449066867d9a7728f40b6b9e27f1ea68bca39b064a27f5c5ddc3262224"}, + {file = "apache_airflow-2.7.2-py3-none-any.whl", hash = "sha256:1bc2c022bcae24b911e49fafd5fb619b49efba87ed7bc8561a2065810d8fe899"}, ] apache-airflow-providers-common-sql = [ {file = "apache-airflow-providers-common-sql-1.7.1.tar.gz", hash = "sha256:ba37f795d9656a87cf4661edc381b8ecfe930272c59324b59f8a158fd0971aeb"}, @@ -5784,8 +5810,8 @@ flask = [ {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, ] flask-appbuilder = [ - {file = "Flask-AppBuilder-4.3.3.tar.gz", hash = "sha256:b420379f74788e431a2763f8d3749cc37712df682dc00a45538d85d989340768"}, - {file = "Flask_AppBuilder-4.3.3-py3-none-any.whl", hash = "sha256:7eb1904d8f61297778ebf0d0b83f1d74b154534c9e84af3bb9198cfc0f51ff05"}, + {file = "Flask-AppBuilder-4.3.6.tar.gz", hash = "sha256:8ca9710fa7d2704747d195e11b487d45a571f40559d8399d9d5dfa42ea1f3c78"}, + {file = "Flask_AppBuilder-4.3.6-py3-none-any.whl", hash = "sha256:840480dfd43134bebf78f3c7dc909e324c2689d2d9f27aeb1880a8a25466bc8d"}, ] flask-babel = [ {file = "Flask-Babel-2.0.0.tar.gz", hash = "sha256:f9faf45cdb2e1a32ea2ec14403587d4295108f35017a7821a2b1acb8cfd9257d"}, @@ -7055,42 +7081,36 @@ orjson = [ {file = "orjson-3.9.5-cp39-none-win_amd64.whl", hash = "sha256:91dda66755795ac6100e303e206b636568d42ac83c156547634256a2e68de694"}, {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, ] -oscrypto = [ - {file = "oscrypto-1.3.0-py2.py3-none-any.whl", hash = "sha256:2b2f1d2d42ec152ca90ccb5682f3e051fb55986e1b170ebde472b133713e7085"}, - {file = "oscrypto-1.3.0.tar.gz", hash = "sha256:6f5fef59cb5b3708321db7cca56aed8ad7e662853351e7991fcf60ec606d47a4"}, -] packaging = [ {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] pandas = [ - {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406"}, - {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572"}, - {file = "pandas-1.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23"}, - {file = "pandas-1.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6"}, - {file = "pandas-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf"}, - {file = "pandas-1.5.3-cp38-cp38-win32.whl", hash = "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51"}, - {file = "pandas-1.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5"}, - {file = "pandas-1.5.3-cp39-cp39-win32.whl", hash = "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a"}, - {file = "pandas-1.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9"}, - {file = "pandas-1.5.3.tar.gz", hash = "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, ] parsedatetime = [ {file = "parsedatetime-2.4-py2-none-any.whl", hash = "sha256:9ee3529454bf35c40a77115f5a596771e59e1aee8c53306f346c461b8e913094"}, @@ -7268,31 +7288,42 @@ py = [ {file = "py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyarrow = [ - {file = "pyarrow-10.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:e00174764a8b4e9d8d5909b6d19ee0c217a6cf0232c5682e31fdfbd5a9f0ae52"}, - {file = "pyarrow-10.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6f7a7dbe2f7f65ac1d0bd3163f756deb478a9e9afc2269557ed75b1b25ab3610"}, - {file = "pyarrow-10.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb627673cb98708ef00864e2e243f51ba7b4c1b9f07a1d821f98043eccd3f585"}, - {file = "pyarrow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba71e6fc348c92477586424566110d332f60d9a35cb85278f42e3473bc1373da"}, - {file = "pyarrow-10.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7b4ede715c004b6fc535de63ef79fa29740b4080639a5ff1ea9ca84e9282f349"}, - {file = "pyarrow-10.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:e3fe5049d2e9ca661d8e43fab6ad5a4c571af12d20a57dffc392a014caebef65"}, - {file = "pyarrow-10.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:254017ca43c45c5098b7f2a00e995e1f8346b0fb0be225f042838323bb55283c"}, - {file = "pyarrow-10.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70acca1ece4322705652f48db65145b5028f2c01c7e426c5d16a30ba5d739c24"}, - {file = "pyarrow-10.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abb57334f2c57979a49b7be2792c31c23430ca02d24becd0b511cbe7b6b08649"}, - {file = "pyarrow-10.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:1765a18205eb1e02ccdedb66049b0ec148c2a0cb52ed1fb3aac322dfc086a6ee"}, - {file = "pyarrow-10.0.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:61f4c37d82fe00d855d0ab522c685262bdeafd3fbcb5fe596fe15025fbc7341b"}, - {file = "pyarrow-10.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e141a65705ac98fa52a9113fe574fdaf87fe0316cde2dffe6b94841d3c61544c"}, - {file = "pyarrow-10.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf26f809926a9d74e02d76593026f0aaeac48a65b64f1bb17eed9964bfe7ae1a"}, - {file = "pyarrow-10.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:443eb9409b0cf78df10ced326490e1a300205a458fbeb0767b6b31ab3ebae6b2"}, - {file = "pyarrow-10.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f2d00aa481becf57098e85d99e34a25dba5a9ade2f44eb0b7d80c80f2984fc03"}, - {file = "pyarrow-10.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b1fc226d28c7783b52a84d03a66573d5a22e63f8a24b841d5fc68caeed6784d4"}, - {file = "pyarrow-10.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efa59933b20183c1c13efc34bd91efc6b2997377c4c6ad9272da92d224e3beb1"}, - {file = "pyarrow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:668e00e3b19f183394388a687d29c443eb000fb3fe25599c9b4762a0afd37775"}, - {file = "pyarrow-10.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1bc6e4d5d6f69e0861d5d7f6cf4d061cf1069cb9d490040129877acf16d4c2a"}, - {file = "pyarrow-10.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:42ba7c5347ce665338f2bc64685d74855900200dac81a972d49fe127e8132f75"}, - {file = "pyarrow-10.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b069602eb1fc09f1adec0a7bdd7897f4d25575611dfa43543c8b8a75d99d6874"}, - {file = "pyarrow-10.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94fb4a0c12a2ac1ed8e7e2aa52aade833772cf2d3de9dde685401b22cec30002"}, - {file = "pyarrow-10.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db0c5986bf0808927f49640582d2032a07aa49828f14e51f362075f03747d198"}, - {file = "pyarrow-10.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:0ec7587d759153f452d5263dbc8b1af318c4609b607be2bd5127dcda6708cdb1"}, - {file = "pyarrow-10.0.1.tar.gz", hash = "sha256:1a14f57a5f472ce8234f2964cd5184cccaa8df7e04568c64edc33b23eb285dd5"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"}, + {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"}, + {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"}, + {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"}, + {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"}, + {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, + {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, ] pyasn1 = [ {file = "pyasn1-0.5.0-py2.py3-none-any.whl", hash = "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57"}, @@ -7314,77 +7345,112 @@ pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] -pycryptodomex = [ - {file = "pycryptodomex-3.18.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:160a39a708c36fa0b168ab79386dede588e62aec06eb505add870739329aecc6"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c2953afebf282a444c51bf4effe751706b4d0d63d7ca2cc51db21f902aa5b84e"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:ba95abd563b0d1b88401658665a260852a8e6c647026ee6a0a65589287681df8"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2014_aarch64.whl", hash = "sha256:192306cf881fe3467dda0e174a4f47bb3a8bb24b90c9cdfbdc248eec5fc0578c"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:f9ab5ef0718f6a8716695dea16d83b671b22c45e9c0c78fd807c32c0192e54b5"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-win32.whl", hash = "sha256:50308fcdbf8345e5ec224a5502b4215178bdb5e95456ead8ab1a69ffd94779cb"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-win_amd64.whl", hash = "sha256:4d9379c684efea80fdab02a3eb0169372bca7db13f9332cb67483b8dc8b67c37"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5594a125dae30d60e94f37797fc67ce3c744522de7992c7c360d02fdb34918f8"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:8ff129a5a0eb5ff16e45ca4fa70a6051da7f3de303c33b259063c19be0c43d35"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2014_aarch64.whl", hash = "sha256:3d9314ac785a5b75d5aaf924c5f21d6ca7e8df442e5cf4f0fefad4f6e284d422"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:f237278836dda412a325e9340ba2e6a84cb0f56b9244781e5b61f10b3905de88"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac614363a86cc53d8ba44b6c469831d1555947e69ab3276ae8d6edc219f570f7"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-macosx_10_9_x86_64.whl", hash = "sha256:302a8f37c224e7b5d72017d462a2be058e28f7be627bdd854066e16722d0fc0c"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux2014_aarch64.whl", hash = "sha256:6421d23d6a648e83ba2670a352bcd978542dad86829209f59d17a3f087f4afef"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84e105787f5e5d36ec6a581ff37a1048d12e638688074b2a00bcf402f9aa1c2"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6875eb8666f68ddbd39097867325bd22771f595b4e2b0149739b5623c8bf899b"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:27072a494ce621cc7a9096bbf60ed66826bb94db24b49b7359509e7951033e74"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_i686.whl", hash = "sha256:1949e09ea49b09c36d11a951b16ff2a05a0ffe969dda1846e4686ee342fe8646"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6ed3606832987018615f68e8ed716a7065c09a0fe94afd7c9ca1b6777f0ac6eb"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-win32.whl", hash = "sha256:d56c9ec41258fd3734db9f5e4d2faeabe48644ba9ca23b18e1839b3bdf093222"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-win_amd64.whl", hash = "sha256:e00a4bacb83a2627e8210cb353a2e31f04befc1155db2976e5e239dd66482278"}, - {file = "pycryptodomex-3.18.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:2dc4eab20f4f04a2d00220fdc9258717b82d31913552e766d5f00282c031b70a"}, - {file = "pycryptodomex-3.18.0-pp27-pypy_73-win32.whl", hash = "sha256:75672205148bdea34669173366df005dbd52be05115e919551ee97171083423d"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bec6c80994d4e7a38312072f89458903b65ec99bed2d65aa4de96d997a53ea7a"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d35a8ffdc8b05e4b353ba281217c8437f02c57d7233363824e9d794cf753c419"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76f0a46bee539dae4b3dfe37216f678769349576b0080fdbe431d19a02da42ff"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:71687eed47df7e965f6e0bf3cadef98f368d5221f0fb89d2132effe1a3e6a194"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73d64b32d84cf48d9ec62106aa277dbe99ab5fbfd38c5100bc7bddd3beb569f7"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbdcce0a226d9205560a5936b05208c709b01d493ed8307792075dedfaaffa5f"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58fc0aceb9c961b9897facec9da24c6a94c5db04597ec832060f53d4d6a07196"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:215be2980a6b70704c10796dd7003eb4390e7be138ac6fb8344bf47e71a8d470"}, - {file = "pycryptodomex-3.18.0.tar.gz", hash = "sha256:3e3ecb5fe979e7c1bb0027e518340acf7ee60415d79295e5251d13c68dde576e"}, -] pydantic = [ - {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"}, - {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"}, - {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"}, - {file = "pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"}, - {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"}, - {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"}, - {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"}, - {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = "sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"}, - {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"}, - {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"}, - {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"}, - {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"}, - {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"}, - {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"}, + {file = "pydantic-2.5.0-py3-none-any.whl", hash = "sha256:7ce6e766c456ad026fe5712f7bcf036efc34bd5d107b3e669ef7ea01b3a9050c"}, + {file = "pydantic-2.5.0.tar.gz", hash = "sha256:69bd6fb62d2d04b7055f59a396993486a2ee586c43a0b89231ce0000de07627c"}, +] +pydantic-core = [ + {file = "pydantic_core-2.14.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:812beca1dcb2b722cccc7e9c620bd972cbc323321194ec2725eab3222e6ac573"}, + {file = "pydantic_core-2.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2ccdc53cb88e51c7d47d74c59630d7be844428f6b8d463055ffad6f0392d8da"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd937733bf2fe7d6a8bf208c12741f1f730b7bf5636033877767a75093c29b8a"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:581bb606a31749a00796f5257947a0968182d7fe91e1dada41f06aeb6bfbc91a"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aadf74a40a7ae49c3c1aa7d32334fe94f4f968e21dd948e301bb4ed431fb2412"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b89821a2c77cc1b8f2c1fc3aacd6a3ecc5df8f7e518dc3f18aef8c4dcf66003d"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49ee28d65f506b2858a60745cc974ed005298ebab12693646b97641dd7c99c35"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97246f896b4df7fd84caa8a75a67abb95f94bc0b547665bf0889e3262b060399"}, + {file = "pydantic_core-2.14.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1185548665bc61bbab0dc78f10c8eafa0db0aa1e920fe9a451b77782b10a65cc"}, + {file = "pydantic_core-2.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2a7d08b39fac97540fba785fce3b21ee01a81f081a07a4d031efd791da6666f9"}, + {file = "pydantic_core-2.14.1-cp310-none-win32.whl", hash = "sha256:0a8c8daf4e3aa3aeb98e3638fc3d58a359738f3d12590b2474c6bb64031a0764"}, + {file = "pydantic_core-2.14.1-cp310-none-win_amd64.whl", hash = "sha256:4f0788699a92d604f348e9c1ac5e97e304e97127ba8325c7d0af88dcc7d35bd3"}, + {file = "pydantic_core-2.14.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:2be018a84995b6be1bbd40d6064395dbf71592a981169cf154c0885637f5f54a"}, + {file = "pydantic_core-2.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fc3227408808ba7df8e95eb1d8389f4ba2203bed8240b308de1d7ae66d828f24"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42d5d0e9bbb50481a049bd0203224b339d4db04006b78564df2b782e2fd16ebc"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bc6a4ea9f88a810cb65ccae14404da846e2a02dd5c0ad21dee712ff69d142638"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d312ad20e3c6d179cb97c42232b53111bcd8dcdd5c1136083db9d6bdd489bc73"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:679cc4e184f213c8227862e57340d12fd4d4d19dc0e3ddb0f653f86f01e90f94"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101df420e954966868b8bc992aefed5fa71dd1f2755104da62ee247abab28e2f"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c964c0cc443d6c08a2347c0e5c1fc2d85a272dc66c1a6f3cde4fc4843882ada4"}, + {file = "pydantic_core-2.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8276bbab68a9dbe721da92d19cbc061f76655248fe24fb63969d0c3e0e5755e7"}, + {file = "pydantic_core-2.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:12163197fec7c95751a3c71b36dcc1909eed9959f011ffc79cc8170a6a74c826"}, + {file = "pydantic_core-2.14.1-cp311-none-win32.whl", hash = "sha256:b8ff0302518dcd001bd722bbe342919c29e5066c7eda86828fe08cdc112668b8"}, + {file = "pydantic_core-2.14.1-cp311-none-win_amd64.whl", hash = "sha256:59fa83873223f856d898452c6162a390af4297756f6ba38493a67533387d85d9"}, + {file = "pydantic_core-2.14.1-cp311-none-win_arm64.whl", hash = "sha256:798590d38c9381f07c48d13af1f1ef337cebf76ee452fcec5deb04aceced51c7"}, + {file = "pydantic_core-2.14.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:587d75aec9ae50d0d63788cec38bf13c5128b3fc1411aa4b9398ebac884ab179"}, + {file = "pydantic_core-2.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26242e3593d4929123615bd9365dd86ef79b7b0592d64a96cd11fd83c69c9f34"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5879ac4791508d8f0eb7dec71ff8521855180688dac0c55f8c99fc4d1a939845"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad9ea86f5fc50f1b62c31184767fe0cacaa13b54fe57d38898c3776d30602411"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:102ac85a775e77821943ae38da9634ddd774b37a8d407181b4f7b05cdfb36b55"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2459cc06572730e079ec1e694e8f68c99d977b40d98748ae72ff11ef21a56b0b"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:217dcbfaf429a9b8f1d54eb380908b9c778e78f31378283b30ba463c21e89d5d"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9d59e0d7cdfe8ed1d4fcd28aad09625c715dc18976c7067e37d8a11b06f4be3e"}, + {file = "pydantic_core-2.14.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e2be646a5155d408e68b560c0553e8a83dc7b9f90ec6e5a2fc3ff216719385db"}, + {file = "pydantic_core-2.14.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ffba979801e3931a19cd30ed2049450820effe8f152aaa317e2fd93795d318d7"}, + {file = "pydantic_core-2.14.1-cp312-none-win32.whl", hash = "sha256:132b40e479cb5cebbbb681f77aaceabbc8355df16c9124cff1d4060ada83cde2"}, + {file = "pydantic_core-2.14.1-cp312-none-win_amd64.whl", hash = "sha256:744b807fe2733b6da3b53e8ad93e8b3ea3ee3dfc3abece4dd2824cc1f39aa343"}, + {file = "pydantic_core-2.14.1-cp312-none-win_arm64.whl", hash = "sha256:24ba48f9d0b8d64fc5e42e1600366c3d7db701201294989aebdaca23110c02ab"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:ba55d73a2df4771b211d0bcdea8b79454980a81ed34a1d77a19ddcc81f98c895"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e905014815687d88cbb14bbc0496420526cf20d49f20606537d87646b70f1046"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:443dc5eede7fa76b2370213e0abe881eb17c96f7d694501853c11d5d56916602"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:abae6fd5504e5e438e4f6f739f8364fd9ff5a5cdca897e68363e2318af90bc28"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9486e27bb3f137f33e2315be2baa0b0b983dae9e2f5f5395240178ad8e644728"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69df82892ff00491d673b1929538efb8c8d68f534fdc6cb7fd3ac8a5852b9034"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:184ff7b30c3f60e1b775378c060099285fd4b5249271046c9005f8b247b39377"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3d5b2a4b3c10cad0615670cab99059441ff42e92cf793a0336f4bc611e895204"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:871c641a83719caaa856a11dcc61c5e5b35b0db888e1a0d338fe67ce744575e2"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1e7208946ea9b27a8cef13822c339d4ae96e45952cc01fc4a91c7f1cb0ae2861"}, + {file = "pydantic_core-2.14.1-cp37-none-win32.whl", hash = "sha256:b4ff385a525017f5adf6066d7f9fb309f99ade725dcf17ed623dc7dce1f85d9f"}, + {file = "pydantic_core-2.14.1-cp37-none-win_amd64.whl", hash = "sha256:c7411cd06afeb263182e38c6ca5b4f5fe4f20d91466ad7db0cd6af453a02edec"}, + {file = "pydantic_core-2.14.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:2871daf5b2823bf77bf7d3d43825e5d904030c155affdf84b21a00a2e00821d2"}, + {file = "pydantic_core-2.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7977e261cac5f99873dc2c6f044315d09b19a71c4246560e1e67593889a90978"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5a111f9158555582deadd202a60bd7803b6c68f406391b7cf6905adf0af6811"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac417312bf6b7a0223ba73fb12e26b2854c93bf5b1911f7afef6d24c379b22aa"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c36987f5eb2a7856b5f5feacc3be206b4d1852a6ce799f6799dd9ffb0cba56ae"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6e98227eb02623d57e1fd061788837834b68bb995a869565211b9abf3de4bf4"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:023b6d7ec4e97890b28eb2ee24413e69a6d48de4e8b75123957edd5432f4eeb3"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6015beb28deb5306049ecf2519a59627e9e050892927850a884df6d5672f8c7d"}, + {file = "pydantic_core-2.14.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3f48d4afd973abbd65266ac24b24de1591116880efc7729caf6b6b94a9654c9e"}, + {file = "pydantic_core-2.14.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:28734bcfb8fc5b03293dec5eb5ea73b32ff767f6ef79a31f6e41dad2f5470270"}, + {file = "pydantic_core-2.14.1-cp38-none-win32.whl", hash = "sha256:3303113fdfaca927ef11e0c5f109e2ec196c404f9d7ba5f8ddb63cdf287ea159"}, + {file = "pydantic_core-2.14.1-cp38-none-win_amd64.whl", hash = "sha256:144f2c1d5579108b6ed1193fcc9926124bd4142b0f7020a7744980d1235c8a40"}, + {file = "pydantic_core-2.14.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:893bf4fb9bfb9c4639bc12f3de323325ada4c6d60e478d5cded65453e9364890"}, + {file = "pydantic_core-2.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:052d8731aaf844f91fe4cd3faf28983b109a5865b3a256ec550b80a5689ead87"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb1c6ecb53e4b907ee8486f453dd940b8cbb509946e2b671e3bf807d310a96fc"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:94cf6d0274eb899d39189144dcf52814c67f9b0fd196f211420d9aac793df2da"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36c3bf96f803e207a80dbcb633d82b98ff02a9faa76dd446e969424dec8e2b9f"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fb290491f1f0786a7da4585250f1feee200fc17ff64855bdd7c42fb54526fa29"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6590ed9d13eb51b28ea17ddcc6c8dbd6050b4eb589d497105f0e13339f223b72"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:69cd74e55a5326d920e7b46daa2d81c2bdb8bcf588eafb2330d981297b742ddc"}, + {file = "pydantic_core-2.14.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d965bdb50725a805b083f5f58d05669a85705f50a6a864e31b545c589290ee31"}, + {file = "pydantic_core-2.14.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ca942a2dc066ca5e04c27feaa8dfb9d353ddad14c6641660c565149186095343"}, + {file = "pydantic_core-2.14.1-cp39-none-win32.whl", hash = "sha256:72c2ef3787c3b577e5d6225d73a77167b942d12cef3c1fbd5e74e55b7f881c36"}, + {file = "pydantic_core-2.14.1-cp39-none-win_amd64.whl", hash = "sha256:55713d155da1e508083c4b08d0b1ad2c3054f68b8ef7eb3d3864822e456f0bb5"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:53efe03cc383a83660cfdda6a3cb40ee31372cedea0fde0b2a2e55e838873ab6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:f523e116879bc6714e61d447ce934676473b068069dce6563ea040381dc7a257"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85bb66d661be51b2cba9ca06759264b3469d2dbb53c3e6effb3f05fec6322be6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f53a3ccdc30234cb4342cec541e3e6ed87799c7ca552f0b5f44e3967a5fed526"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1bfb63821ada76719ffcd703fc40dd57962e0d8c253e3c565252e6de6d3e0bc6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e2c689439f262c29cf3fcd5364da1e64d8600facecf9eabea8643b8755d2f0de"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a15f6e5588f7afb7f6fc4b0f4ff064749e515d34f34c666ed6e37933873d8ad8"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:f1a30eef060e21af22c7d23349f1028de0611f522941c80efa51c05a63142c62"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16f4a7e1ec6b3ea98a1e108a2739710cd659d68b33fbbeaba066202cab69c7b6"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fd80a2d383940eec3db6a5b59d1820f947317acc5c75482ff8d79bf700f8ad6a"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:a68a36d71c7f638dda6c9e6b67f6aabf3fa1471b198d246457bfdc7c777cdeb7"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ebc79120e105e4bcd7865f369e3b9dbabb0d492d221e1a7f62a3e8e292550278"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:c8c466facec2ccdf025b0b1455b18f2c3d574d5f64d24df905d3d7b8f05d5f4e"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b91b5ec423e88caa16777094c4b2b97f11453283e7a837e5e5e1b886abba1251"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130e49aa0cb316f743bc7792c36aefa39fc2221312f1d4b333b19edbdd71f2b1"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f483467c046f549572f8aca3b7128829e09ae3a9fe933ea421f7cb7c58120edb"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dee4682bd7947afc682d342a8d65ad1834583132383f8e801601a8698cb8d17a"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:8d927d042c0ef04607ee7822828b208ab045867d20477ec6593d612156798547"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5a1570875eb0d1479fb2270ed80c88c231aaaf68b0c3f114f35e7fb610435e4f"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cb2fd3ab67558eb16aecfb4f2db4febb4d37dc74e6b8613dc2e7160fb58158a9"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a7991f25b98038252363a03e6a9fe92e60fe390fda2631d238dc3b0e396632f8"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b45b7be9f99991405ecd6f6172fb6798908a8097106ae78d5cc5cc15121bad9"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:51506e7652a2ef1d1cf763c4b51b972ff4568d1dddc96ca83931a6941f5e6389"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:66dc0e63349ec39c1ea66622aa5c2c1f84382112afd3ab2fa0cca4fb01f7db39"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:8e17f0c3ba4cb07faa0038a59ce162de584ed48ba645c8d05a5de1e40d4c21e7"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d983222223f63e323a5f497f5b85e211557a5d8fb670dc88f343784502b466ba"}, + {file = "pydantic_core-2.14.1.tar.gz", hash = "sha256:0d82a6ee815388a362885186e431fac84c7a06623bc136f508e9f88261d8cadb"}, ] pydoc-markdown = [ {file = "pydoc_markdown-4.8.2-py3-none-any.whl", hash = "sha256:203f74119e6bb2f9deba43d452422de7c8ec31955b61e0620fa4dd8c2611715f"}, @@ -8093,27 +8159,27 @@ sniffio = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] snowflake-connector-python = [ - {file = "snowflake-connector-python-3.1.1.tar.gz", hash = "sha256:2700503a5f99d6e22e412d7cf4fd2211296cc0e50b2a38ad9c6f48ddb8beff67"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3aec4ab6f6d66a0dc2b5bbd8fc2c11fd76090c63fdc65577af9d4e28055c51f2"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5d2589f39b1c1c91eda6711181afb7f197f7dd43204f26db48df90849d9f528b"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c540b4fe173cc9a24df285ce49c70fe0dadc6316b8a2160324c549086a71a118"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25007ccf5d9c0b87e29af40470f6f1e76d03621642a7492d62282215b7e9d67d"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:fff3caebd8b60cee09ad55674d12b8940b9d5f57a394c8467637167372710841"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7b7622be7bcad26786bf771341e3b4819df6e4d7858e5dd4c8700423ca7364e"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:260d259a79e6120bf58fcec9a52705fd02a430f296a77a1531720906b7a02f5e"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0163d5036f05a39977c6d7aba5e8bb1632be1117785a72e2602e3a34b89ded1c"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d38546ebcba7bca37a16cfcbbc0f8e7c19946b4e45e0c5dc2a8963f3b739958"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:484044c2d9aacd5c8a0a9d8d8b69b06352e3612f23c5e44d54771a96047d80b1"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e4a4aab55a4a3236625b738fad19524c9cef810fe041d567dc5dc1d9b1f9eb7"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:5d95eeaff7b085b0c8facab40391bede699ffc0865f2cdaa37b19a8429d47943"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a944a1862672552f8c00b98b576a8b16da46f9c5b918ba4b969bd7d1205c32a"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7abb142ba3ee5db6c61be0dc578fa10e59b7c1f33716b0c93ae6706b2a8bbee3"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:bf6ca8f8678dea6cf5275f69dbd9e4ebb18c2211be35379b65175e36e5953b92"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ceb263b95720ab645c2e60e37d436db51321e0192d399631d052387728911689"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:8b7fe82d8d1cdc90caadbcce419d3bcbf1bdeffb9bba974a81a46f389d8ee243"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d46b798507f6c7447e21c76bd71969e22e55fa848196f20de73b3e2b65373b5"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bdcce7069368b7b2ec8a855812c1b0e9e6bdf6b01660225ffff5ba163fa507d"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:daedeff774cd68df05e68dbfa66e83a877e63a99461b8262eb5c8cd37e309aa7"}, + {file = "snowflake-connector-python-3.5.0.tar.gz", hash = "sha256:654e4a1f68a491544bd8f7c5ab02eb8531df67c5f4309d5253bd204044f8a1b3"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a365fa4f23be27a4a46d04f73a48ccb1ddad5b9558f100ba592a49571c90a33c"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5b648b8f32aa540e9adf14e84ea5d77a6c3c6cbc3cbcf172622a0b8db0e99384"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722dc0100c3247788aeb975a8a5941f2f757e8524d2626cf6fe78df02b6384fb"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7029b8776c5d2153ed2b0254dc23ae1e3bde141b6634fc6c77b919ed29d5bb42"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:3472703fc4f308343d925c41dab976a42e10192fa0b8b9025e80b083ad7dcf1b"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:40f4a376b6da875d70383b60c66ad3723f0bed21d8bdbf7afb39525cb70c70ef"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:8a08d8df6f1b5b5d0bf9145e6339dbeaf294392529629d0bd7e4dd3e49d7892c"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac16a00bb3824069303e119cd049858c2caf92d174f9486ba273d19abf06a18d"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a820148b64436621b5db79c2e7848d5d12ece13b0948281c19dd2f8a50e4dbe"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ffa8f95a767e5077e82cf290a43950f37cfc25e34935f038abc96494a1595a03"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ef70cd89aee56fbbaeb68dc1f7612598b0c8a470d16ddb68ca7657bd70cbf8d7"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:09ff23c1aa4bf9e148e491512a81b097ce0b1c2a870f3d0bb0dc5febf764c45c"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e696f133c57494dce57a68a92d1e2cf20334361400fe3c4c73637627f7d9c0ec"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0136a9fb45013ea3d50045acb3cedb50b2d5d6ac1d0f9adc538e28cf86a1386"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:133e2a8a5e7b59d84e83886bb516d290edbd0b92dd69304f8f7ac613faca2aeb"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c463d11b05b57c40eb83d84044d761535a855e498ffd52456e92eed333e43b17"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:cdd198dbc0aff373bb9e95f315cdc0b922ae61186ba9bd7da4950835827cd7f9"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d8769b95a46040261a46dc58757c59b26e6122466222d8b8e518ea6aa62e83d"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee97a8ac0aaf40a7b7420c8936a66d8d33376cd40498ac3d38efa7bb5712d14a"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:e8cd747e2719ba44dd2ce0e9b1e6f8b03485b2b335a352f3b45138b56fad5888"}, ] sortedcontainers = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, diff --git a/pyproject.toml b/pyproject.toml index 627ec8344f..6798df3696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,7 @@ psycopg2-binary = {version = ">=2.9.1", optional = true} psycopg2cffi = {version = ">=2.9.0", optional = true, markers="platform_python_implementation == 'PyPy'"} grpcio = {version = ">=1.50.0", optional = true} google-cloud-bigquery = {version = ">=2.26.0", optional = true} -pyarrow = {version = ">=8.0.0", optional = true} +pyarrow = {version = ">=12.0.0", optional = true} duckdb = {version = ">=0.6.1,<0.10.0", optional = true} dbt-core = {version = ">=1.2.0", optional = true} dbt-redshift = {version = ">=1.2.0", optional = true} @@ -68,12 +68,11 @@ dbt-athena-community = {version = ">=1.2.0", optional = true} s3fs = {version = ">=2022.4.0", optional = true} gcsfs = {version = ">=2022.4.0", optional = true} botocore = {version = ">=1.28", optional = true} -snowflake-connector-python = {version = ">=3.1.1", optional = true, extras = ["pandas"]} +snowflake-connector-python = {version = ">=3.5.0", optional = true, extras = ["pandas"]} cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -pydantic = {version = ">=1.10,<2.0", optional = true} adlfs = {version = ">=2022.4.0", optional = true} pyodbc = {version = "^4.0.39", optional = true} qdrant-client = {version = "^1.6.4", optional = true, extras = ["fastembed"]} @@ -97,7 +96,6 @@ motherduck = ["duckdb", "pyarrow"] cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] weaviate = ["weaviate-client"] -pydantic = ["pydantic"] mssql = ["pyodbc"] qdrant = ["qdrant-client"] @@ -107,13 +105,8 @@ dlt = "dlt.cli._dlt:_main" [tool.poetry.group.dev.dependencies] requests-mock = "^1.10.0" types-click = "^7.1.8" -pandas = "^1.5.3" sqlfluff = "^2.3.2" -google-auth-oauthlib = "^1.0.0" types-deprecated = "^1.2.9.2" -tqdm = "^4.65.0" -enlighten = "^1.11.2" -alive-progress = "^3.1.1" pytest-console-scripts = "^1.4.1" pytest = "^6.2.4" mypy = "^1.6.1" @@ -139,6 +132,17 @@ types-tqdm = "^4.66.0.2" types-psutil = "^5.9.5.16" types-psycopg2 = "^2.9.21.14" +[tool.poetry.group.pipeline] +optional=true + +[tool.poetry.group.pipeline.dependencies] +google-auth-oauthlib = "^1.0.0" +tqdm = "^4.65.0" +enlighten = "^1.11.2" +alive-progress = "^3.1.1" +pydantic = ">2" +pandas = ">2" + [tool.poetry.group.airflow] optional = true @@ -151,6 +155,9 @@ optional = true [tool.poetry.group.providers.dependencies] google-api-python-client = "^2.86.0" +[tool.poetry.group.sentry-sdk] +optional = true + [tool.poetry.group.sentry-sdk.dependencies] sentry-sdk = "^1.5.6" diff --git a/pytest.ini b/pytest.ini index fc7ce9119b..88c8353a69 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,4 +6,5 @@ xfail_strict= true log_cli= 1 log_cli_level= INFO python_files = test_*.py *_test.py *snippets.py *snippet.pytest -python_functions = *_test test_* *_snippet \ No newline at end of file +python_functions = *_test test_* *_snippet +filterwarnings= ignore::DeprecationWarning \ No newline at end of file diff --git a/tests/cases.py b/tests/cases.py index ca8a97082e..70c20d74af 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -333,7 +333,14 @@ def assert_all_data_types_row( assert db_mapping == expected_rows -def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = True, include_time: bool = True, num_rows: int = 3) -> Tuple[Any, List[Dict[str, Any]]]: +def arrow_table_all_data_types( + object_format: TArrowFormat, + include_json: bool = True, + include_time: bool = True, + include_not_normalized_name: bool = True, + include_name_clash: bool = False, + num_rows: int = 3 +) -> Tuple[Any, List[Dict[str, Any]]]: """Create an arrow object or pandas dataframe with all supported data types. Returns the table and its records in python format @@ -342,7 +349,6 @@ def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = from dlt.common.libs.pyarrow import pyarrow as pa data = { - "Pre Normalized Column": [random.choice(ascii_lowercase) for _ in range(num_rows)], "string": [random.choice(ascii_lowercase) for _ in range(num_rows)], "float": [round(random.uniform(0, 100), 4) for _ in range(num_rows)], "int": [random.randrange(0, 100) for _ in range(num_rows)], @@ -355,6 +361,12 @@ def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = "null": pd.Series( [None for _ in range(num_rows)]) } + if include_name_clash: + data["pre Normalized Column"] = [random.choice(ascii_lowercase) for _ in range(num_rows)] + include_not_normalized_name = True + if include_not_normalized_name: + data["Pre Normalized Column"] = [random.choice(ascii_lowercase) for _ in range(num_rows)] + if include_json: data["json"] = [{"a": random.randrange(0, 100)} for _ in range(num_rows)] diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml new file mode 100644 index 0000000000..f8645d78ae --- /dev/null +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -0,0 +1,459 @@ +version: 15 +version_hash: yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE= +engine_version: 7 +name: ethereum +tables: + _dlt_loads: + columns: + load_id: + nullable: false + data_type: text + name: load_id + schema_name: + nullable: true + data_type: text + name: schema_name + status: + nullable: false + data_type: bigint + name: status + inserted_at: + nullable: false + data_type: timestamp + name: inserted_at + schema_version_hash: + nullable: true + data_type: text + name: schema_version_hash + write_disposition: skip + description: Created by DLT. Tracks completed loads + schema_contract: {} + name: _dlt_loads + resource: _dlt_loads + _dlt_version: + columns: + version: + nullable: false + data_type: bigint + name: version + engine_version: + nullable: false + data_type: bigint + name: engine_version + inserted_at: + nullable: false + data_type: timestamp + name: inserted_at + schema_name: + nullable: false + data_type: text + name: schema_name + version_hash: + nullable: false + data_type: text + name: version_hash + schema: + nullable: false + data_type: text + name: schema + write_disposition: skip + description: Created by DLT. Tracks schema updates + schema_contract: {} + name: _dlt_version + resource: _dlt_version + blocks: + description: Ethereum blocks + x-annotation: this will be preserved on save + write_disposition: append + filters: + includes: [] + excludes: [] + columns: + _dlt_load_id: + nullable: false + description: load id coming from the extractor + data_type: text + name: _dlt_load_id + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + number: + nullable: false + primary_key: true + data_type: bigint + name: number + parent_hash: + nullable: true + data_type: text + name: parent_hash + hash: + nullable: false + cluster: true + unique: true + data_type: text + name: hash + base_fee_per_gas: + nullable: false + data_type: wei + name: base_fee_per_gas + difficulty: + nullable: false + data_type: wei + name: difficulty + extra_data: + nullable: true + data_type: text + name: extra_data + gas_limit: + nullable: false + data_type: bigint + name: gas_limit + gas_used: + nullable: false + data_type: bigint + name: gas_used + logs_bloom: + nullable: true + data_type: binary + name: logs_bloom + miner: + nullable: true + data_type: text + name: miner + mix_hash: + nullable: true + data_type: text + name: mix_hash + nonce: + nullable: true + data_type: text + name: nonce + receipts_root: + nullable: true + data_type: text + name: receipts_root + sha3_uncles: + nullable: true + data_type: text + name: sha3_uncles + size: + nullable: true + data_type: bigint + name: size + state_root: + nullable: false + data_type: text + name: state_root + timestamp: + nullable: false + unique: true + sort: true + data_type: timestamp + name: timestamp + total_difficulty: + nullable: true + data_type: wei + name: total_difficulty + transactions_root: + nullable: false + data_type: text + name: transactions_root + schema_contract: {} + name: blocks + resource: blocks + blocks__transactions: + parent: blocks + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + block_number: + nullable: false + primary_key: true + foreign_key: true + data_type: bigint + name: block_number + transaction_index: + nullable: false + primary_key: true + data_type: bigint + name: transaction_index + hash: + nullable: false + unique: true + data_type: text + name: hash + block_hash: + nullable: false + cluster: true + data_type: text + name: block_hash + block_timestamp: + nullable: false + sort: true + data_type: timestamp + name: block_timestamp + chain_id: + nullable: true + data_type: text + name: chain_id + from: + nullable: true + data_type: text + name: from + gas: + nullable: true + data_type: bigint + name: gas + gas_price: + nullable: true + data_type: bigint + name: gas_price + input: + nullable: true + data_type: text + name: input + max_fee_per_gas: + nullable: true + data_type: wei + name: max_fee_per_gas + max_priority_fee_per_gas: + nullable: true + data_type: wei + name: max_priority_fee_per_gas + nonce: + nullable: true + data_type: bigint + name: nonce + r: + nullable: true + data_type: text + name: r + s: + nullable: true + data_type: text + name: s + status: + nullable: true + data_type: bigint + name: status + to: + nullable: true + data_type: text + name: to + type: + nullable: true + data_type: text + name: type + v: + nullable: true + data_type: bigint + name: v + value: + nullable: false + data_type: wei + name: value + eth_value: + nullable: true + data_type: decimal + name: eth_value + name: blocks__transactions + blocks__transactions__logs: + parent: blocks__transactions + columns: + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + address: + nullable: false + data_type: text + name: address + block_timestamp: + nullable: false + sort: true + data_type: timestamp + name: block_timestamp + block_hash: + nullable: false + cluster: true + data_type: text + name: block_hash + block_number: + nullable: false + primary_key: true + foreign_key: true + data_type: bigint + name: block_number + transaction_index: + nullable: false + primary_key: true + foreign_key: true + data_type: bigint + name: transaction_index + log_index: + nullable: false + primary_key: true + data_type: bigint + name: log_index + data: + nullable: true + data_type: text + name: data + removed: + nullable: true + data_type: bool + name: removed + transaction_hash: + nullable: false + data_type: text + name: transaction_hash + name: blocks__transactions__logs + blocks__transactions__logs__topics: + parent: blocks__transactions__logs + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + value: + nullable: true + data_type: text + name: value + name: blocks__transactions__logs__topics + blocks__transactions__access_list: + parent: blocks__transactions + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + address: + nullable: true + data_type: text + name: address + name: blocks__transactions__access_list + blocks__transactions__access_list__storage_keys: + parent: blocks__transactions__access_list + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + value: + nullable: true + data_type: text + name: value + name: blocks__transactions__access_list__storage_keys + blocks__uncles: + parent: blocks + columns: + _dlt_parent_id: + nullable: false + foreign_key: true + data_type: text + name: _dlt_parent_id + _dlt_list_idx: + nullable: false + data_type: bigint + name: _dlt_list_idx + _dlt_id: + nullable: false + unique: true + data_type: text + name: _dlt_id + _dlt_root_id: + nullable: false + root_key: true + data_type: text + name: _dlt_root_id + value: + nullable: true + data_type: text + name: value + name: blocks__uncles +settings: + default_hints: + foreign_key: + - _dlt_parent_id + not_null: + - re:^_dlt_id$ + - _dlt_root_id + - _dlt_parent_id + - _dlt_list_idx + unique: + - _dlt_id + cluster: + - block_hash + partition: + - block_timestamp + root_key: + - _dlt_root_id + preferred_types: + timestamp: timestamp + block_timestamp: timestamp + schema_contract: {} +normalizers: + names: dlt.common.normalizers.names.snake_case + json: + module: dlt.common.normalizers.json.relational + config: + generate_dlt_id: true + propagation: + root: + _dlt_id: _dlt_root_id + tables: + blocks: + timestamp: block_timestamp + hash: block_hash + diff --git a/tests/common/data_writers/test_buffered_writer.py b/tests/common/data_writers/test_buffered_writer.py index 85cfcb2d0c..c275f22b2b 100644 --- a/tests/common/data_writers/test_buffered_writer.py +++ b/tests/common/data_writers/test_buffered_writer.py @@ -1,28 +1,14 @@ -import os -from typing import Iterator, Set, Literal +from typing import Iterator import pytest -from dlt.common.data_writers.buffered import BufferedDataWriter, DataWriter from dlt.common.data_writers.exceptions import BufferedDataWriterClosed -from dlt.common.destination import TLoaderFileFormat, DestinationCapabilitiesContext from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage from dlt.common.typing import DictStrAny -from tests.utils import TEST_STORAGE_ROOT, write_version, autouse_test_storage -import datetime # noqa: 251 - - -ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = {"insert_values", "jsonl", "parquet", "arrow", "puae-jsonl"} - - -def get_writer(_format: TLoaderFileFormat = "insert_values", buffer_max_items: int = 10, disable_compression: bool = False) -> BufferedDataWriter[DataWriter]: - caps = DestinationCapabilitiesContext.generic_capabilities() - caps.preferred_loader_file_format = _format - file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") - return BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, disable_compression=disable_compression, _caps=caps) +from tests.common.data_writers.utils import ALL_WRITERS, get_writer def test_write_no_item() -> None: @@ -175,47 +161,3 @@ def test_writer_optional_schema(disable_compression: bool) -> None: with get_writer(_format="jsonl", disable_compression=disable_compression) as writer: writer.write_data_item([{"col1": 1}], None) writer.write_data_item([{"col1": 1}], None) - - -@pytest.mark.parametrize("writer_format", ALL_WRITERS - {"arrow"}) -def test_writer_items_count(writer_format: TLoaderFileFormat) -> None: - c1 = {"col1": new_column("col1", "bigint")} - with get_writer(_format=writer_format) as writer: - assert writer._buffered_items_count == 0 - # single item - writer.write_data_item({"col1": 1}, columns=c1) - assert writer._buffered_items_count == 1 - # list - writer.write_data_item([{"col1": 1}, {"col1": 2}], columns=c1) - assert writer._buffered_items_count == 3 - writer._flush_items() - assert writer._buffered_items_count == 0 - assert writer._writer.items_count == 3 - - -def test_writer_items_count_arrow() -> None: - import pyarrow as pa - c1 = {"col1": new_column("col1", "bigint")} - with get_writer(_format="arrow") as writer: - assert writer._buffered_items_count == 0 - # single item - writer.write_data_item(pa.Table.from_pylist([{"col1": 1}]), columns=c1) - assert writer._buffered_items_count == 1 - # single item with many rows - writer.write_data_item(pa.Table.from_pylist([{"col1": 1}, {"col1": 2}]), columns=c1) - assert writer._buffered_items_count == 3 - # empty list - writer.write_data_item([], columns=c1) - assert writer._buffered_items_count == 3 - # list with one item - writer.write_data_item([pa.Table.from_pylist([{"col1": 1}])], columns=c1) - assert writer._buffered_items_count == 4 - # list with many items - writer.write_data_item( - [pa.Table.from_pylist([{"col1": 1}]), pa.Table.from_pylist([{"col1": 1}, {"col1": 2}])], - columns=c1 - ) - assert writer._buffered_items_count == 7 - writer._flush_items() - assert writer._buffered_items_count == 0 - assert writer._writer.items_count == 7 diff --git a/tests/common/data_writers/utils.py b/tests/common/data_writers/utils.py new file mode 100644 index 0000000000..e1a071903f --- /dev/null +++ b/tests/common/data_writers/utils.py @@ -0,0 +1,17 @@ +import os +from typing import Set, Literal + + +from dlt.common.data_writers.buffered import BufferedDataWriter, DataWriter +from dlt.common.destination import TLoaderFileFormat, DestinationCapabilitiesContext + +from tests.utils import TEST_STORAGE_ROOT + +ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = {"insert_values", "jsonl", "parquet", "arrow", "puae-jsonl"} + + +def get_writer(_format: TLoaderFileFormat = "insert_values", buffer_max_items: int = 10, disable_compression: bool = False) -> BufferedDataWriter[DataWriter]: + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.preferred_loader_file_format = _format + file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") + return BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, disable_compression=disable_compression, _caps=caps) diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 7169044117..91b5a93466 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -321,6 +321,59 @@ def test_list_position(norm: RelationalNormalizer) -> None: # print(rows) +def test_control_descending(norm: RelationalNormalizer) -> None: + row: StrAny = { + "f": [{ + "l": ["a", "b", "c"], + "v": 120, + "lo": [[{"e": "a"}, {"e": "b"}, {"e":"c"}]] + }], + "g": "val" + } + + # break at first row + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + # won't yield anything else + with pytest.raises(StopIteration): + rows_gen.send(False) + + # prevent yielding descendants of "f" but yield all else + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + (table, _), _ = rows_gen.send(True) + assert table == "table__f" + # won't yield anything else + with pytest.raises(StopIteration): + rows_gen.send(False) + + # descend into "l" + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + rows_gen.send(True) + (table, _), one_row = rows_gen.send(True) + assert table == "table__f__l" + assert one_row["value"] == "a" + # get next element in the list - even with sending False - we do not descend + (table, _), one_row = rows_gen.send(False) + assert table == "table__f__l" + assert one_row["value"] == "b" + + # prevent descending into list of lists + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + rows_gen.send(True) + # yield "l" + next(rows_gen) + next(rows_gen) + next(rows_gen) + (table, _), one_row = rows_gen.send(True) + assert table == "table__f__lo" + # do not descend into lists + with pytest.raises(StopIteration): + rows_gen.send(False) + + def test_list_in_list() -> None: chats = { "_dlt_id": "123456", diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 64e90c7c21..2eb903f041 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -202,14 +202,17 @@ def test_diff_tables() -> None: # ignore identical table props existing = deepcopy(table) changed["write_disposition"] = "append" + changed["schema_contract"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", "description": "new description", "write_disposition": "append", + "schema_contract": "freeze", "columns": {} } existing["write_disposition"] = "append" + existing["schema_contract"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index 8b465d796e..f5f406a7a1 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -132,7 +132,7 @@ def test_simple_regex_validator() -> None: def test_load_corrupted_schema() -> None: - eth_v4: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v4") + eth_v4: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") del eth_v4["tables"]["blocks"] with pytest.raises(ParentTableNotFoundException): utils.validate_stored_schema(eth_v4) @@ -203,13 +203,21 @@ def test_replace_schema_content() -> None: eth_v5: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v5") eth_v5["imported_version_hash"] = "IMP_HASH" schema_eth = Schema.from_dict(eth_v5) # type: ignore[arg-type] - schema_eth.bump_version() schema.replace_schema_content(schema_eth) assert schema_eth.stored_version_hash == schema.stored_version_hash assert schema_eth.version == schema.version assert schema_eth.version_hash == schema.version_hash assert schema_eth._imported_version_hash == schema._imported_version_hash + # replace content of modified schema + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + assert schema_eth.version_hash != schema_eth.stored_version_hash + # replace content does not bump version + schema = Schema("simple") + schema.replace_schema_content(schema_eth) + assert schema.version_hash != schema.stored_version_hash + @pytest.mark.parametrize("columns,hint,value", [ (["_dlt_id", "_dlt_root_id", "_dlt_load_id", "_dlt_parent_id", "_dlt_list_idx"], "nullable", False), @@ -573,7 +581,7 @@ def assert_new_schema_values(schema: Schema) -> None: assert schema.stored_version == 1 assert schema.stored_version_hash is not None assert schema.version_hash is not None - assert schema.ENGINE_VERSION == 6 + assert schema.ENGINE_VERSION == 7 assert len(schema.settings["default_hints"]) > 0 # check settings assert utils.standard_type_detections() == schema.settings["detections"] == schema._type_detections diff --git a/tests/common/schema/test_schema_contract.py b/tests/common/schema/test_schema_contract.py new file mode 100644 index 0000000000..2f6b4743f3 --- /dev/null +++ b/tests/common/schema/test_schema_contract.py @@ -0,0 +1,324 @@ +from typing import cast + +import pytest +import copy + +from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE, TSchemaContractDict +from dlt.common.schema.exceptions import DataValidationError +from dlt.common.schema.typing import TTableSchema + +def get_schema() -> Schema: + s = Schema("event") + + columns = { + "column_1": { + "name": "column_1", + "data_type": "text" + }, + "column_2": { + "name": "column_2", + "data_type": "bigint", + "is_variant": True + } + } + + incomplete_columns = { + "incomplete_column_1": { + "name": "incomplete_column_1", + }, + "incomplete_column_2": { + "name": "incomplete_column_2", + } + } + + + # add some tables + s.update_table(cast(TTableSchema, { + "name": "tables", + "columns": columns + })) + + s.update_table(cast(TTableSchema, { + "name": "child_table", + "parent": "tables", + "columns": columns + })) + + s.update_table(cast(TTableSchema, { + "name": "incomplete_table", + "columns": incomplete_columns + })) + + s.update_table(cast(TTableSchema, { + "name": "mixed_table", + "columns": {**incomplete_columns, **columns} + })) + + s.update_table(cast(TTableSchema, { + "name": "evolve_once_table", + "x-normalizer": {"evolve-columns-once": True}, + "columns": {**incomplete_columns, **columns} + })) + + return s + + +def test_resolve_contract_settings() -> None: + + # defaults + schema = get_schema() + assert schema.resolve_contract_settings_for_table("tables") == DEFAULT_SCHEMA_CONTRACT_MODE + assert schema.resolve_contract_settings_for_table("child_table") == DEFAULT_SCHEMA_CONTRACT_MODE + + # table specific full setting + schema = get_schema() + schema.tables["tables"]["schema_contract"] = "freeze" + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } + + # table specific single setting + schema = get_schema() + schema.tables["tables"]["schema_contract"] = { + "tables": "freeze", + "columns": "discard_value", + } + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + + # schema specific full setting + schema = get_schema() + schema._settings["schema_contract"] = "freeze" + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } + + # schema specific single setting + schema = get_schema() + schema._settings["schema_contract"] = { + "tables": "freeze", + "columns": "discard_value", + } + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + + # mixed settings: table setting always prevails + schema = get_schema() + schema._settings["schema_contract"] = "freeze" + schema.tables["tables"]["schema_contract"] = { + "tables": "evolve", + "columns": "discard_value", + } + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "evolve", + "columns": "discard_value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "evolve", + "columns": "discard_value", + "data_type": "evolve" + } + + +# ensure other settings do not interfere with the main setting we are testing +base_settings = [{ + "tables": "evolve", + "columns": "evolve", + "data_type": "evolve" + }, { + "tables": "discard_row", + "columns": "discard_row", + "data_type": "discard_row" + }, { + "tables": "discard_value", + "columns": "discard_value", + "data_type": "discard_value" + }, { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } +] + + +@pytest.mark.parametrize("base_settings", base_settings) +def test_check_adding_table(base_settings) -> None: + + schema = get_schema() + new_table = copy.deepcopy(schema.tables["tables"]) + new_table["name"] = "new_table" + + # + # check adding new table + # + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), new_table) + assert (partial, filters) == (new_table, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), new_table) + assert (partial, filters) == (None, [("tables", "new_table", "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), new_table) + assert (partial, filters) == (None, [("tables", "new_table", "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, raise_on_freeze=False) + assert (partial, filters) == (None, [("tables", "new_table", "freeze")]) + + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, data_item={"item": 1}) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == "new_table" + assert val_ex.value.column_name is None + assert val_ex.value.contract_entity == "tables" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is None # there's no validating schema on new table + assert val_ex.value.data_item == {"item": 1} + + +@pytest.mark.parametrize("base_settings", base_settings) +def test_check_adding_new_columns(base_settings) -> None: + schema = get_schema() + + + def assert_new_column(table_update: TTableSchema, column_name: str) -> None: + popped_table_update = copy.deepcopy(table_update) + popped_table_update["columns"].pop(column_name) + + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "freeze")]) + + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), {column_name: 1}) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == table_update["name"] + assert val_ex.value.column_name == column_name + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema == schema.get_table(table_update["name"]) + assert val_ex.value.data_item == {column_name: 1} + + # + # check adding new column + # + table_update: TTableSchema = { + "name": "tables", + "columns": { + "new_column": { + "name": "new_column", + "data_type": "text" + } + } + } + assert_new_column(table_update, "new_column") + + # + # check adding new column if target column is not complete + # + table_update = { + "name": "mixed_table", + "columns": { + "incomplete_column_1": { + "name": "incomplete_column_1", + } + } + } + assert_new_column(table_update, "incomplete_column_1") + + # + # check x-normalize evolve_once behaving as evolve override + # + table_update = { + "name": "evolve_once_table", + "columns": { + "new_column": { + "name": "new_column", + "data_type": "text" + }, + "incomplete_column_1": { + "name": "incomplete_column_1", + } + } + } + partial, filters = schema.apply_schema_contract(base_settings, copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + + +def test_check_adding_new_variant() -> None: + schema = get_schema() + + # + # check adding new variant column + # + table_update: TTableSchema = { + "name": "tables", + "columns": { + "column_2_variant": { + "name": "column_2_variant", + "data_type": "bigint", + "variant": True + } + } + } + popped_table_update = copy.deepcopy(table_update) + popped_table_update["columns"].pop("column_2_variant") + + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "freeze")]) + + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == table_update["name"] + assert val_ex.value.column_name == "column_2_variant" + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema == schema.get_table(table_update["name"]) + assert val_ex.value.data_item is None # we do not pass it to apply_schema_contract + + # variants are not new columns - new data types + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + + # evolve once does not apply to variant evolution + table_update["name"] = "evolve_once_table" + with pytest.raises(DataValidationError): + schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 1bfaaa5da2..4e4278a539 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -83,10 +83,10 @@ def test_infer_column_bumps_version() -> None: def test_preserve_version_on_load() -> None: - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v6") - version = eth_v6["version"] - version_hash = eth_v6["version_hash"] - schema = Schema.from_dict(eth_v6) # type: ignore[arg-type] + eth_v7: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") + version = eth_v7["version"] + version_hash = eth_v7["version_hash"] + schema = Schema.from_dict(eth_v7) # type: ignore[arg-type] # version should not be bumped assert version_hash == schema._stored_version_hash assert version_hash == schema.version_hash @@ -95,8 +95,8 @@ def test_preserve_version_on_load() -> None: @pytest.mark.parametrize("remove_defaults", [True, False]) def test_version_preserve_on_reload(remove_defaults: bool) -> None: - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v6") - schema = Schema.from_dict(eth_v6) # type: ignore[arg-type] + eth_v7: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") + schema = Schema.from_dict(eth_v7) # type: ignore[arg-type] to_save_dict = schema.to_dict(remove_defaults=remove_defaults) assert schema.stored_version == to_save_dict["version"] diff --git a/tests/common/storages/test_schema_storage.py b/tests/common/storages/test_schema_storage.py index f45773e4f5..a4b6c5c89f 100644 --- a/tests/common/storages/test_schema_storage.py +++ b/tests/common/storages/test_schema_storage.py @@ -11,7 +11,7 @@ from dlt.common.storages import SchemaStorageConfiguration, SchemaStorage, LiveSchemaStorage, FileStorage from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT -from tests.common.utils import load_yml_case, yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V6 +from tests.common.utils import load_yml_case, yml_case_path, COMMON_TEST_CASES_PATH, IMPORTED_VERSION_HASH_ETH_V7 @pytest.fixture @@ -194,10 +194,10 @@ def test_save_store_schema_over_import(ie_storage: SchemaStorage) -> None: ie_storage.save_schema(schema) assert schema.version_hash == schema_hash # we linked schema to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 # load schema and make sure our new schema is here schema = ie_storage.load_schema("ethereum") - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 assert schema._stored_version_hash == schema_hash assert schema.version_hash == schema_hash # we have simple schema in export folder @@ -213,7 +213,7 @@ def test_save_store_schema_over_import_sync(synced_storage: SchemaStorage) -> No schema = Schema("ethereum") schema_hash = schema.version_hash synced_storage.save_schema(schema) - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 # import schema is overwritten fs = FileStorage(synced_storage.config.import_schema_path) exported_name = synced_storage._file_name_in_store("ethereum", "yaml") @@ -269,12 +269,12 @@ def test_schema_from_file() -> None: def prepare_import_folder(storage: SchemaStorage) -> None: - shutil.copy(yml_case_path("schemas/eth/ethereum_schema_v6"), os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml")) + shutil.copy(yml_case_path("schemas/eth/ethereum_schema_v7"), os.path.join(storage.storage.storage_path, "../import/ethereum.schema.yaml")) def assert_schema_imported(synced_storage: SchemaStorage, storage: SchemaStorage) -> Schema: prepare_import_folder(synced_storage) - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v6") + eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") schema = synced_storage.load_schema("ethereum") # is linked to imported schema schema._imported_version_hash = eth_v6["version_hash"] diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py index 6900c6fdcf..a4296279bf 100644 --- a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -1,7 +1,5 @@ from typing import List from fsspec import AbstractFileSystem -import pandas -from pyarrow import parquet from dlt.common import pendulum from dlt.common.storages import FilesystemConfiguration @@ -29,13 +27,16 @@ def assert_sample_files(all_file_items: List[FileItem], filesystem: AbstractFile assert content == f.read() # read via various readers if item["mime_type"] == "text/csv": - with file_dict.open() as f: - df = pandas.read_csv(f, header="infer") - assert len(df.to_dict(orient="records")) > 0 + # parse csv + with file_dict.open(mode="rt") as f: + from csv import DictReader + elements = list(DictReader(f)) + assert len(elements) > 0 if item["mime_type"] == "application/parquet": + # verify it is a real parquet with file_dict.open() as f: - table = parquet.ParquetFile(f).read() - assert len(table.to_pylist()) + parquet: bytes = f.read() + assert parquet.startswith(b"PAR1") if item["mime_type"].startswith("text"): with file_dict.open(mode="rt") as f_txt: lines = f_txt.readlines() diff --git a/tests/common/test_json.py b/tests/common/test_json.py index 983484d326..f6e9b06425 100644 --- a/tests/common/test_json.py +++ b/tests/common/test_json.py @@ -6,7 +6,7 @@ from dlt.common import json, Decimal, pendulum from dlt.common.arithmetics import numeric_default_context -from dlt.common.json import _DECIMAL, _WEI, custom_pua_decode, _orjson, _simplejson, SupportsJson, _DATETIME +from dlt.common.json import _DECIMAL, _WEI, custom_pua_decode, may_have_pua, _orjson, _simplejson, SupportsJson, _DATETIME from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED, JSON_TYPED_DICT_NESTED, JSON_TYPED_DICT_NESTED_DECODED @@ -250,6 +250,18 @@ def test_json_typed_encode(json_impl: SupportsJson) -> None: assert d_d == JSON_TYPED_DICT_DECODED +@pytest.mark.parametrize("json_impl", _JSON_IMPL) +def test_pua_detection(json_impl: SupportsJson) -> None: + with io.BytesIO() as b: + json_impl.typed_dump(JSON_TYPED_DICT, b) + content_b = b.getvalue() + assert may_have_pua(content_b) + with open(json_case_path("rasa_event_bot_metadata"), "rb") as f: + content_b = f.read() + assert not may_have_pua(content_b) + + + def test_load_and_compare_all_impls() -> None: with open(json_case_path("rasa_event_bot_metadata"), "rb") as f: content_b = f.read() diff --git a/tests/common/test_pydantic.py b/tests/common/test_pydantic.py deleted file mode 100644 index 770fcce6e5..0000000000 --- a/tests/common/test_pydantic.py +++ /dev/null @@ -1,134 +0,0 @@ -import pytest -from typing import Union, Optional, List, Dict, Any -from enum import Enum - -from datetime import datetime, date, time # noqa: I251 -from dlt.common import Decimal -from dlt.common import json - -from pydantic import BaseModel, Json, AnyHttpUrl -from dlt.common.libs.pydantic import pydantic_to_table_schema_columns - - -class StrEnum(str, Enum): - a = "a_value" - b = "b_value" - c = "c_value" - - -class IntEnum(int, Enum): - a = 0 - b = 1 - c = 2 - - -class MixedEnum(Enum): - a_int = 0 - b_str = "b_value" - c_int = 2 - - -class NestedModel(BaseModel): - nested_field: str - - -class Model(BaseModel): - bigint_field: int - text_field: str - timestamp_field: datetime - date_field: date - decimal_field: Decimal - double_field: float - time_field: time - - nested_field: NestedModel - list_field: List[str] - - union_field: Union[int, str] - - optional_field: Optional[float] - - blank_dict_field: dict # type: ignore[type-arg] - parametrized_dict_field: Dict[str, int] - - str_enum_field: StrEnum - int_enum_field: IntEnum - # Both of these shouold coerce to str - mixed_enum_int_field: MixedEnum - mixed_enum_str_field: MixedEnum - - json_field: Json[List[str]] - - url_field: AnyHttpUrl - - any_field: Any - json_any_field: Json[Any] - - - -@pytest.mark.parametrize('instance', [True, False]) -def test_pydantic_model_to_columns(instance: bool) -> None: - if instance: - model = Model( - bigint_field=1, text_field="text", timestamp_field=datetime.now(), - date_field=date.today(), decimal_field=Decimal(1.1), double_field=1.1, - time_field=time(1, 2, 3, 12345), - nested_field=NestedModel(nested_field="nested"), - list_field=["a", "b", "c"], - union_field=1, - optional_field=None, - blank_dict_field={}, - parametrized_dict_field={"a": 1, "b": 2, "c": 3}, - str_enum_field=StrEnum.a, - int_enum_field=IntEnum.a, - mixed_enum_int_field=MixedEnum.a_int, - mixed_enum_str_field=MixedEnum.b_str, - json_field=json.dumps(["a", "b", "c"]), # type: ignore[arg-type] - url_field="https://example.com", # type: ignore[arg-type] - any_field="any_string", - json_any_field=json.dumps("any_string"), - ) - else: - model = Model # type: ignore[assignment] - - result = pydantic_to_table_schema_columns(model) - - assert result["bigint_field"]["data_type"] == "bigint" - assert result["text_field"]["data_type"] == "text" - assert result["timestamp_field"]["data_type"] == "timestamp" - assert result["date_field"]["data_type"] == "date" - assert result["decimal_field"]["data_type"] == "decimal" - assert result["double_field"]["data_type"] == "double" - assert result["time_field"]["data_type"] == "time" - assert result["nested_field"]["data_type"] == "complex" - assert result['list_field']['data_type'] == 'complex' - assert result['union_field']['data_type'] == 'bigint' - assert result['optional_field']['data_type'] == 'double' - assert result['optional_field']['nullable'] is True - assert result['blank_dict_field']['data_type'] == 'complex' - assert result['parametrized_dict_field']['data_type'] == 'complex' - assert result['str_enum_field']['data_type'] == 'text' - assert result['int_enum_field']['data_type'] == 'bigint' - assert result['mixed_enum_int_field']['data_type'] == 'text' - assert result['mixed_enum_str_field']['data_type'] == 'text' - assert result['json_field']['data_type'] == 'complex' - assert result['url_field']['data_type'] == 'text' - - # Any type fields are excluded from schema - assert 'any_field' not in result - assert 'json_any_field' not in result - - -def test_pydantic_model_skip_complex_types() -> None: - result = pydantic_to_table_schema_columns(Model, skip_complex_types=True) - - assert result["bigint_field"]["data_type"] == "bigint" - - assert "nested_field" not in result - assert "list_field" not in result - assert "blank_dict_field" not in result - assert "parametrized_dict_field" not in result - assert "json_field" not in result - assert result["bigint_field"]["data_type"] == "bigint" - assert result["text_field"]["data_type"] == "text" - assert result["timestamp_field"]["data_type"] == "timestamp" diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index 399ab284ea..41d3d8d274 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -3,7 +3,7 @@ from dlt.common.configuration.specs.base_configuration import BaseConfiguration, get_config_if_union_hint from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults -from dlt.common.typing import StrAny, extract_inner_type, extract_optional_type, is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, is_typeddict +from dlt.common.typing import StrAny, extract_inner_type, extract_union_types, is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, is_typeddict, is_union_type @@ -15,6 +15,8 @@ class TTestTyDi(TypedDict): TOptionalLi = Optional[TTestLi] TOptionalTyDi = Optional[TTestTyDi] +TOptionalUnionLiTyDi = Optional[Union[TTestTyDi, TTestLi]] + def test_is_typeddict() -> None: assert is_typeddict(TTestTyDi) is True @@ -28,6 +30,7 @@ def test_is_list_generic_type() -> None: assert is_list_generic_type(List[str]) is True assert is_list_generic_type(Sequence[str]) is True assert is_list_generic_type(MutableSequence[str]) is True + assert is_list_generic_type(TOptionalUnionLiTyDi) is False # type: ignore[arg-type] def test_is_dict_generic_type() -> None: @@ -46,8 +49,19 @@ def test_optional() -> None: assert is_optional_type(TOptionalLi) is True # type: ignore[arg-type] assert is_optional_type(TOptionalTyDi) is True # type: ignore[arg-type] assert is_optional_type(TTestTyDi) is False - assert extract_optional_type(TOptionalLi) is TTestLi # type: ignore[arg-type] - assert extract_optional_type(TOptionalTyDi) is TTestTyDi # type: ignore[arg-type] + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] # type: ignore[arg-type] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] # type: ignore[arg-type] + + +def test_union_types() -> None: + assert is_optional_type(TOptionalLi) is True # type: ignore[arg-type] + assert is_optional_type(TOptionalTyDi) is True # type: ignore[arg-type] + assert is_optional_type(TTestTyDi) is False + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] # type: ignore[arg-type] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] # type: ignore[arg-type] + assert is_optional_type(TOptionalUnionLiTyDi) is True # type: ignore[arg-type] + assert extract_union_types(TOptionalUnionLiTyDi) == [TTestTyDi, TTestLi, type(None)] # type: ignore[arg-type] + assert is_union_type(MutableSequence[str]) is False def test_is_newtype() -> None: diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index 4583da3a1e..0a034dc72f 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -1,7 +1,7 @@ from copy import deepcopy import pytest import yaml -from typing import Dict, List, Literal, Mapping, Sequence, TypedDict, Optional +from typing import Dict, List, Literal, Mapping, Sequence, TypedDict, Optional, Union from dlt.common import json from dlt.common.exceptions import DictValidationException @@ -10,8 +10,12 @@ from dlt.common.typing import DictStrStr, StrStr from dlt.common.validation import validate_dict, validate_dict_ignoring_xkeys + + TLiteral = Literal["uno", "dos", "tres"] +class TDict(TypedDict): + field: TLiteral class TTestRecord(TypedDict): f_bool: bool @@ -31,6 +35,7 @@ class TTestRecord(TypedDict): f_literal: TLiteral f_literal_optional: Optional[TLiteral] f_seq_literal: Sequence[Optional[TLiteral]] + f_optional_union: Optional[Union[TLiteral, TDict]] TEST_COL: TColumnSchema = { @@ -74,7 +79,8 @@ class TTestRecord(TypedDict): "f_column": deepcopy(TEST_COL), "f_literal": "uno", "f_literal_optional": "dos", - "f_seq_literal": ["uno", "dos", "tres"] + "f_seq_literal": ["uno", "dos", "tres"], + "f_optional_union": {"field": "uno"} } @pytest.fixture @@ -83,7 +89,7 @@ def test_doc() -> TTestRecord: def test_validate_schema_cases() -> None: - with open("tests/common/cases/schemas/eth/ethereum_schema_v4.yml", mode="r", encoding="utf-8") as f: + with open("tests/common/cases/schemas/eth/ethereum_schema_v7.yml", mode="r", encoding="utf-8") as f: schema_dict: TStoredSchema = yaml.safe_load(f) validate_dict_ignoring_xkeys( @@ -227,3 +233,23 @@ def test_filter(test_doc: TTestRecord) -> None: test_doc["x-extra"] = "x-annotation" # type: ignore[typeddict-unknown-key] # remove x-extra with a filter validate_dict(TTestRecord, test_doc, ".", filter_f=lambda k: k != "x-extra") + + +def test_nested_union(test_doc: TTestRecord) -> None: + test_doc["f_optional_union"] = {"field": "uno"} + validate_dict(TTestRecord, TEST_DOC, ".") + + test_doc["f_optional_union"] = {"field": "not valid"} # type: ignore[typeddict-item] + with pytest.raises(DictValidationException) as e: + validate_dict(TTestRecord, test_doc, ".") + assert e.value.field == "f_optional_union" + assert e.value.value == {'field': 'not valid'} + + test_doc["f_optional_union"] = "dos" + validate_dict(TTestRecord, test_doc, ".") + + test_doc["f_optional_union"] = "blah" # type: ignore[typeddict-item] + with pytest.raises(DictValidationException) as e: + validate_dict(TTestRecord, test_doc, ".") + assert e.value.field == "f_optional_union" + assert e.value.value == "blah" \ No newline at end of file diff --git a/tests/common/utils.py b/tests/common/utils.py index 54a48825af..d612dcbdcf 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -16,7 +16,7 @@ COMMON_TEST_CASES_PATH = "./tests/common/cases/" # for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V6 = "Q/LxiP7taycE+u9PQNb2wiit+G5GntiifOUK2CFM3sQ=" +IMPORTED_VERSION_HASH_ETH_V7 = "yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE=" # test sentry DSN TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" # preserve secrets path to be able to restore it diff --git a/tests/conftest.py b/tests/conftest.py index 56760508da..8a14fa1550 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,7 +59,7 @@ def _create_pipeline_instance_id(self) -> str: Pipeline._create_pipeline_instance_id = _create_pipeline_instance_id # type: ignore[method-assign] # push sentry to ci - os.environ["RUNTIME__SENTRY_DSN"] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + # os.environ["RUNTIME__SENTRY_DSN"] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" # disable sqlfluff logging for log in ["sqlfluff.parser", "sqlfluff.linter", "sqlfluff.templater", "sqlfluff.lexer"]: diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index b5d54f9c49..5a8db47163 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -1,328 +1,428 @@ -version: 11 -version_hash: GPHX4B+0xnRuGZM/w3UYVbldRyg8jSJp1G60K4RDcZg= -engine_version: 5 +version: 14 +version_hash: VuzNqiLOk7XuPxYLndMFMPHTDVItKU5ayiy70nQLdus= +engine_version: 7 name: ethereum tables: _dlt_loads: columns: load_id: - data_type: text nullable: false - schema_name: data_type: text + name: load_id + schema_name: nullable: true + data_type: text + name: schema_name status: - data_type: bigint nullable: false + data_type: bigint + name: status inserted_at: - data_type: timestamp nullable: false + data_type: timestamp + name: inserted_at + schema_version_hash: + nullable: true + data_type: text + name: schema_version_hash write_disposition: skip description: Created by DLT. Tracks completed loads + schema_contract: {} + name: _dlt_loads + resource: _dlt_loads _dlt_version: columns: version: - data_type: bigint nullable: false - engine_version: data_type: bigint + name: version + engine_version: nullable: false + data_type: bigint + name: engine_version inserted_at: - data_type: timestamp nullable: false + data_type: timestamp + name: inserted_at schema_name: - data_type: text nullable: false - version_hash: data_type: text + name: schema_name + version_hash: nullable: false - schema: data_type: text + name: version_hash + schema: nullable: false + data_type: text + name: schema write_disposition: skip description: Created by DLT. Tracks schema updates + schema_contract: {} + name: _dlt_version + resource: _dlt_version blocks: description: Ethereum blocks x-annotation: this will be preserved on save write_disposition: append - table_sealed: true filters: includes: [] excludes: [] columns: _dlt_load_id: + nullable: false description: load id coming from the extractor data_type: text - nullable: false + name: _dlt_load_id _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id number: + nullable: false primary_key: true data_type: bigint - nullable: false + name: number parent_hash: - data_type: text nullable: true + data_type: text + name: parent_hash hash: + nullable: false cluster: true unique: true data_type: text - nullable: false + name: hash base_fee_per_gas: - data_type: wei nullable: false - difficulty: data_type: wei + name: base_fee_per_gas + difficulty: nullable: false + data_type: wei + name: difficulty extra_data: - data_type: text nullable: true + data_type: text + name: extra_data gas_limit: - data_type: bigint nullable: false - gas_used: data_type: bigint + name: gas_limit + gas_used: nullable: false + data_type: bigint + name: gas_used logs_bloom: - data_type: binary nullable: true + data_type: binary + name: logs_bloom miner: - data_type: text nullable: true - mix_hash: data_type: text + name: miner + mix_hash: nullable: true - nonce: data_type: text + name: mix_hash + nonce: nullable: true - receipts_root: data_type: text + name: nonce + receipts_root: nullable: true - sha3_uncles: data_type: text + name: receipts_root + sha3_uncles: nullable: true + data_type: text + name: sha3_uncles size: - data_type: bigint nullable: true + data_type: bigint + name: size state_root: - data_type: text nullable: false + data_type: text + name: state_root timestamp: + nullable: false unique: true sort: true data_type: timestamp - nullable: false + name: timestamp total_difficulty: - data_type: wei nullable: true + data_type: wei + name: total_difficulty transactions_root: - data_type: text nullable: false + data_type: text + name: transactions_root + schema_contract: {} + name: blocks + resource: blocks blocks__transactions: parent: blocks columns: _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id block_number: + nullable: false primary_key: true foreign_key: true data_type: bigint - nullable: false + name: block_number transaction_index: + nullable: false primary_key: true data_type: bigint - nullable: false + name: transaction_index hash: + nullable: false unique: true data_type: text - nullable: false + name: hash block_hash: + nullable: false cluster: true data_type: text - nullable: false + name: block_hash block_timestamp: + nullable: false sort: true data_type: timestamp - nullable: false + name: block_timestamp chain_id: - data_type: text nullable: true - from: data_type: text + name: chain_id + from: nullable: true + data_type: text + name: from gas: - data_type: bigint nullable: true - gas_price: data_type: bigint + name: gas + gas_price: nullable: true + data_type: bigint + name: gas_price input: - data_type: text nullable: true + data_type: text + name: input max_fee_per_gas: - data_type: wei nullable: true - max_priority_fee_per_gas: data_type: wei + name: max_fee_per_gas + max_priority_fee_per_gas: nullable: true + data_type: wei + name: max_priority_fee_per_gas nonce: - data_type: bigint nullable: true + data_type: bigint + name: nonce r: - data_type: text nullable: true - s: data_type: text + name: r + s: nullable: true + data_type: text + name: s status: - data_type: bigint nullable: true + data_type: bigint + name: status to: - data_type: text nullable: true - type: data_type: text + name: to + type: nullable: true + data_type: text + name: type v: - data_type: bigint nullable: true + data_type: bigint + name: v value: - data_type: wei nullable: false + data_type: wei + name: value eth_value: - data_type: decimal nullable: true + data_type: decimal + name: eth_value + name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions columns: _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id address: - data_type: text nullable: false + data_type: text + name: address block_timestamp: + nullable: false sort: true data_type: timestamp - nullable: false + name: block_timestamp block_hash: + nullable: false cluster: true data_type: text - nullable: false + name: block_hash block_number: + nullable: false primary_key: true foreign_key: true data_type: bigint - nullable: false + name: block_number transaction_index: + nullable: false primary_key: true foreign_key: true data_type: bigint - nullable: false + name: transaction_index log_index: + nullable: false primary_key: true data_type: bigint - nullable: false + name: log_index data: - data_type: text nullable: true + data_type: text + name: data removed: - data_type: bool nullable: true + data_type: bool + name: removed transaction_hash: - data_type: text nullable: false + data_type: text + name: transaction_hash + name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + data_type: text + name: value + name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id address: - data_type: text nullable: true + data_type: text + name: address + name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + data_type: text + name: value + name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + data_type: text + name: value + name: blocks__uncles settings: - schema_sealed: true default_hints: foreign_key: - _dlt_parent_id @@ -342,6 +442,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp + schema_contract: {} normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index 05e3a2fbf3..28f3d34dcf 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -17,15 +17,18 @@ from dlt.common.schema import Schema from dlt.common.schema.utils import new_table, new_column from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.schema.exceptions import InvalidSchemaName +from dlt.common.typing import TDataItem from dlt.cli.source_detection import detect_source_configs -from dlt.common.typing import TDataItem -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, DynamicNameNotStandaloneResource, ExplicitSourceNameInvalid, InconsistentTableTemplate, InvalidResourceDataTypeFunctionNotAGenerator, InvalidResourceDataTypeIsNone, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, PipeGenInvalid, PipeNotBoundToData, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable -from dlt.extract.source import DltResource, DltSource -from dlt.common.schema.exceptions import InvalidSchemaName +from dlt.extract import DltResource, DltSource +from dlt.extract.exceptions import (DynamicNameNotStandaloneResource, InvalidResourceDataTypeFunctionNotAGenerator, + InvalidResourceDataTypeIsNone, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, + PipeGenInvalid, PipeNotBoundToData, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, + SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable) from dlt.extract.typing import TableNameMeta -from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V6 +from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7 def test_none_returning_source() -> None: @@ -72,7 +75,7 @@ def test_load_schema_for_callable() -> None: schema = s.schema assert schema.name == "ethereum" == s.name # the schema in the associated file has this hash - assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema.stored_version_hash == IMPORTED_VERSION_HASH_ETH_V7 def test_unbound_parametrized_transformer() -> None: diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index c487d19aa1..7ed74b41f2 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -1,8 +1,9 @@ import dlt from dlt.common import json from dlt.common.storages import NormalizeStorageConfiguration + +from dlt.extract import DltResource, DltSource from dlt.extract.extract import ExtractorStorage, extract -from dlt.extract.source import DltResource, DltSource from tests.utils import clean_test_storage from tests.extract.utils import expect_extracted_file @@ -18,13 +19,11 @@ def expect_tables(resource: DltResource) -> dlt.Schema: storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - # odd and even tables - assert len(schema_update) == 2 - assert "odd_table" in schema_update - assert "even_table" in schema_update - for partials in schema_update.values(): - assert len(partials) == 1 + extract(extract_id, source, storage) + # odd and even tables must be in the source schema + assert len(source.schema.data_tables(include_incomplete=True)) == 2 + assert "odd_table" in source.schema._schema_tables + assert "even_table" in source.schema._schema_tables # you must commit the files assert len(storage.list_files_to_normalize_sorted()) == 0 storage.commit_extract_files(extract_id) @@ -42,11 +41,9 @@ def expect_tables(resource: DltResource) -> dlt.Schema: source = source.with_resources(resource.name) source.selected_resources[resource.name].bind(10).select_tables("odd_table") extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - assert len(schema_update) == 1 - assert "odd_table" in schema_update - for partials in schema_update.values(): - assert len(partials) == 1 + extract(extract_id, source, storage) + assert len(source.schema.data_tables(include_incomplete=True)) == 1 + assert "odd_table" in source.schema._schema_tables storage.commit_extract_files(extract_id) assert len(storage.list_files_to_normalize_sorted()) == 1 expect_extracted_file(storage, "selectables", "odd_table", json.dumps([1,3,5,7,9])) @@ -86,10 +83,10 @@ def input_gen(): source = DltSource("selectables", "module", dlt.Schema("selectables"), [input_r, input_r.with_name("gen_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) + extract(extract_id, source, storage) # both tables got generated - assert "input_gen" in schema_update - assert "gen_clone" in schema_update + assert "input_gen" in source.schema._schema_tables + assert "gen_clone" in source.schema._schema_tables def test_extract_renamed_clone_and_parent(): @@ -105,8 +102,8 @@ def tx_step(item): source = DltSource("selectables", "module", dlt.Schema("selectables"), [input_r, (input_r | input_tx).with_name("tx_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - assert "input_gen" in schema_update - assert "tx_clone" in schema_update + extract(extract_id, source, storage) + assert "input_gen" in source.schema._schema_tables + assert "tx_clone" in source.schema._schema_tables # mind that pipe name of the evaluated parent will have different name than the resource assert source.tx_clone._pipe.parent.name == "input_gen_tx_clone" diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index 9d5b37f472..d03b125777 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -17,16 +17,17 @@ from dlt.common.utils import uniq_id, digest128, chunks from dlt.common.json import json -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.sources.helpers.transform import take_first -from dlt.extract.incremental import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing +from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing from dlt.pipeline.exceptions import PipelineStepFailed -from tests.extract.utils import AssertItems, data_to_item_format, TItemFormat, ALL_ITEM_FORMATS, data_item_to_list +from tests.extract.utils import AssertItems, data_item_to_list +from tests.utils import data_to_item_format, TDataItemFormat, ALL_DATA_ITEM_FORMATS -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_single_items_last_value_state_is_updated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_single_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: data = [ {'created_at': 425}, {'created_at': 426}, @@ -42,8 +43,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 426 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_single_items_last_value_state_is_updated_transformer(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_single_items_last_value_state_is_updated_transformer(item_type: TDataItemFormat) -> None: data = [ {'created_at': 425}, {'created_at': 426}, @@ -61,8 +62,8 @@ def some_data(item, created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 426 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_batch_items_last_value_state_is_updated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_batch_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: data1 = [{'created_at': i} for i in range(5)] data2 = [{'created_at': i} for i in range(5, 10)] @@ -81,8 +82,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 9 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_last_value_access_in_resource(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_last_value_access_in_resource(item_type: TDataItemFormat) -> None: values = [] data = [{'created_at': i} for i in range(6)] @@ -100,8 +101,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert values == [None, 5] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_keys_are_deduplicated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_keys_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ {'created_at': 1, 'id': 'a'}, {'created_at': 2, 'id': 'b'}, @@ -127,9 +128,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items2 p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - - p.run(some_data()) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: @@ -138,8 +138,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert rows == [(1, 'a'), (2, 'b'), (3, 'c'), (3, 'd'), (3, 'e'), (3, 'f'), (4, 'g')] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_rows_by_hash_are_deduplicated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_rows_by_hash_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ {'created_at': 1, 'id': 'a'}, {'created_at': 2, 'id': 'b'}, @@ -166,8 +166,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items2 p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - p.run(some_data()) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: @@ -189,7 +189,7 @@ def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): @pytest.mark.parametrize("item_type", ["arrow", "pandas"]) -def test_nested_cursor_path_arrow_fails(item_type: TItemFormat) -> None: +def test_nested_cursor_path_arrow_fails(item_type: TDataItemFormat) -> None: data = [ {'data': {'items': [{'created_at': 2}]}} ] @@ -208,8 +208,8 @@ def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): assert ex.exception.json_path == "data.items[0].created_at" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_explicit_initial_value(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_explicit_initial_value(item_type: TDataItemFormat) -> None: @dlt.resource def some_data(created_at=dlt.sources.incremental('created_at')): data = [{"created_at": created_at.last_value}] @@ -222,8 +222,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 4242 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_explicit_incremental_instance(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_explicit_incremental_instance(item_type: TDataItemFormat) -> None: data = [{'inserted_at': 242, 'some_uq': 444}] source_items = data_to_item_format(item_type, data) @@ -238,7 +238,7 @@ def some_data(incremental=dlt.sources.incremental('created_at', initial_value=0) @dlt.resource -def some_data_from_config(call_no: int, item_type: TItemFormat, created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value): +def some_data_from_config(call_no: int, item_type: TDataItemFormat, created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value): assert created_at.cursor_path == 'created_at' # start value will update to the last_value on next call if call_no == 1: @@ -252,8 +252,8 @@ def some_data_from_config(call_no: int, item_type: TItemFormat, created_at: Opti yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_incremental_from_config(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_incremental_from_config(item_type: TDataItemFormat) -> None: os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__CURSOR_PATH'] = 'created_at' os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2022-02-03T00:00:00Z' @@ -263,8 +263,8 @@ def test_optional_incremental_from_config(item_type: TItemFormat) -> None: p.extract(some_data_from_config(2, item_type)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_incremental_not_passed(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_incremental_not_passed(item_type: TDataItemFormat) -> None: """Resource still runs when no incremental is passed""" data = [1,2,3] source_items = data_to_item_format(item_type, data) @@ -283,15 +283,15 @@ class OptionalIncrementalConfig(BaseConfiguration): @dlt.resource(spec=OptionalIncrementalConfig) -def optional_incremental_arg_resource(item_type: TItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None) -> Any: +def optional_incremental_arg_resource(item_type: TDataItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None) -> Any: data = [1,2,3] source_items = data_to_item_format(item_type, data) assert incremental is None yield source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_arg_from_spec_not_passed(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_arg_from_spec_not_passed(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(optional_incremental_arg_resource(item_type)) @@ -303,7 +303,7 @@ class SomeDataOverrideConfiguration(BaseConfiguration): # provide what to inject via spec. the spec contain the default @dlt.resource(spec=SomeDataOverrideConfiguration) -def some_data_override_config(item_type: TItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value): +def some_data_override_config(item_type: TDataItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value): assert created_at.cursor_path == 'created_at' assert created_at.initial_value == '2000-02-03T00:00:00Z' data = [{'created_at': '2023-03-03T00:00:00Z'}] @@ -311,8 +311,8 @@ def some_data_override_config(item_type: TItemFormat, created_at: dlt.sources.in yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_override_initial_value_from_config(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_override_initial_value_from_config(item_type: TDataItemFormat) -> None: # use the shortest possible config version # os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_OVERRIDE_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' os.environ['CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' @@ -321,8 +321,8 @@ def test_override_initial_value_from_config(item_type: TItemFormat) -> None: p.extract(some_data_override_config(item_type)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_override_primary_key_in_pipeline(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_override_primary_key_in_pipeline(item_type: TDataItemFormat) -> None: """Primary key hint passed to pipeline is propagated through apply_hints """ data = [ @@ -342,8 +342,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): p.extract(some_data, primary_key=['id', 'other_id']) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_composite_primary_key(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_composite_primary_key(item_type: TDataItemFormat) -> None: data = [ {'created_at': 1, 'isrc': 'AAA', 'market': 'DE'}, {'created_at': 2, 'isrc': 'BBB', 'market': 'DE'}, @@ -360,7 +360,7 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, isrc, market FROM some_data order by created_at, isrc, market") as cur: @@ -370,8 +370,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert set(rows) == expected -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_last_value_func_min(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_last_value_func_min(item_type: TDataItemFormat) -> None: data = [ {'created_at': 10}, {'created_at': 11}, @@ -410,8 +410,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', last_value_func=l assert s['last_value'] == 11 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_cursor_datetime_type(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_cursor_datetime_type(item_type: TDataItemFormat) -> None: initial_value = pendulum.now() data = [ {'created_at': initial_value + timedelta(minutes=1)}, @@ -434,8 +434,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', initial_value)): assert s['last_value'] == initial_value + timedelta(minutes=4) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_descending_order_unique_hashes(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_descending_order_unique_hashes(item_type: TDataItemFormat) -> None: """Resource returns items in descending order but using `max` last value function. Only hash matching last_value are stored. """ @@ -459,8 +459,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', 20)): assert list(some_data()) == [] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_keys_json_identifiers(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_keys_json_identifiers(item_type: TDataItemFormat) -> None: """Uses primary key name that is matching the name of the JSON element in the original namespace but gets converted into destination namespace""" @dlt.resource(primary_key="DelTa") @@ -492,8 +492,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): assert rows2[-1][0] == 9 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_missing_primary_key(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_missing_primary_key(item_type: TDataItemFormat) -> None: @dlt.resource(primary_key="DELTA") def some_data(last_timestamp=dlt.sources.incremental("ts")): @@ -506,8 +506,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): assert py_ex.value.primary_key_column == "DELTA" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_missing_cursor_field(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_missing_cursor_field(item_type: TDataItemFormat) -> None: os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately @dlt.resource def some_data(last_timestamp=dlt.sources.incremental("item.timestamp")): @@ -566,12 +566,12 @@ def some_data(last_timestamp: dlt.sources.incremental[float] = dlt.sources.incre assert list(some_data(last_timestamp=None)) == [1] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_filter_processed_items(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_filter_processed_items(item_type: TDataItemFormat) -> None: """Checks if already processed items are filtered out""" @dlt.resource - def standalone_some_data(item_type: TItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): + def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] source_items = data_to_item_format(item_type, data) yield from source_items @@ -628,8 +628,8 @@ def some_data(step, last_timestamp=dlt.sources.incremental("ts")): p.run(r, destination="duckdb") -@pytest.mark.parametrize("item_type", set(ALL_ITEM_FORMATS) - {'json'}) -def test_start_value_set_to_last_value_arrow(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", set(ALL_DATA_ITEM_FORMATS) - {'json'}) +def test_start_value_set_to_last_value_arrow(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb') now = pendulum.now() @@ -655,13 +655,13 @@ def some_data(first: bool, last_timestamp=dlt.sources.incremental("ts")): p.run(some_data(False)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_replace_resets_state(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_replace_resets_state(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") now = pendulum.now() @dlt.resource - def standalone_some_data(item_type: TItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): + def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] source_items = data_to_item_format(item_type, data) yield from source_items @@ -688,10 +688,12 @@ def child(item): info = p.run(child, write_disposition="replace") # print(info.load_packages[0]) assert len(info.loads_ids) == 1 - # pipeline applied hints to the child resource - assert child.write_disposition == "replace" + # pipeline applied hints to the child resource but it was placed into source first + # so the original is still "append" + assert child.write_disposition == "append" # create a source where we place only child + child.write_disposition = "replace" s = DltSource("comp", "section", Schema("comp"), [child]) # but extracted resources will include its parent where it derives write disposition from child extracted = s.resources.extracted @@ -726,8 +728,8 @@ def child(item): assert extracted[child._pipe.parent.name].write_disposition == "append" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_incremental_as_transform(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_incremental_as_transform(item_type: TDataItemFormat) -> None: now = pendulum.now().timestamp() @@ -749,8 +751,8 @@ def some_data(): assert len(info.loads_ids) == 1 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_incremental_explicit_disable_unique_check(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_incremental_explicit_disable_unique_check(item_type: TDataItemFormat) -> None: @dlt.resource(primary_key="delta") def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): data = [{"delta": i, "ts": pendulum.now().timestamp()} for i in range(-10, 10)] @@ -764,8 +766,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): assert s.state["incremental"]["ts"]["unique_hashes"] == [] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_apply_hints_incremental(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_apply_hints_incremental(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id()) data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] @@ -879,7 +881,7 @@ def some_data(updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sourc @dlt.resource def endless_sequence( - item_type: TItemFormat, + item_type: TDataItemFormat, updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=1) ) -> Any: max_values = 20 @@ -889,8 +891,8 @@ def endless_sequence( yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_chunked_ranges(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_chunked_ranges(item_type: TDataItemFormat) -> None: """Load chunked ranges with end value along with incremental""" pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') @@ -933,8 +935,8 @@ def test_chunked_ranges(item_type: TItemFormat) -> None: assert items == expected_range -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_end_value_with_batches(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_end_value_with_batches(item_type: TDataItemFormat) -> None: """Ensure incremental with end_value works correctly when resource yields lists instead of single items""" @dlt.resource def batched_sequence( @@ -969,8 +971,8 @@ def batched_sequence( assert items == list(range(1, 14)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_load_with_end_value_does_not_write_state(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_load_with_end_value_does_not_write_state(item_type: TDataItemFormat) -> None: """When loading chunk with initial/end value range. The resource state is untouched. """ pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') @@ -980,8 +982,8 @@ def test_load_with_end_value_does_not_write_state(item_type: TItemFormat) -> Non assert pipeline.state.get('sources') is None -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_end_value_initial_value_errors(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_end_value_initial_value_errors(item_type: TDataItemFormat) -> None: @dlt.resource def some_data( updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at') @@ -1016,8 +1018,8 @@ def custom_last_value(items): assert "The result of 'custom_last_value([end_value, initial_value])' must equal 'end_value'" in str(ex.value) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_out_of_range_flags(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_out_of_range_flags(item_type: TDataItemFormat) -> None: """Test incremental.start_out_of_range / end_out_of_range flags are set when items are filtered out""" @dlt.resource def descending( @@ -1085,8 +1087,8 @@ def ascending_single_item( pipeline.extract(ascending_single_item()) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_get_incremental_value_type(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_get_incremental_value_type(item_type: TDataItemFormat) -> None: assert dlt.sources.incremental("id").get_incremental_value_type() is Any assert dlt.sources.incremental("id", initial_value=0).get_incremental_value_type() is int assert dlt.sources.incremental("id", initial_value=None).get_incremental_value_type() is Any @@ -1146,8 +1148,8 @@ def test_type_5(updated_at = dlt.sources.incremental("updated_at", allow_externa assert r.incremental._incremental.get_incremental_value_type() is Any -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_join_env_scheduler(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_join_env_scheduler(item_type: TDataItemFormat) -> None: @dlt.resource def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): data = [{"updated_at": d} for d in [1, 2, 3]] @@ -1165,8 +1167,8 @@ def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.increment assert data_item_to_list(item_type, result) == [{'updated_at': 2}] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_join_env_scheduler_pipeline(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_join_env_scheduler_pipeline(item_type: TDataItemFormat) -> None: @dlt.resource def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): data = [{"updated_at": d} for d in [1, 2, 3]] @@ -1194,8 +1196,8 @@ def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.increment pipeline.extract(r) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_allow_external_schedulers(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_allow_external_schedulers(item_type: TDataItemFormat) -> None: @dlt.resource() def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at")): data = [{"updated_at": d} for d in [1, 2, 3]] diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index d8223f2ee8..130e0a8d93 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -9,10 +9,14 @@ from dlt.common.pipeline import StateInjectableContext, source_state from dlt.common.schema import Schema from dlt.common.typing import TDataItems -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, ResourcesNotFoundError + +from dlt.extract import DltResource, DltSource, Incremental +from dlt.extract.source import DltResourceDict +from dlt.extract.exceptions import (DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, + InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, + InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, + ParametrizedResourceUnbound, ResourcesNotFoundError) from dlt.extract.pipe import Pipe -from dlt.extract.typing import FilterItem, MapItem -from dlt.extract.source import DltResource, DltResourceDict, DltSource def test_call_data_resource() -> None: @@ -1147,7 +1151,7 @@ def empty_gen(): empty_r = empty() # check defaults assert empty_r.name == empty.name == empty_r.table_name == empty.table_name == "empty_gen" - assert empty_r._table_schema_template is None + # assert empty_r._table_schema_template is None assert empty_r.compute_table_schema() == empty_table_schema assert empty_r.write_disposition == "append" @@ -1160,7 +1164,7 @@ def empty_gen(): empty_r.write_disposition = "append" assert empty_r.compute_table_schema()["write_disposition"] == "append" - empty_r.apply_hints(table_name="table", parent_table_name="parent", primary_key=["a", "b"], merge_key=["c", "a"]) + empty_r.apply_hints(table_name="table", parent_table_name="parent", primary_key=["a", "b"], merge_key=["c", "a"], schema_contract="freeze") table = empty_r.compute_table_schema() assert table["columns"]["a"] == {'merge_key': True, 'name': 'a', 'nullable': False, 'primary_key': True} assert table["columns"]["b"] == {'name': 'b', 'nullable': False, 'primary_key': True} @@ -1168,10 +1172,11 @@ def empty_gen(): assert table["name"] == "table" assert table["parent"] == "parent" assert empty_r.table_name == "table" + assert table["schema_contract"] == "freeze" # reset - empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}) - assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append'} + empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}, incremental=Incremental.EMPTY, schema_contract={}) + assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append', 'original_columns': {}} table = empty_r.compute_table_schema() assert table["name"] == "empty_gen" assert "parent" not in table diff --git a/tests/extract/test_validation.py b/tests/extract/test_validation.py index 64e06bcecc..db39530567 100644 --- a/tests/extract/test_validation.py +++ b/tests/extract/test_validation.py @@ -1,15 +1,19 @@ """Tests for resource validation with pydantic schema """ import typing as t - import pytest + import dlt -from dlt.extract.typing import ValidateItem +from dlt.common import json +from dlt.common.schema.exceptions import DataValidationError from dlt.common.typing import TDataItems -from dlt.extract.validation import PydanticValidator -from dlt.extract.exceptions import ValidationError, ResourceExtractionError +from dlt.common.libs.pydantic import BaseModel -from pydantic import BaseModel +from dlt.extract import DltResource +from dlt.extract.typing import ValidateItem +from dlt.extract.validation import PydanticValidator +from dlt.extract.exceptions import ResourceExtractionError +from dlt.pipeline.exceptions import PipelineStepFailed class SimpleModel(BaseModel): @@ -30,7 +34,8 @@ def some_data() -> t.Iterator[TDataItems]: # Items are passed through model data = list(some_data()) - assert data == [SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")] + # compare content-wise. model names change due to extra settings on columns + assert json.dumpb(data) == json.dumpb([SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -50,7 +55,7 @@ def some_data() -> t.Iterator[TDataItems]: # Items are passed through model data = list(resource) - assert data == [SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")] + assert json.dumpb(data) == json.dumpb([SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -68,7 +73,7 @@ def some_data() -> t.Iterator[TDataItems]: resource.validator = None data = list(resource) - assert data == [{"a": 1, "b": "2"}, {"a": 2, "b": "3"}] + assert json.dumpb(data) == json.dumpb([{"a": 1, "b": "2"}, {"a": 2, "b": "3"}]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -94,14 +99,15 @@ class AnotherModel(BaseModel): data = list(resource) # Items are validated with the new model - assert data == [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) # Ensure only one validator is applied in steps steps = resource._pipe.steps assert len(steps) == 2 assert isinstance(steps[-1], ValidateItem) - assert steps[-1].model is AnotherModel # type: ignore[attr-defined] + # model name will change according to extra items handling + assert steps[-1].model.__name__.startswith(AnotherModel.__name__) # type: ignore[attr-defined] @pytest.mark.parametrize("yield_list", [True, False]) @@ -117,24 +123,24 @@ def some_data() -> t.Iterator[TDataItems]: resource = some_data() - assert isinstance(resource.validator, PydanticValidator) and resource.validator.model is SimpleModel + assert isinstance(resource.validator, PydanticValidator) and resource.validator.model.__name__.startswith(SimpleModel.__name__) class AnotherModel(BaseModel): a: int b: str c: float = 0.5 - resource.validator = PydanticValidator(AnotherModel) + resource.validator = PydanticValidator(AnotherModel, column_mode="freeze", data_mode="freeze") - assert resource.validator and resource.validator.model is AnotherModel + assert resource.validator and resource.validator.model.__name__.startswith(AnotherModel.__name__) data = list(resource) # Items are validated with the new model - assert data == [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) @pytest.mark.parametrize("yield_list", [True, False]) -def test_failed_validation(yield_list: bool) -> None: +def test_default_validation(yield_list: bool) -> None: @dlt.resource(columns=SimpleModel) def some_data() -> t.Iterator[TDataItems]: # yield item that fails schema validation @@ -144,9 +150,94 @@ def some_data() -> t.Iterator[TDataItems]: else: yield from items + # some_data must have default Pydantic schema contract + assert some_data().schema_contract == {"tables": "evolve", "columns": "discard_value", "data_type": "freeze"} + # extraction fails with ValidationError with pytest.raises(ResourceExtractionError) as exinfo: list(some_data()) - assert isinstance(exinfo.value.__cause__, ValidationError) - assert str(PydanticValidator(SimpleModel)) in str(exinfo.value) + val_ex = exinfo.value.__cause__ + assert isinstance(val_ex, DataValidationError) + assert val_ex.schema_name is None + assert val_ex.table_name == "some_data" + assert val_ex.column_name == "('items', 1, 'a')" if yield_list else "('a',)" + assert val_ex.data_item == {"a": "not_int", "b": "x"} + assert val_ex.contract_entity == "data_type" + + # fail in pipeline + @dlt.resource(columns=SimpleModel) + def some_data_extra() -> t.Iterator[TDataItems]: + # yield item that fails schema validation + items = [{"a": 1, "b": "z", "c": 1.3}, {"a": "not_int", "b": "x"}] + if yield_list: + yield items + else: + yield from items + + pipeline = dlt.pipeline() + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.extract(some_data_extra()) + assert isinstance(py_ex.value.__cause__, ResourceExtractionError) + assert isinstance(py_ex.value.__cause__.__cause__, DataValidationError) + val_ex = py_ex.value.__cause__.__cause__ + assert val_ex.table_name == "some_data_extra" + assert val_ex.contract_entity == "data_type" # extra field is the cause + assert val_ex.data_item == {"a": "not_int", "b": "x"} + + +@pytest.mark.parametrize("yield_list", [True, False]) +def test_validation_with_contracts(yield_list: bool) -> None: + + def some_data() -> t.Iterator[TDataItems]: + # yield item that fails schema validation + items = [{"a": 1, "b": "z"}, {"a": "not_int", "b": "x"}, {"c": "not_int"}] + if yield_list: + yield items + else: + yield from items + + # let it evolve + r: DltResource = dlt.resource(some_data(), schema_contract="evolve", columns=SimpleModel) + validator: PydanticValidator[SimpleModel] = r.validator # type: ignore[assignment] + assert validator.column_mode == "evolve" + assert validator.data_mode == "evolve" + assert validator.model.__name__.endswith("AnyExtraAllow") + items = list(r) + assert len(items) == 3 + # fully valid + assert items[0].a == 1 + assert items[0].b == "z" + # data type not valid + assert items[1].a == "not_int" + assert items[1].b == "x" + # extra attr and data invalid + assert items[2].a is None + assert items[2].b is None + assert items[2].c == "not_int" + + # let it drop + r = dlt.resource(some_data(), schema_contract="discard_row", columns=SimpleModel) + validator = r.validator # type: ignore[assignment] + assert validator.column_mode == "discard_row" + assert validator.data_mode == "discard_row" + assert validator.model.__name__.endswith("ExtraForbid") + items = list(r) + assert len(items) == 1 + assert items[0].a == 1 + assert items[0].b == "z" + + # filter just offending values + with pytest.raises(NotImplementedError): + # pydantic data_type cannot be discard_value + dlt.resource(some_data(), schema_contract="discard_value", columns=SimpleModel) + r = dlt.resource(some_data(), schema_contract={"columns": "discard_value", "data_type": "evolve"}, columns=SimpleModel) + validator = r.validator # type: ignore[assignment] + assert validator.column_mode == "discard_value" + assert validator.data_mode == "evolve" + # ignore is the default so no Extra in name + assert validator.model.__name__.endswith("Any") + items = list(r) + assert len(items) == 3 + # c is gone from the last model + assert not hasattr(items[2], "c") diff --git a/tests/extract/utils.py b/tests/extract/utils.py index b109cdbdd9..006816b5cd 100644 --- a/tests/extract/utils.py +++ b/tests/extract/utils.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, List, Literal, get_args +from typing import Any, Optional, List import pytest from itertools import zip_longest @@ -7,13 +7,7 @@ from dlt.extract.extract import ExtractorStorage from dlt.extract.typing import ItemTransform -import pandas as pd -from dlt.common.libs.pyarrow import pyarrow as pa - - -TItemFormat = Literal["json", "pandas", "arrow"] - -ALL_ITEM_FORMATS = get_args(TItemFormat) +from tests.utils import TDataItemFormat def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_name: str, content: str) -> None: @@ -35,7 +29,7 @@ def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_nam class AssertItems(ItemTransform[TDataItem]): - def __init__(self, expected_items: Any, item_type: TItemFormat = "json") -> None: + def __init__(self, expected_items: Any, item_type: TDataItemFormat = "json") -> None: self.expected_items = expected_items self.item_type = item_type @@ -44,22 +38,8 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: return item -def data_to_item_format(item_format: TItemFormat, data: List[TDataItem]): - """Return the given data in the form of pandas, arrow table or json items""" - if item_format == "json": - return data - # Make dataframe from the data - df = pd.DataFrame(data) - if item_format == "pandas": - return [df] - elif item_format == "arrow": - return [pa.Table.from_pandas(df)] - else: - raise ValueError(f"Unknown item format: {item_format}") - - -def data_item_to_list(from_type: TItemFormat, values: List[TDataItem]): - if from_type == "arrow": +def data_item_to_list(from_type: TDataItemFormat, values: List[TDataItem]): + if from_type in ["arrow", "arrow-batch"]: return values[0].to_pylist() elif from_type == "pandas": return values[0].to_dict("records") diff --git a/tests/libs/__init__.py b/tests/libs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/libs/test_buffered_writer_arrow,py b/tests/libs/test_buffered_writer_arrow,py new file mode 100644 index 0000000000..f0f0968942 --- /dev/null +++ b/tests/libs/test_buffered_writer_arrow,py @@ -0,0 +1,50 @@ +import pytest + +from dlt.common.destination import TLoaderFileFormat +from dlt.common.schema.utils import new_column + +from tests.common.data_writers.utils import get_writer, ALL_WRITERS + + +@pytest.mark.parametrize("writer_format", ALL_WRITERS - {"arrow"}) +def test_writer_items_count(writer_format: TLoaderFileFormat) -> None: + c1 = {"col1": new_column("col1", "bigint")} + with get_writer(_format=writer_format) as writer: + assert writer._buffered_items_count == 0 + # single item + writer.write_data_item({"col1": 1}, columns=c1) + assert writer._buffered_items_count == 1 + # list + writer.write_data_item([{"col1": 1}, {"col1": 2}], columns=c1) + assert writer._buffered_items_count == 3 + writer._flush_items() + assert writer._buffered_items_count == 0 + assert writer._writer.items_count == 3 + + +def test_writer_items_count_arrow() -> None: + import pyarrow as pa + c1 = {"col1": new_column("col1", "bigint")} + with get_writer(_format="arrow") as writer: + assert writer._buffered_items_count == 0 + # single item + writer.write_data_item(pa.Table.from_pylist([{"col1": 1}]), columns=c1) + assert writer._buffered_items_count == 1 + # single item with many rows + writer.write_data_item(pa.Table.from_pylist([{"col1": 1}, {"col1": 2}]), columns=c1) + assert writer._buffered_items_count == 3 + # empty list + writer.write_data_item([], columns=c1) + assert writer._buffered_items_count == 3 + # list with one item + writer.write_data_item([pa.Table.from_pylist([{"col1": 1}])], columns=c1) + assert writer._buffered_items_count == 4 + # list with many items + writer.write_data_item( + [pa.Table.from_pylist([{"col1": 1}]), pa.Table.from_pylist([{"col1": 1}, {"col1": 2}])], + columns=c1 + ) + assert writer._buffered_items_count == 7 + writer._flush_items() + assert writer._buffered_items_count == 0 + assert writer._writer.items_count == 7 diff --git a/tests/common/data_writers/test_parquet_writer.py b/tests/libs/test_parquet_writer.py similarity index 100% rename from tests/common/data_writers/test_parquet_writer.py rename to tests/libs/test_parquet_writer.py diff --git a/tests/common/test_pyarrow.py b/tests/libs/test_pyarrow.py similarity index 100% rename from tests/common/test_pyarrow.py rename to tests/libs/test_pyarrow.py diff --git a/tests/libs/test_pydantic.py b/tests/libs/test_pydantic.py new file mode 100644 index 0000000000..5606bd25b2 --- /dev/null +++ b/tests/libs/test_pydantic.py @@ -0,0 +1,391 @@ +from copy import copy +import pytest +from typing import ClassVar, Sequence, Mapping, Dict, MutableMapping, MutableSequence, Union, Optional, List, Dict, Any +from enum import Enum + +from datetime import datetime, date, time # noqa: I251 +from dlt.common import Decimal +from dlt.common import json + +from dlt.common.libs.pydantic import DltConfig, pydantic_to_table_schema_columns, apply_schema_contract_to_model, validate_item, validate_items, create_list_model +from pydantic import BaseModel, Json, AnyHttpUrl, ConfigDict, ValidationError + +from dlt.common.schema.exceptions import DataValidationError + + +class StrEnum(str, Enum): + a = "a_value" + b = "b_value" + c = "c_value" + + +class IntEnum(int, Enum): + a = 0 + b = 1 + c = 2 + + +class MixedEnum(Enum): + a_int = 0 + b_str = "b_value" + c_int = 2 + + +class NestedModel(BaseModel): + nested_field: str + + +class Model(BaseModel): + bigint_field: int + text_field: str + timestamp_field: datetime + date_field: date + decimal_field: Decimal + double_field: float + time_field: time + + nested_field: NestedModel + list_field: List[str] + + union_field: Union[int, str] + + optional_field: Optional[float] + + blank_dict_field: dict # type: ignore[type-arg] + parametrized_dict_field: Dict[str, int] + + str_enum_field: StrEnum + int_enum_field: IntEnum + # Both of these shouold coerce to str + mixed_enum_int_field: MixedEnum + mixed_enum_str_field: MixedEnum + + json_field: Json[List[str]] + + url_field: AnyHttpUrl + + any_field: Any + json_any_field: Json[Any] + + +class ModelWithConfig(Model): + model_config = ConfigDict(frozen=True, extra="allow") + + +TEST_MODEL_INSTANCE = Model( + bigint_field=1, text_field="text", timestamp_field=datetime.now(), + date_field=date.today(), decimal_field=Decimal(1.1), double_field=1.1, + time_field=time(1, 2, 3, 12345), + nested_field=NestedModel(nested_field="nested"), + list_field=["a", "b", "c"], + union_field=1, + optional_field=None, + blank_dict_field={}, + parametrized_dict_field={"a": 1, "b": 2, "c": 3}, + str_enum_field=StrEnum.a, + int_enum_field=IntEnum.a, + mixed_enum_int_field=MixedEnum.a_int, + mixed_enum_str_field=MixedEnum.b_str, + json_field=json.dumps(["a", "b", "c"]), # type: ignore[arg-type] + url_field="https://example.com", # type: ignore[arg-type] + any_field="any_string", + json_any_field=json.dumps("any_string"), +) + + +@pytest.mark.parametrize('instance', [True, False]) +def test_pydantic_model_to_columns(instance: bool) -> None: + if instance: + model = TEST_MODEL_INSTANCE + else: + model = Model # type: ignore[assignment] + + result = pydantic_to_table_schema_columns(model) + + assert result["bigint_field"]["data_type"] == "bigint" + assert result["text_field"]["data_type"] == "text" + assert result["timestamp_field"]["data_type"] == "timestamp" + assert result["date_field"]["data_type"] == "date" + assert result["decimal_field"]["data_type"] == "decimal" + assert result["double_field"]["data_type"] == "double" + assert result["time_field"]["data_type"] == "time" + assert result["nested_field"]["data_type"] == "complex" + assert result['list_field']['data_type'] == 'complex' + assert result['union_field']['data_type'] == 'bigint' + assert result['optional_field']['data_type'] == 'double' + assert result['optional_field']['nullable'] is True + assert result['blank_dict_field']['data_type'] == 'complex' + assert result['parametrized_dict_field']['data_type'] == 'complex' + assert result['str_enum_field']['data_type'] == 'text' + assert result['int_enum_field']['data_type'] == 'bigint' + assert result['mixed_enum_int_field']['data_type'] == 'text' + assert result['mixed_enum_str_field']['data_type'] == 'text' + assert result['json_field']['data_type'] == 'complex' + assert result['url_field']['data_type'] == 'text' + + # Any type fields are excluded from schema + assert 'any_field' not in result + assert 'json_any_field' not in result + + +def test_pydantic_model_skip_complex_types() -> None: + class SkipNestedModel(Model): + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + result = pydantic_to_table_schema_columns(SkipNestedModel) + + assert result["bigint_field"]["data_type"] == "bigint" + assert "nested_field" not in result + assert "list_field" not in result + assert "blank_dict_field" not in result + assert "parametrized_dict_field" not in result + assert "json_field" not in result + assert result["bigint_field"]["data_type"] == "bigint" + assert result["text_field"]["data_type"] == "text" + assert result["timestamp_field"]["data_type"] == "timestamp" + + +def test_model_for_column_mode() -> None: + # extra prop + instance_extra = TEST_MODEL_INSTANCE.dict() + instance_extra["extra_prop"] = "EXTRA" + # back to string + instance_extra["json_field"] = json.dumps(["a", "b", "c"]) + instance_extra["json_any_field"] = json.dumps("any_string") + + # evolve - allow extra fields + model_evolve = apply_schema_contract_to_model(ModelWithConfig, "evolve") + # assert "frozen" in model_evolve.model_config + extra_instance = model_evolve.parse_obj(instance_extra) + assert hasattr(extra_instance, "extra_prop") + assert extra_instance.extra_prop == "EXTRA" + model_evolve = apply_schema_contract_to_model(Model, "evolve") # type: ignore[arg-type] + extra_instance = model_evolve.parse_obj(instance_extra) + assert extra_instance.extra_prop == "EXTRA" # type: ignore[attr-defined] + + # freeze - validation error on extra fields + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "freeze") + # assert "frozen" in model_freeze.model_config + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + model_freeze = apply_schema_contract_to_model(Model, "freeze") # type: ignore[arg-type] + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + + # discard row - same as freeze + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "discard_row") + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + + # discard value - ignore extra fields + model_discard = apply_schema_contract_to_model(ModelWithConfig, "discard_value") + extra_instance = model_discard.parse_obj(instance_extra) + assert not hasattr(extra_instance, "extra_prop") + model_evolve = apply_schema_contract_to_model(Model, "evolve") # type: ignore[arg-type] + extra_instance = model_discard.parse_obj(instance_extra) + assert not hasattr(extra_instance, "extra_prop") + + # evolve data but freeze new columns + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "evolve", "freeze") + instance_extra_2 = copy(instance_extra) + # should parse ok + model_discard.parse_obj(instance_extra_2) + # this must fail validation + instance_extra_2["bigint_field"] = "NOT INT" + with pytest.raises(ValidationError): + model_discard.parse_obj(instance_extra_2) + # let the datatypes evolve + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "evolve", "evolve") + print(model_freeze.parse_obj(instance_extra_2).dict()) + + with pytest.raises(NotImplementedError): + apply_schema_contract_to_model(ModelWithConfig, "evolve", "discard_value") + + +def test_nested_model_config_propagation() -> None: + class UserLabel(BaseModel): + label: str + + class UserAddress(BaseModel): + street: str + zip_code: Sequence[int] + label: Optional[UserLabel] + ro_labels: Mapping[str, UserLabel] + wr_labels: MutableMapping[str, List[UserLabel]] + ro_list: Sequence[UserLabel] + wr_list: MutableSequence[Dict[str, UserLabel]] + + class User(BaseModel): + user_id: int + name: str + created_at: datetime + labels: List[str] + user_label: UserLabel + user_labels: List[UserLabel] + address: UserAddress + unity: Union[UserAddress, UserLabel, Dict[str, UserAddress]] + + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + model_freeze = apply_schema_contract_to_model(User, "evolve", "freeze") + from typing import get_type_hints + print(get_type_hints(model_freeze)) + print(get_type_hints(model_freeze.model_fields["address"].annotation)) + + + +def test_item_list_validation() -> None: + + class ItemModel(BaseModel): + b: bool + opt: Optional[int] = None + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": False} + + # non validating items removed from the list (both extra and declared) + discard_model = apply_schema_contract_to_model(ItemModel, "discard_row", "discard_row") + discard_list_model = create_list_model(discard_model) + # violate data type + items = validate_items( + "items", + discard_list_model, + [{"b": True}, {"b": 2, "opt": "not int", "extra": 1.2}, {"b": 3}, {"b": False}], + "discard_row", "discard_row" + ) + # {"b": 2, "opt": "not int", "extra": 1.2} - note that this will generate 3 errors for the same item + # and is crucial in our tests when discarding rows + assert len(items) == 2 + assert items[0].b is True + assert items[1].b is False + # violate extra field + items = validate_items("items", discard_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "discard_row") + assert len(items) == 1 + assert items[0].b is True + + # freeze on non validating items (both extra and declared) + freeze_model = apply_schema_contract_to_model(ItemModel, "freeze", "freeze") + freeze_list_model = create_list_model(freeze_model) + # violate data type + with pytest.raises(DataValidationError) as val_ex: + validate_items("items", freeze_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(("items", 1 , 'b')) # pydantic location + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_list_model + assert val_ex.value.data_item == {"b": 2} + # extra type + with pytest.raises(DataValidationError) as val_ex: + validate_items("items", freeze_list_model, [{"b": True}, {"a": 2, "b": False}, {"b": 3}, {"b": False}], "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(("items", 1 , 'a')) # pydantic location + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_list_model + assert val_ex.value.data_item == {"a": 2, "b": False} + + # discard values + discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze") + discard_list_model = create_list_model(discard_value_model) + # violate extra field + items = validate_items("items", discard_list_model, [{"b": True}, {"b": False, "a": False}], "discard_value", "freeze") + assert len(items) == 2 + # "a" extra got remove + assert items[1].dict() == {"b": False, "opt": None} + # violate data type + with pytest.raises(NotImplementedError): + apply_schema_contract_to_model(ItemModel, "discard_value", "discard_value") + + # evolve data types and extras + evolve_model = apply_schema_contract_to_model(ItemModel, "evolve", "evolve") + evolve_list_model = create_list_model(evolve_model) + # for data types a lenient model will be created that accepts any type + items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "evolve", "evolve") + assert len(items) == 4 + assert items[0].b is True + assert items[1].b == 2 + # extra fields allowed + items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "evolve", "evolve") + assert len(items) == 4 + assert items[3].b is False + assert items[3].a is False # type: ignore[attr-defined] + + # accept new types but discard new columns + mixed_model = apply_schema_contract_to_model(ItemModel, "discard_row", "evolve") + mixed_list_model = create_list_model(mixed_model) + # for data types a lenient model will be created that accepts any type + items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "discard_row", "evolve") + assert len(items) == 4 + assert items[0].b is True + assert items[1].b == 2 + # extra fields forbidden - full rows discarded + items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "evolve") + assert len(items) == 3 + + +def test_item_validation() -> None: + + class ItemModel(BaseModel): + b: bool + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": False} + + + + # non validating items removed from the list (both extra and declared) + discard_model = apply_schema_contract_to_model(ItemModel, "discard_row", "discard_row") + # violate data type + assert validate_item("items", discard_model, {"b": 2}, "discard_row", "discard_row") is None + # violate extra field + assert validate_item("items", discard_model, {"b": False, "a": False}, "discard_row", "discard_row") is None + + # freeze on non validating items (both extra and declared) + freeze_model = apply_schema_contract_to_model(ItemModel, "freeze", "freeze") + # violate data type + with pytest.raises(DataValidationError) as val_ex: + validate_item("items", freeze_model, {"b": 2}, "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(('b',)) # pydantic location + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_model + assert val_ex.value.data_item == {"b": 2} + # extra type + with pytest.raises(DataValidationError) as val_ex: + validate_item("items", freeze_model, {"a": 2, "b": False}, "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(('a',)) # pydantic location + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_model + assert val_ex.value.data_item == {"a": 2, "b": False} + + # discard values + discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze") + # violate extra field + item = validate_item("items", discard_value_model, {"b": False, "a": False}, "discard_value", "freeze") + # "a" extra got removed + assert item.dict() == {"b": False} + + # evolve data types and extras + evolve_model = apply_schema_contract_to_model(ItemModel, "evolve", "evolve") + # for data types a lenient model will be created that accepts any type + item = validate_item("items", evolve_model, {"b": 2}, "evolve", "evolve") + assert item.b == 2 + # extra fields allowed + item = validate_item("items", evolve_model, {"b": False, "a": False}, "evolve", "evolve") + assert item.b is False + assert item.a is False # type: ignore[attr-defined] + + # accept new types but discard new columns + mixed_model = apply_schema_contract_to_model(ItemModel, "discard_row", "evolve") + # for data types a lenient model will be created that accepts any type + item = validate_item("items", mixed_model, {"b": 3}, "discard_row", "evolve") + assert item.b == 3 + # extra fields forbidden - full rows discarded + assert validate_item("items", mixed_model, {"b": False, "a": False}, "discard_row", "evolve") is None diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index bd709e764d..9a72536329 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -138,4 +138,4 @@ def some_data(): result_tbl = pa.parquet.read_table(f) # Parquet schema is written with normalized column names - assert result_tbl.column_names == expected_column_names + assert result_tbl.schema.names == expected_column_names diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index 2a20db62b4..4354460374 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -6,7 +6,7 @@ import pytest import dlt -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.common.utils import uniq_id from dlt.pipeline import helpers, state_sync, Pipeline from dlt.load import Load diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index fbc5088ab2..4e8d1f9049 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -13,7 +13,7 @@ from dlt.common.pipeline import StateInjectableContext from dlt.common.typing import AnyFun, StrAny from dlt.common.utils import digest128 -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.sources.helpers.transform import skip_first, take_first from tests.pipeline.utils import assert_load_info diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 2fc4aad1a8..004aac0285 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -14,7 +14,7 @@ from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceNameMissing -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.pipeline.exceptions import CannotRestorePipelineException, PipelineConfigMissing, PipelineStepFailed from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.exceptions import DestinationHasFailedJobs diff --git a/tests/load/pipeline/test_restore_state.py b/tests/load/pipeline/test_restore_state.py index f80dbbd7e6..c9c6c4c437 100644 --- a/tests/load/pipeline/test_restore_state.py +++ b/tests/load/pipeline/test_restore_state.py @@ -18,7 +18,7 @@ from tests.utils import TEST_STORAGE_ROOT from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED -from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V6, yml_case_path as common_yml_case_path +from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7, yml_case_path as common_yml_case_path from tests.common.configuration.utils import environment from tests.load.pipeline.utils import assert_query_data, drop_active_pipeline_data from tests.load.utils import destinations_configs, DestinationTestConfiguration, get_normalized_dataset_name @@ -404,7 +404,7 @@ def test_restore_schemas_while_import_schemas_exist(destination_config: Destinat assert normalized_annotations in schema.tables # check if attached to import schema - assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V6 + assert schema._imported_version_hash == IMPORTED_VERSION_HASH_ETH_V7 # extract some data with restored pipeline p.run(["C", "D", "E"], table_name="blacklist") assert normalized_labels in schema.tables diff --git a/tests/load/pipeline/test_write_disposition_changes.py b/tests/load/pipeline/test_write_disposition_changes.py index 158993b7c8..c88fd79588 100644 --- a/tests/load/pipeline/test_write_disposition_changes.py +++ b/tests/load/pipeline/test_write_disposition_changes.py @@ -77,7 +77,7 @@ def source(): if with_root_key: assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["root"] == {'_dlt_id': '_dlt_root_id'} else: - assert "propagation" not in pipeline.default_schema._normalizers_config["json"]["config"] + assert "propagation" not in pipeline.default_schema._normalizers_config["json"].get("config", {}) # without a root key this will fail, it is expected if not with_root_key and destination_config.supports_merge: diff --git a/tests/load/pipeline/utils.py b/tests/load/pipeline/utils.py index 113585f669..94fbc80cf8 100644 --- a/tests/load/pipeline/utils.py +++ b/tests/load/pipeline/utils.py @@ -1,23 +1,20 @@ -import posixpath, os -from typing import Any, Iterator, List, Sequence, TYPE_CHECKING, Optional, Tuple, Dict, Callable +from typing import Any, Iterator, List, Sequence, TYPE_CHECKING, Callable import pytest import dlt from dlt.common.destination.reference import WithStagingDataset -from dlt.pipeline.pipeline import Pipeline -from dlt.common import json from dlt.common.configuration.container import Container from dlt.common.pipeline import LoadInfo, PipelineContext -from dlt.common.typing import DictStrAny -from dlt.pipeline.exceptions import SqlClientNotAvailable -from dlt.common.schema.typing import LOADS_TABLE_NAME +from tests.pipeline.utils import (load_table_counts, load_data_table_counts, assert_data_table_counts, load_file, + load_files, load_tables_to_dicts, load_table_distinct_counts) from tests.load.utils import DestinationTestConfiguration, destinations_configs if TYPE_CHECKING: from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + @pytest.fixture(autouse=True) def drop_pipeline(request) -> Iterator[None]: yield @@ -120,149 +117,3 @@ def assert_query_data(p: dlt.Pipeline, sql: str, table_data: List[Any], schema_n # the second is load id if info: assert row[1] in info.loads_ids - - -def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]: - """ - util function to load a filesystem destination file and return parsed content - values may not be cast to the right type, especially for insert_values, please - make sure to do conversions and casting if needed in your tests - """ - result: List[Dict[str, Any]] = [] - - # check if this is a file we want to read - file_name_items = file.split(".") - ext = file_name_items[-1] - if ext not in ["jsonl", "insert_values", "parquet"]: - return "skip", [] - - # table name will be last element of path - table_name = path.split("/")[-1] - - # skip loads table - if table_name == "_dlt_loads": - return table_name, [] - - full_path = posixpath.join(path, file) - - # load jsonl - if ext == "jsonl": - with open(full_path, "rU", encoding="utf-8") as f: - for line in f: - result.append(json.loads(line)) - - # load insert_values (this is a bit volatile if the exact format of the source file changes) - elif ext == "insert_values": - with open(full_path, "rU", encoding="utf-8") as f: - lines = f.readlines() - # extract col names - cols = lines[0][15:-2].split(",") - for line in lines[2:]: - values = line[1:-3].split(",") - result.append(dict(zip(cols, values))) - - # load parquet - elif ext == "parquet": - import pyarrow.parquet as pq - with open(full_path, "rb") as f: - table = pq.read_table(f) - cols = table.column_names - count = 0 - for column in table: - column_name = cols[count] - item_count = 0 - for item in column.to_pylist(): - if len(result) <= item_count: - result.append({column_name: item}) - else: - result[item_count][column_name] = item - item_count += 1 - count += 1 - - return table_name, result - - -def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" - client: FilesystemClient = p.destination_client() # type: ignore[assignment] - result: Dict[str, Any] = {} - for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): - for file in files: - table_name, items = load_file(basedir, file) - if table_name not in table_names: - continue - if table_name in result: - result[table_name] = result[table_name] + items - else: - result[table_name] = items - - # loads file is special case - if LOADS_TABLE_NAME in table_names and file.find(".{LOADS_TABLE_NAME}."): - result[LOADS_TABLE_NAME] = [] - - return result - - -def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: - """Returns row counts for `table_names` as dict""" - - # try sql, could be other destination though - try: - with p.sql_client() as c: - qualified_names = [c.make_qualified_table_name(name) for name in table_names] - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" for name, q_name in zip(table_names, qualified_names)]) - with c.execute_query(query) as cur: - rows = list(cur.fetchall()) - return {r[0]: r[1] for r in rows} - except SqlClientNotAvailable: - pass - - # try filesystem - file_tables = load_files(p, *table_names) - result = {} - for table_name, items in file_tables.items(): - result[table_name] = len(items) - return result - -def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: - tables = [table["name"] for table in p.default_schema.data_tables()] - return load_table_counts(p, *tables) - - -def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> None: - table_counts = load_data_table_counts(p) - assert table_counts == expected_counts, f"Table counts do not match, expected {expected_counts}, got {table_counts}" - - -def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - - # try sql, could be other destination though - try: - result = {} - for table_name in table_names: - table_rows = [] - columns = p.default_schema.get_table_columns(table_name).keys() - query_columns = ",".join(columns) - - with p.sql_client() as c: - f_q_table_name = c.make_qualified_table_name(table_name) - query = f"SELECT {query_columns} FROM {f_q_table_name}" - with c.execute_query(query) as cur: - for row in list(cur.fetchall()): - table_rows.append(dict(zip(columns, row))) - result[table_name] = table_rows - return result - - except SqlClientNotAvailable: - pass - - # try files - return load_files(p, *table_names) - -def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_names: str) -> DictStrAny: - """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" for name in table_names]) - with p.sql_client() as c: - with c.execute_query(query) as cur: - rows = list(cur.fetchall()) - return {r[0]: r[1] for r in rows} diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 35394ed1c6..e08919424a 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -337,7 +337,7 @@ def test_preserve_column_order(client: SqlJobClientBase) -> None: import random columns = deepcopy(TABLE_UPDATE) random.shuffle(columns) - print(columns) + schema.update_table(new_table(table_name, columns=columns)) schema.bump_version() diff --git a/tests/load/weaviate/test_naming.py b/tests/load/weaviate/test_naming.py index 25258a2479..850f70ee19 100644 --- a/tests/load/weaviate/test_naming.py +++ b/tests/load/weaviate/test_naming.py @@ -87,7 +87,7 @@ def test_reserved_property_names() -> None: # print(schema_2.name) # print(schema_2.naming) -# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v6") +# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v7") # eth_v6_schema = dlt.Schema.from_dict(eth_v6) # pipeline.extract(s, schema=eth_v6_schema) @@ -101,7 +101,7 @@ def test_reserved_property_names() -> None: # print(pipeline.dataset_name) # s = small() -# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v6") +# eth_v6 = load_yml_case("schemas/eth/ethereum_schema_v7") # eth_v6_schema = dlt.Schema.from_dict(eth_v6) # pipeline.extract(s, schema=eth_v6_schema) diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index 31d5d001df..686ad2ffd3 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -6,15 +6,17 @@ import os import io import pyarrow as pa -from typing import List import dlt +from dlt.common import json, Decimal from dlt.common.utils import uniq_id +from dlt.common.libs.pyarrow import NameNormalizationClash + from dlt.pipeline.exceptions import PipelineStepFailed + from tests.cases import arrow_table_all_data_types, TArrowFormat from tests.utils import preserve_environ -from dlt.common import json -from dlt.common import Decimal + @pytest.mark.parametrize( @@ -87,7 +89,6 @@ def some_data(): assert schema_columns['json']['data_type'] == 'complex' - @pytest.mark.parametrize( ("item_type", "is_list"), [("pandas", False), ("table", False), ("record_batch", False), ("pandas", True), ("table", True), ("record_batch", True)] ) @@ -181,6 +182,44 @@ def data_frames(): assert len(pipeline.get_load_package_info(load_id).jobs["new_jobs"]) == 10 +@pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) +def test_arrow_clashing_names(item_type: TArrowFormat) -> None: + # # use parquet for dummy + os.environ["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" + pipeline_name = "arrow_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy") + + item, _ = arrow_table_all_data_types(item_type, include_name_clash=True) + + @dlt.resource + def data_frames(): + for _ in range(10): + yield item + + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.extract(data_frames()) + assert isinstance(py_ex.value.__context__, NameNormalizationClash) + + +@pytest.mark.parametrize("item_type", ["table", "record_batch"]) +def test_load_arrow_vary_schema(item_type: TArrowFormat) -> None: + pipeline_name = "arrow_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb") + + item, _ = arrow_table_all_data_types(item_type, include_not_normalized_name=False) + pipeline.run(item, table_name="data").raise_on_failed_jobs() + + item, _ = arrow_table_all_data_types(item_type, include_not_normalized_name=False) + # remove int column + try: + item = item.drop("int") + except AttributeError: + names = item.schema.names + names.remove("int") + item = item.select(names) + pipeline.run(item, table_name="data").raise_on_failed_jobs() + + @pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" @@ -199,7 +238,7 @@ def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: assert info.row_counts["items"] == len(rows) -@pytest.mark.parametrize("item_type", ["table", "pandas", "record_batch"]) +@pytest.mark.parametrize("item_type", ["table"]) # , "pandas", "record_batch" def test_normalize_with_dlt_columns(item_type: TArrowFormat): item, records = arrow_table_all_data_types(item_type, num_rows=5432) os.environ['NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID'] = "True" @@ -212,10 +251,10 @@ def test_normalize_with_dlt_columns(item_type: TArrowFormat): def some_data(): yield item - pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="filesystem") + pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="duckdb") pipeline.extract(some_data()) - pipeline.normalize() + pipeline.normalize(loader_file_format="parquet") load_id = pipeline.list_normalized_load_packages()[0] storage = pipeline._get_load_storage() @@ -241,3 +280,26 @@ def some_data(): schema = pipeline.default_schema assert schema.tables['some_data']['columns']['_dlt_id']['data_type'] == 'text' assert schema.tables['some_data']['columns']['_dlt_load_id']['data_type'] == 'text' + + pipeline.load().raise_on_failed_jobs() + + # should be able to load again + pipeline.run(some_data()).raise_on_failed_jobs() + + # should be able to load arrow without a column + try: + item = item.drop("int") + except AttributeError: + names = item.schema.names + names.remove("int") + item = item.select(names) + pipeline.run(item, table_name="some_data").raise_on_failed_jobs() + + # should be able to load arrow with a new column + # TODO: uncomment when load_id fixed in normalizer + # item, records = arrow_table_all_data_types(item_type, num_rows=200) + # item = item.append_column("static_int", [[0] * 200]) + # pipeline.run(item, table_name="some_data").raise_on_failed_jobs() + + # schema = pipeline.default_schema + # assert schema.tables['some_data']['columns']['static_int']['data_type'] == 'bigint' diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 2f383c1c0a..7ac7dcbb34 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -55,7 +55,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: print(venv.run_script("../tests/pipeline/cases/github_pipeline/github_pipeline.py")) # hash hash in schema github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) - assert github_schema["engine_version"] == 6 + assert github_schema["engine_version"] == 7 assert "schema_version_hash" in github_schema["tables"][LOADS_TABLE_NAME]["columns"] with DuckDbSqlClient(GITHUB_DATASET, duckdb_cfg.credentials) as client: rows = client.execute_sql(f"SELECT * FROM {LOADS_TABLE_NAME} ORDER BY inserted_at") @@ -81,7 +81,7 @@ def test_pipeline_with_dlt_update(test_storage: FileStorage) -> None: pipeline.sync_destination() # print(pipeline.working_dir) # we have updated schema - assert pipeline.default_schema.ENGINE_VERSION == 6 + assert pipeline.default_schema.ENGINE_VERSION == 7 # make sure that schema hash retrieved from the destination is exactly the same as the schema hash that was in storage before the schema was wiped assert pipeline.default_schema.stored_version_hash == github_schema["version_hash"] @@ -114,6 +114,6 @@ def test_load_package_with_dlt_update(test_storage: FileStorage) -> None: github_schema = json.loads(test_storage.load(f".dlt/pipelines/{GITHUB_PIPELINE_NAME}/schemas/github.schema.json")) pipeline = pipeline.drop() pipeline.sync_destination() - assert pipeline.default_schema.ENGINE_VERSION == 6 + assert pipeline.default_schema.ENGINE_VERSION == 7 # schema version does not match `dlt.attach` does not update to the right schema by itself assert pipeline.default_schema.stored_version_hash != github_schema["version_hash"] diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 3fcb38d915..309511b95f 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,24 +1,22 @@ import itertools import logging import os -import random -from typing import Any, Optional, Iterator, Dict, Any, cast +from typing import Any, Any, cast from tenacity import retry_if_exception, Retrying, stop_after_attempt -from pydantic import BaseModel import pytest import dlt -from dlt.common import json, sleep, pendulum +from dlt.common import json, pendulum from dlt.common.configuration.container import Container from dlt.common.configuration.specs.aws_credentials import AwsCredentials from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.destination.reference import WithStateSync from dlt.common.exceptions import DestinationHasFailedJobs, DestinationTerminalException, PipelineStateNotAvailable, UnknownDestinationModule from dlt.common.pipeline import PipelineContext -from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector +from dlt.common.runtime.collector import LogCollector from dlt.common.schema.utils import new_column, new_table from dlt.common.utils import uniq_id from dlt.common.schema import Schema @@ -26,18 +24,16 @@ from dlt.destinations import filesystem, redshift, dummy from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractorStorage -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.load.exceptions import LoadClientJobFailed from dlt.pipeline.exceptions import InvalidPipelineName, PipelineNotActive, PipelineStepFailed from dlt.pipeline.helpers import retry_load -from dlt.pipeline import TCollectorArg from tests.common.utils import TEST_SENTRY_DSN -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -from tests.utils import TEST_STORAGE_ROOT from tests.common.configuration.utils import environment +from tests.utils import TEST_STORAGE_ROOT from tests.extract.utils import expect_extracted_file -from tests.pipeline.utils import assert_load_info, airtable_emojis +from tests.pipeline.utils import assert_load_info, airtable_emojis, many_delayed def test_default_pipeline() -> None: @@ -190,22 +186,6 @@ def test_deterministic_salt(environment) -> None: assert p.pipeline_salt != p3.pipeline_salt -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: - # create pipelines, extract and normalize. that should be possible without installing any dependencies - p = dlt.pipeline(pipeline_name=destination_config.destination + "_pipeline", destination=destination_config.destination, staging=destination_config.staging) - # are capabilities injected - caps = p._container[DestinationCapabilitiesContext] - print(caps.naming_convention) - # are right naming conventions created - assert p._default_naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) - p.extract([1, "2", 3], table_name="data") - # is default schema with right naming convention - assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) - p.normalize() - assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) - - def test_destination_explicit_credentials(environment: Any) -> None: # test redshift p = dlt.pipeline(pipeline_name="postgres_pipeline", destination="redshift", credentials="redshift://loader:loader@localhost:5432/dlt_data") @@ -349,7 +329,8 @@ def i_fail(): s4 = DltSource("default_4", "module", dlt.Schema("default_4"), [dlt.resource([6, 7, 8], name="resource_3"), i_fail]) with pytest.raises(PipelineStepFailed): - p.extract([s3, s4]) + # NOTE: if you swap s3 and s4 the test on list_schemas will fail: s3 will extract normally and update live schemas, s4 will break exec later + p.extract([s4, s3]) # nothing to normalize assert len(storage.list_files_to_normalize_sorted()) == 0 @@ -496,13 +477,16 @@ def data_piece_2(): # first run didn't really happen assert p.first_run is True assert p.has_data is False - assert p._schema_storage.list_schemas() == [] assert p.default_schema_name is None + # one of the schemas is in memory + # TODO: we may want to fix that + assert len(p._schema_storage.list_schemas()) == 1 # restore the pipeline p = dlt.attach(pipeline_name) assert p.first_run is True assert p.has_data is False + # no schema was saved to storage, the one above was only in memory assert p._schema_storage.list_schemas() == [] assert p.default_schema_name is None @@ -530,12 +514,14 @@ def data_schema_3(): # first run didn't really happen assert p.first_run is True assert p.has_data is False - assert p._schema_storage.list_schemas() == [] + # schemas from two sources are in memory + # TODO: we may want to fix that + assert len(p._schema_storage.list_schemas()) == 2 assert p.default_schema_name is None os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p.run([data_schema_1(), data_schema_2()], write_disposition="replace") - assert p.schema_names == p._schema_storage.list_schemas() + assert set(p.schema_names) == set(p._schema_storage.list_schemas()) def test_run_with_table_name_exceeding_path_length() -> None: @@ -722,6 +708,8 @@ def resource_1(): assert p.default_schema.get_table("resource_1")["write_disposition"] == "append" p.run(resource_1, write_disposition="replace") + print(list(p._schema_storage.live_schemas.values())[0].to_pretty_yaml()) + assert p.schemas[p.default_schema_name].get_table("resource_1")["write_disposition"] == "replace" assert p.default_schema.get_table("resource_1")["write_disposition"] == "replace" @@ -826,52 +814,6 @@ def reverse_order(item): assert list(p.default_schema.tables["order_2"]["columns"].keys()) == ["col_3", "col_2", "col_1", '_dlt_load_id', '_dlt_id'] -def run_deferred(iters): - - @dlt.defer - def item(n): - sleep(random.random() / 2) - return n - - for n in range(iters): - yield item(n) - - -@dlt.source -def many_delayed(many, iters): - for n in range(many): - yield dlt.resource(run_deferred(iters), name="resource_" + str(n)) - - -@pytest.mark.parametrize("progress", ["tqdm", "enlighten", "log", "alive_progress"]) -def test_pipeline_progress(progress: TCollectorArg) -> None: - - os.environ["TIMEOUT"] = "3.0" - - p = dlt.pipeline(destination="dummy", progress=progress) - p.extract(many_delayed(5, 10)) - p.normalize() - - collector = p.collector - - # attach pipeline - p = dlt.attach(progress=collector) - p.extract(many_delayed(5, 10)) - p.run(dataset_name="dummy") - - assert collector == p.drop().collector - - # make sure a valid logger was used - if progress == "tqdm": - assert isinstance(collector, TqdmCollector) - if progress == "enlighten": - assert isinstance(collector, EnlightenCollector) - if progress == "alive_progress": - assert isinstance(collector, AliveCollector) - if progress == "log": - assert isinstance(collector, LogCollector) - - def test_pipeline_log_progress() -> None: os.environ["TIMEOUT"] = "3.0" @@ -1103,50 +1045,6 @@ def res_return_yield(): assert "dlt.resource" in str(pip_ex.value) -@pytest.mark.parametrize('method', ('extract', 'run')) -def test_column_argument_pydantic(method: str) -> None: - """Test columns schema is created from pydantic model""" - p = dlt.pipeline(destination='duckdb') - - @dlt.resource - def some_data() -> Iterator[Dict[str, Any]]: - yield {} - - class Columns(BaseModel): - a: Optional[int] - b: Optional[str] - - if method == 'run': - p.run(some_data(), columns=Columns) - else: - p.extract(some_data(), columns=Columns) - - assert p.default_schema.tables['some_data']['columns']['a']['data_type'] == 'bigint' - assert p.default_schema.tables['some_data']['columns']['a']['nullable'] is True - assert p.default_schema.tables['some_data']['columns']['b']['data_type'] == 'text' - assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True - - -def test_extract_pydantic_models() -> None: - pipeline = dlt.pipeline(destination='duckdb') - - class User(BaseModel): - user_id: int - name: str - - @dlt.resource - def users() -> Iterator[User]: - yield User(user_id=1, name="a") - yield User(user_id=2, name="b") - - pipeline.extract(users()) - - storage = ExtractorStorage(pipeline._normalize_storage_config) - expect_extracted_file( - storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) - ) - - def test_resource_rename_same_table(): @dlt.resource(write_disposition="replace") def generic(start): @@ -1185,17 +1083,6 @@ def generic(start): assert pipeline.default_schema.get_table("single_table")["resource"] == "state1" -@pytest.mark.parametrize("file_format", ("parquet", "insert_values", "jsonl")) -def test_columns_hint_with_file_formats(file_format: TLoaderFileFormat) -> None: - - @dlt.resource(write_disposition="replace", columns=[{"name": "text", "data_type": "text"}]) - def generic(start=8): - yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] - - pipeline = dlt.pipeline(destination='duckdb') - pipeline.run(generic(), loader_file_format=file_format) - - def test_remove_autodetect() -> None: now = pendulum.now() @@ -1273,6 +1160,23 @@ def test_empty_rows_are_included() -> None: assert values == [1, None, None, None, None, None, None, None] +def test_resource_state_name_not_normalized() -> None: + pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb") + peacock_s = airtable_emojis().with_resources("🦚Peacock") + pipeline.extract(peacock_s) + assert peacock_s.resources["🦚Peacock"].state == {"🦚🦚🦚": "🦚"} + pipeline.normalize() + pipeline.load() + + # get state from destination + from dlt.pipeline.state_sync import load_state_from_destination + client: WithStateSync + with pipeline.destination_client() as client: # type: ignore[assignment] + state = load_state_from_destination(pipeline.pipeline_name, client) + assert "airtable_emojis" in state["sources"] + assert state["sources"]["airtable_emojis"]["resources"] == {"🦚Peacock": {"🦚🦚🦚": "🦚"}} + + def test_remove_pending_packages() -> None: pipeline = dlt.pipeline(pipeline_name="emojis", destination="dummy") pipeline.extract(airtable_emojis()) diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py new file mode 100644 index 0000000000..d29bac13f2 --- /dev/null +++ b/tests/pipeline/test_pipeline_extra.py @@ -0,0 +1,176 @@ +import os +from typing import Any, ClassVar, Dict, Iterator, List, Optional +import pytest +from pydantic import BaseModel + +import dlt +from dlt.common import json, pendulum +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.libs.pydantic import DltConfig +from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector +from dlt.extract.storage import ExtractorStorage +from dlt.extract.validation import PydanticValidator + +from dlt.pipeline import TCollectorArg + +from tests.extract.utils import expect_extracted_file +from tests.load.utils import DestinationTestConfiguration, destinations_configs +from tests.pipeline.utils import assert_load_info, load_data_table_counts, many_delayed + + +@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: + # create pipelines, extract and normalize. that should be possible without installing any dependencies + p = dlt.pipeline(pipeline_name=destination_config.destination + "_pipeline", destination=destination_config.destination, staging=destination_config.staging) + # are capabilities injected + caps = p._container[DestinationCapabilitiesContext] + print(caps.naming_convention) + # are right naming conventions created + assert p._default_naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + p.extract([1, "2", 3], table_name="data") + # is default schema with right naming convention + assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + p.normalize() + assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + + +@pytest.mark.parametrize("progress", ["tqdm", "enlighten", "log", "alive_progress"]) +def test_pipeline_progress(progress: TCollectorArg) -> None: + + os.environ["TIMEOUT"] = "3.0" + + p = dlt.pipeline(destination="dummy", progress=progress) + p.extract(many_delayed(5, 10)) + p.normalize() + + collector = p.collector + + # attach pipeline + p = dlt.attach(progress=collector) + p.extract(many_delayed(5, 10)) + p.run(dataset_name="dummy") + + assert collector == p.drop().collector + + # make sure a valid logger was used + if progress == "tqdm": + assert isinstance(collector, TqdmCollector) + if progress == "enlighten": + assert isinstance(collector, EnlightenCollector) + if progress == "alive_progress": + assert isinstance(collector, AliveCollector) + if progress == "log": + assert isinstance(collector, LogCollector) + + +@pytest.mark.parametrize('method', ('extract', 'run')) +def test_column_argument_pydantic(method: str) -> None: + """Test columns schema is created from pydantic model""" + p = dlt.pipeline(destination='duckdb') + + @dlt.resource + def some_data() -> Iterator[Dict[str, Any]]: + yield {} + + class Columns(BaseModel): + a: Optional[int] = None + b: Optional[str] = None + + if method == 'run': + p.run(some_data(), columns=Columns) + else: + p.extract(some_data(), columns=Columns) + + assert p.default_schema.tables['some_data']['columns']['a']['data_type'] == 'bigint' + assert p.default_schema.tables['some_data']['columns']['a']['nullable'] is True + assert p.default_schema.tables['some_data']['columns']['b']['data_type'] == 'text' + assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True + + +@pytest.mark.parametrize("yield_list", [True, False]) +def test_pydantic_columns_with_contracts(yield_list: bool) -> None: + from datetime import datetime # noqa + + class UserLabel(BaseModel): + label: str + + class User(BaseModel): + user_id: int + name: str + created_at: datetime + labels: List[str] + user_label: UserLabel + user_labels: List[UserLabel] + + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + user = User( + user_id=1, + name="u1", + created_at=pendulum.now(), + labels=["l1", "l2"], + user_label=UserLabel(label="in_l1"), + user_labels=[UserLabel(label="l_l1"), UserLabel(label="l_l1")] + ) + + @dlt.resource(columns=User) + def users(users_list: List[Any]) -> Iterator[Any]: + if yield_list: + yield users_list + else: + yield from users_list + + pipeline = dlt.pipeline(destination='duckdb') + info = pipeline.run(users([user.dict(), user.dict()])) + assert_load_info(info) + print(pipeline.last_trace.last_normalize_info) + # data is passing validation, all filled in + assert load_data_table_counts(pipeline) == {"users": 2, "users__labels": 4, "users__user_labels": 4} + + # produce two users with extra attrs in the child model but set the rows to discard so nothing is loaded + u1 = user.dict() + u1["user_labels"][0]["extra_1"] = "extra" + u1["user_labels"][1]["extra_1"] = "extra" + u2 = user.dict() + u2["user_labels"][0]["is_extra"] = True + + r = users([u1, u2]) + r.apply_hints(schema_contract="discard_row") + validator: PydanticValidator[User] = r.validator # type: ignore[assignment] + assert validator.data_mode == "discard_row" + assert validator.column_mode == "discard_row" + pipeline.run(r) + assert load_data_table_counts(pipeline) == {"users": 2, "users__labels": 4, "users__user_labels": 4} + print(pipeline.last_trace.last_normalize_info) + + +def test_extract_pydantic_models() -> None: + pipeline = dlt.pipeline(destination='duckdb') + + class User(BaseModel): + user_id: int + name: str + + @dlt.resource + def users() -> Iterator[User]: + yield User(user_id=1, name="a") + yield User(user_id=2, name="b") + + pipeline.extract(users()) + + storage = ExtractorStorage(pipeline._normalize_storage_config) + expect_extracted_file( + storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) + ) + + +@pytest.mark.parametrize("file_format", ("parquet", "insert_values", "jsonl")) +def test_columns_hint_with_file_formats(file_format: TLoaderFileFormat) -> None: + + @dlt.resource(write_disposition="replace", columns=[{"name": "text", "data_type": "text"}]) + def generic(start=8): + yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] + + pipeline = dlt.pipeline(destination='duckdb') + pipeline.run(generic(), loader_file_format=file_format) diff --git a/tests/pipeline/test_pipeline_state.py b/tests/pipeline/test_pipeline_state.py index 0e8dea2145..019997ef6e 100644 --- a/tests/pipeline/test_pipeline_state.py +++ b/tests/pipeline/test_pipeline_state.py @@ -10,14 +10,13 @@ from dlt.common.storages import FileStorage from dlt.common import pipeline as state_module from dlt.common.utils import uniq_id -from dlt.destinations.job_client_impl import SqlJobClientBase from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException, PipelineStepFailed from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.state_sync import migrate_state, STATE_ENGINE_VERSION from tests.utils import test_storage -from tests.pipeline.utils import json_case_path, load_json_case, airtable_emojis +from tests.pipeline.utils import json_case_path, load_json_case @dlt.resource() @@ -427,20 +426,3 @@ def test_migrate_state(test_storage: FileStorage) -> None: p = dlt.attach(pipeline_name="debug_pipeline", pipelines_dir=test_storage.storage_path) assert p.dataset_name == "debug_pipeline_data" assert p.default_schema_name == "example_source" - - -def test_resource_state_name_not_normalized() -> None: - pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb") - peacock_s = airtable_emojis().with_resources("🦚Peacock") - pipeline.extract(peacock_s) - assert peacock_s.resources["🦚Peacock"].state == {"🦚🦚🦚": "🦚"} - pipeline.normalize() - pipeline.load() - - # get state from destination - from dlt.pipeline.state_sync import load_state_from_destination - client: SqlJobClientBase - with pipeline.destination_client() as client: # type: ignore[assignment] - state = load_state_from_destination(pipeline.pipeline_name, client) - assert "airtable_emojis" in state["sources"] - assert state["sources"]["airtable_emojis"]["resources"] == {"🦚Peacock": {"🦚🦚🦚": "🦚"}} diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py index 706644b60e..cd3e2444c8 100644 --- a/tests/pipeline/test_pipeline_trace.py +++ b/tests/pipeline/test_pipeline_trace.py @@ -22,7 +22,7 @@ from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.trace import PipelineTrace, SerializableResolvedValueTrace, describe_extract_data, load_trace from dlt.pipeline.track import slack_notify_load_success -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.extract.pipe import Pipe from tests.utils import start_test_telemetry diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py new file mode 100644 index 0000000000..93a5abf44c --- /dev/null +++ b/tests/pipeline/test_schema_contracts.py @@ -0,0 +1,601 @@ +import dlt, os, pytest +import contextlib +from typing import Any, Callable, Iterator, Union, Optional + +from dlt.common.schema.typing import TSchemaContract +from dlt.common.utils import uniq_id +from dlt.common.schema.exceptions import DataValidationError + +from dlt.extract import DltResource +from dlt.pipeline.pipeline import Pipeline +from dlt.pipeline.exceptions import PipelineStepFailed + +from tests.load.pipeline.utils import load_table_counts +from tests.utils import TDataItemFormat, skip_if_not_active, data_to_item_format, ALL_DATA_ITEM_FORMATS + +skip_if_not_active("duckdb") + +schema_contract = ["evolve", "discard_value", "discard_row", "freeze"] +LOCATIONS = ["source", "resource", "override"] +SCHEMA_ELEMENTS = ["tables", "columns", "data_type"] + + +@contextlib.contextmanager +def raises_frozen_exception(check_raise: bool = True) -> Any: + if not check_raise: + yield + return + with pytest.raises(PipelineStepFailed) as py_exc: + yield + assert isinstance(py_exc.value.__context__, DataValidationError) + + +def items(settings: TSchemaContract) -> Any: + + # NOTE: names must be normalizeds + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) + def load_items(): + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "SomeInt": 1, + "name": f"item {index}" + } + + return load_items + + +def items_with_variant(settings: TSchemaContract) -> Any: + + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) + def load_items(): + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "name": f"item {index}", + "SomeInt": "hello" + } + + return load_items + + +def items_with_new_column(settings: TSchemaContract) -> Any: + + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) + def load_items(): + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "name": f"item {index}", + "New^Col": "hello" + } + + return load_items + + +def items_with_subtable(settings: TSchemaContract) -> Any: + + @dlt.resource(name="Items", write_disposition="append", schema_contract=settings) + def load_items(): + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "name": f"item {index}", + "sub_items": [{ + "id": index + 1000, + "name": f"sub item {index + 1000}" + }] + } + + return load_items + +def new_items(settings: TSchemaContract) -> Any: + + @dlt.resource(name="new_items", write_disposition="append", schema_contract=settings) + def load_items(): + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "some_int": 1, + "name": f"item {index}" + } + + return load_items + +OLD_COLUMN_NAME = "name" +NEW_COLUMN_NAME = "new_col" +VARIANT_COLUMN_NAME = "some_int__v_text" +SUBITEMS_TABLE = "items__sub_items" +NEW_ITEMS_TABLE = "new_items" + + +def run_resource(pipeline: Pipeline, resource_fun: Callable[..., DltResource], settings: Any, item_format: TDataItemFormat = "json", duplicates: int = 1) -> None: + + for item in settings.keys(): + assert item in LOCATIONS + ev_settings = settings[item] + if ev_settings in schema_contract: + continue + for key, val in ev_settings.items(): + assert val in schema_contract + assert key in SCHEMA_ELEMENTS + + @dlt.source(name="freeze_tests", schema_contract=settings.get("source")) + def source() -> Iterator[DltResource]: + for idx in range(duplicates): + resource: DltResource = resource_fun(settings.get("resource")) + if item_format != "json": + resource._pipe.replace_gen(data_to_item_format(item_format, resource._pipe.gen())) # type: ignore + resource.table_name = resource.name + yield resource.with_name(resource.name + str(idx)) + + # run pipeline + pipeline.run(source(), schema_contract=settings.get("override")) + + # check global settings + assert pipeline.default_schema._settings.get("schema_contract", None) == (settings.get("override") or settings.get("source")) + + # check items table settings + # assert pipeline.default_schema.tables["items"].get("schema_contract", {}) == (settings.get("resource") or {}) + + # check effective table settings + # assert resolve_contract_settings_for_table(None, "items", pipeline.default_schema) == expand_schema_contract_settings(settings.get("resource") or settings.get("override") or "evolve") + +def get_pipeline(): + import duckdb + return dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) + + +@pytest.mark.parametrize("contract_setting", schema_contract) +@pytest.mark.parametrize("setting_location", LOCATIONS) +@pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS) +def test_new_tables(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None: + + pipeline = get_pipeline() + + full_settings = { + setting_location: { + "tables": contract_setting + }} + run_resource(pipeline, items, {}, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + run_resource(pipeline, items_with_new_column, full_settings, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # test adding new table + with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, new_items, full_settings, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) + # delete extracted files if left after exception + pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources()) + + # NOTE: arrow / pandas do not support variants and subtables so we must skip + if item_format == "json": + # run add variant column + run_resource(pipeline, items_with_variant, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # test adding new subtable + with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, items_with_subtable, full_settings) + + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 if contract_setting in ["freeze"] else 40 + assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if contract_setting in ["evolve"] else 0) + + +@pytest.mark.parametrize("contract_setting", schema_contract) +@pytest.mark.parametrize("setting_location", LOCATIONS) +@pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS) +def test_new_columns(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None: + + full_settings = { + setting_location: { + "columns": contract_setting + }} + + pipeline = get_pipeline() + run_resource(pipeline, items, {}, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # new should work + run_resource(pipeline, new_items, full_settings, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + expected_items_count = 10 + assert table_counts["items"] == expected_items_count + assert table_counts[NEW_ITEMS_TABLE] == 10 + + # test adding new column twice: filter will try to catch it before it is added for the second time + with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, items_with_new_column, full_settings, item_format, duplicates=2) + # delete extracted files if left after exception + pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources()) + + if contract_setting == "evolve": + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + else: + assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + expected_items_count += (20 if contract_setting in ["evolve", "discard_value"] else 0) + assert table_counts["items"] == expected_items_count + + # NOTE: arrow / pandas do not support variants and subtables so we must skip + if item_format == "json": + # subtable should work + run_resource(pipeline, items_with_subtable, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + expected_items_count += 10 + assert table_counts["items"] == expected_items_count + assert table_counts[SUBITEMS_TABLE] == 10 + + # test adding variant column + run_resource(pipeline, items_with_variant, full_settings) + # variants are not new columns and should be able to always evolve + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + expected_items_count += 10 + assert table_counts["items"] == expected_items_count + + +@pytest.mark.parametrize("contract_setting", schema_contract) +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_freeze_variants(contract_setting: str, setting_location: str) -> None: + + full_settings = { + setting_location: { + "data_type": contract_setting + }} + pipeline = get_pipeline() + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # subtable should work + run_resource(pipeline, items_with_subtable, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[SUBITEMS_TABLE] == 10 + + # new should work + run_resource(pipeline, new_items, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[NEW_ITEMS_TABLE] == 10 + + # test adding new column + run_resource(pipeline, items_with_new_column, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # test adding variant column + with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, items_with_variant, full_settings) + + if contract_setting == "evolve": + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + else: + assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 30) + + +def test_settings_precedence() -> None: + pipeline = get_pipeline() + + # load some data + run_resource(pipeline, items, {}) + + # trying to add new column when forbidden on resource will fail + run_resource(pipeline, items_with_new_column, {"resource": { + "columns": "discard_row" + }}) + + # when allowed on override it will work + run_resource(pipeline, items_with_new_column, { + "resource": {"columns": "freeze"}, + "override": {"columns": "evolve"} + }) + + +def test_settings_precedence_2() -> None: + pipeline = get_pipeline() + + # load some data + run_resource(pipeline, items, {"source": { + "data_type": "discard_row" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden on source will fail + run_resource(pipeline, items_with_variant, {"source": { + "data_type": "discard_row" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # if allowed on resource it will pass + run_resource(pipeline, items_with_variant, { + "resource": {"data_type": "evolve"}, + "source": {"data_type": "discard_row"} + }) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + + # if allowed on override it will also pass + run_resource(pipeline, items_with_variant, { + "resource": {"data_type": "discard_row"}, + "source": {"data_type": "discard_row"}, + "override": {"data_type": "evolve"}, + }) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + + +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_change_mode(setting_location: str) -> None: + pipeline = get_pipeline() + + # load some data + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden will fail + run_resource(pipeline, items_with_variant, {setting_location: { + "data_type": "discard_row" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # now allow + run_resource(pipeline, items_with_variant, {setting_location: { + "data_type": "evolve" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + + +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_single_settings_value(setting_location: str) -> None: + pipeline = get_pipeline() + + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden will fail + run_resource(pipeline, items_with_variant, {setting_location: "discard_row"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add new column will fail + run_resource(pipeline, items_with_new_column, {setting_location: "discard_row"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add new table will fail + run_resource(pipeline, new_items, {setting_location: "discard_row"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert "new_items" not in table_counts + + +def test_data_contract_interaction() -> None: + """ + ensure data contracts with pydantic are enforced properly + """ + from pydantic import BaseModel, Extra + + class Items(BaseModel): + id: int # noqa: A003 + name: Optional[str] + amount: Union[int, str, None] + class Config: + extra = Extra.forbid + + @dlt.resource(name="items") + def get_items(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + }] + + @dlt.resource(name="items", columns=Items) + def get_items_with_model(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + }] + + @dlt.resource(name="items") + def get_items_new_col(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + "new_col": "hello" + }] + + @dlt.resource(name="items") + def get_items_subtable(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + "sub": [{"hello": "dave"}] + }] + + # test valid object + pipeline = get_pipeline() + # items with model work + pipeline.run([get_items_with_model()]) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + # loading once with pydantic will freeze the cols + pipeline = get_pipeline() + pipeline.run([get_items_with_model()]) + with raises_frozen_exception(True): + pipeline.run([get_items_new_col()]) + + # it is possible to override contract when there are new columns + # items with model alone does not work, since contract is set to freeze + pipeline = get_pipeline() + pipeline.run([get_items_with_model()]) + pipeline.run([get_items_new_col()], schema_contract="evolve") + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + +def test_different_objects_in_one_load() -> None: + + pipeline = get_pipeline() + + @dlt.resource(name="items") + def get_items(): + yield { + "id": 1, + "name": "dave", + "amount": 50 + } + yield { + "id": 2, + "name": "dave", + "amount": 50, + "new_column": "some val" + } + + pipeline.run([get_items()], schema_contract={"columns": "freeze", "tables":"evolve"}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 + + +@pytest.mark.parametrize("table_mode", ["discard_row", "evolve", "freeze"]) +def test_dynamic_tables(table_mode: str) -> None: + + pipeline = get_pipeline() + + # adding columns with a data type makes this columns complete which makes this table complete -> it fails in the normalize because + # the tables is NOT new according to normalizer so the row is not discarded + # remove that and it will pass because the table contains just one incomplete column so it is incomplete so it is treated as new + # if you uncomment update code in the extract the problem probably goes away + @dlt.resource(name="items", table_name=lambda i: i["tables"], columns={"id": {}}) + def get_items(): + yield { + "id": 1, + "tables": "one", + } + yield { + "id": 2, + "tables": "two", + "new_column": "some val" + } + with raises_frozen_exception(table_mode == "freeze"): + pipeline.run([get_items()], schema_contract={"tables": table_mode}) + + if table_mode != "freeze": + assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == (1 if table_mode == "evolve" else 0) + assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == (1 if table_mode == "evolve" else 0) + + +@pytest.mark.parametrize("column_mode", ["discard_row", "evolve", "freeze"]) +def test_defined_column_in_new_table(column_mode: str) -> None: + pipeline = get_pipeline() + + @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) + def get_items(): + yield { + "id": 1, + "key": "value", + } + pipeline.run([get_items()], schema_contract={"columns": column_mode}) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_new_column_from_hint_and_data(column_mode: str) -> None: + + pipeline = get_pipeline() + + # we define complete column on id, this creates a complete table + # normalizer does not know that it is a new table and discards the row + # and it also excepts on column freeze + + @dlt.resource( + name="items", + columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) + def get_items(): + yield { + "id": 1, + "key": "value", + } + + pipeline.run([get_items()], schema_contract={"columns": column_mode}) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_two_new_columns_from_two_rows(column_mode: str) -> None: + + pipeline = get_pipeline() + + # this creates a complete table in first row + # and adds a new column to complete tables in 2nd row + # the test does not fail only because you clone schema in normalize + + @dlt.resource() + def items(): + yield { + "id": 1, + } + yield { + "id": 1, + "key": "value", + } + pipeline.run([items()], schema_contract={"columns": column_mode}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_dynamic_new_columns(column_mode: str) -> None: + + pipeline = get_pipeline() + + # fails because dlt is not able to add _dlt_load_id to tables. I think we should do an exception for those + # 1. schema.dlt_tables() - everything evolve + # 2. is_dlt_column (I hope we have helper) - column evolve, data_type freeze + + def dynamic_columns(item): + if item["id"] == 1: + return [{"name": "key", "data_type": "text", "nullable": True}] + if item["id"] == 2: + return [{"name": "id", "data_type": "bigint", "nullable": True}] + + @dlt.resource(name="items", table_name=lambda i: "items", schema_contract={"columns": column_mode}) # type: ignore + def get_items(): + yield { + "id": 1, + "key": "value", + } + yield { + "id": 2, + "key": "value", + } + + items = get_items() + items.apply_hints(columns=dynamic_columns) + # apply hints apply to `items` not the original resource, so doing get_items() below removed them completely + pipeline.run(items) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 2 diff --git a/tests/pipeline/test_schema_updates.py b/tests/pipeline/test_schema_updates.py index 97345061e3..b88c1a7773 100644 --- a/tests/pipeline/test_schema_updates.py +++ b/tests/pipeline/test_schema_updates.py @@ -1,8 +1,10 @@ +import os import dlt def test_schema_updates() -> None: + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p = dlt.pipeline(pipeline_name="test_schema_updates", full_refresh=True, destination="dummy") @dlt.source() @@ -15,7 +17,7 @@ def resource(): # test without normalizer attributes s = source() p.run(s, table_name="items", write_disposition="append") - assert p.default_schema._normalizers_config["json"]["config"] == {} + assert "config" not in p.default_schema._normalizers_config["json"] # add table propagation s = source() @@ -45,12 +47,12 @@ def resource(): s = source() s.root_key = False p.run(s, table_name="items", write_disposition="merge") + # source schema overwrites normalizer settings so `root` propagation is gone assert p.default_schema._normalizers_config["json"]["config"] == { "propagation": { "tables": { "items": {'_dlt_id': '_dlt_root_id'} - }, - "root": {'_dlt_id': '_dlt_root_id'} + } } } @@ -62,8 +64,7 @@ def resource(): "propagation": { "tables": { "items": {'_dlt_id': '_dlt_root_id'} - }, - "root": {'_dlt_id': '_dlt_root_id'} + } }, "max_nesting": 5 } @@ -77,8 +78,7 @@ def resource(): "tables": { "items": {'_dlt_id': '_dlt_root_id'}, "items2": {'_dlt_id': '_dlt_root_id'}, - }, - "root": {'_dlt_id': '_dlt_root_id'} + } }, "max_nesting": 50 } \ No newline at end of file diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 3e61c9510c..0d36ff3021 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -1,10 +1,16 @@ +import posixpath +from typing import Any, Dict, List, Tuple import pytest +import random from os import environ import dlt -from dlt.common import json -from dlt.common.pipeline import LoadInfo, PipelineContext +from dlt.common import json, sleep +from dlt.common.pipeline import LoadInfo +from dlt.common.schema.typing import LOADS_TABLE_NAME from dlt.common.typing import DictStrAny +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient +from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import TEST_STORAGE_ROOT @@ -35,6 +41,154 @@ def load_json_case(name: str) -> DictStrAny: return json.load(f) +def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: + """Returns row counts for `table_names` as dict""" + + # try sql, could be other destination though + try: + with p.sql_client() as c: + qualified_names = [c.make_qualified_table_name(name) for name in table_names] + query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" for name, q_name in zip(table_names, qualified_names)]) + with c.execute_query(query) as cur: + rows = list(cur.fetchall()) + return {r[0]: r[1] for r in rows} + except SqlClientNotAvailable: + pass + + # try filesystem + file_tables = load_files(p, *table_names) + result = {} + for table_name, items in file_tables.items(): + result[table_name] = len(items) + return result + +def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: + tables = [table["name"] for table in p.default_schema.data_tables()] + return load_table_counts(p, *tables) + + +def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> None: + table_counts = load_data_table_counts(p) + assert table_counts == expected_counts, f"Table counts do not match, expected {expected_counts}, got {table_counts}" + + +def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]: + """ + util function to load a filesystem destination file and return parsed content + values may not be cast to the right type, especially for insert_values, please + make sure to do conversions and casting if needed in your tests + """ + result: List[Dict[str, Any]] = [] + + # check if this is a file we want to read + file_name_items = file.split(".") + ext = file_name_items[-1] + if ext not in ["jsonl", "insert_values", "parquet"]: + return "skip", [] + + # table name will be last element of path + table_name = path.split("/")[-1] + + # skip loads table + if table_name == "_dlt_loads": + return table_name, [] + + full_path = posixpath.join(path, file) + + # load jsonl + if ext == "jsonl": + with open(full_path, "rU", encoding="utf-8") as f: + for line in f: + result.append(json.loads(line)) + + # load insert_values (this is a bit volatile if the exact format of the source file changes) + elif ext == "insert_values": + with open(full_path, "rU", encoding="utf-8") as f: + lines = f.readlines() + # extract col names + cols = lines[0][15:-2].split(",") + for line in lines[2:]: + values = line[1:-3].split(",") + result.append(dict(zip(cols, values))) + + # load parquet + elif ext == "parquet": + import pyarrow.parquet as pq + with open(full_path, "rb") as f: + table = pq.read_table(f) + cols = table.column_names + count = 0 + for column in table: + column_name = cols[count] + item_count = 0 + for item in column.to_pylist(): + if len(result) <= item_count: + result.append({column_name: item}) + else: + result[item_count][column_name] = item + item_count += 1 + count += 1 + + return table_name, result + + +def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: + """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" + client: FilesystemClient = p.destination_client() # type: ignore[assignment] + result: Dict[str, Any] = {} + for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): + for file in files: + table_name, items = load_file(basedir, file) + if table_name not in table_names: + continue + if table_name in result: + result[table_name] = result[table_name] + items + else: + result[table_name] = items + + # loads file is special case + if LOADS_TABLE_NAME in table_names and file.find(".{LOADS_TABLE_NAME}."): + result[LOADS_TABLE_NAME] = [] + + return result + + + +def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: + + # try sql, could be other destination though + try: + result = {} + for table_name in table_names: + table_rows = [] + columns = p.default_schema.get_table_columns(table_name).keys() + query_columns = ",".join(columns) + + with p.sql_client() as c: + f_q_table_name = c.make_qualified_table_name(table_name) + query = f"SELECT {query_columns} FROM {f_q_table_name}" + with c.execute_query(query) as cur: + for row in list(cur.fetchall()): + table_rows.append(dict(zip(columns, row))) + result[table_name] = table_rows + return result + + except SqlClientNotAvailable: + pass + + # try files + return load_files(p, *table_names) + + +def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_names: str) -> DictStrAny: + """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" + query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" for name in table_names]) + with p.sql_client() as c: + with c.execute_query(query) as cur: + rows = list(cur.fetchall()) + return {r[0]: r[1] for r in rows} + + @dlt.source def airtable_emojis(): @@ -59,3 +213,20 @@ def wide_peacock(): return budget, schedule, peacock, wide_peacock + + +def run_deferred(iters): + + @dlt.defer + def item(n): + sleep(random.random() / 2) + return n + + for n in range(iters): + yield item(n) + + +@dlt.source +def many_delayed(many, iters): + for n in range(many): + yield dlt.resource(run_deferred(iters), name="resource_" + str(n)) diff --git a/tests/utils.py b/tests/utils.py index 823b1cca83..8ec15a20ad 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,7 +5,7 @@ import requests import pytest from os import environ -from typing import Iterator, List +from typing import Any, Iterable, Iterator, List, Literal, Union, get_args from unittest.mock import patch from requests import Response @@ -21,7 +21,7 @@ from dlt.common.storages import FileStorage from dlt.common.schema import Schema from dlt.common.storages.versioned_storage import VersionedStorage -from dlt.common.typing import StrAny +from dlt.common.typing import StrAny, TDataItem from dlt.common.utils import custom_environ, uniq_id from dlt.common.pipeline import PipelineContext @@ -55,6 +55,13 @@ for destination in ACTIVE_DESTINATIONS: assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown active destination {destination}" + +# possible TDataItem types +TDataItemFormat = Literal["json", "pandas", "arrow", "arrow-batch"] +ALL_DATA_ITEM_FORMATS = get_args(TDataItemFormat) +"""List with TDataItem formats: json, arrow table/batch / pandas""" + + def TEST_DICT_CONFIG_PROVIDER(): # add test dictionary provider providers_context = Container()[ConfigProvidersContext] @@ -136,6 +143,7 @@ def unload_modules() -> Iterator[None]: @pytest.fixture(autouse=True) def wipe_pipeline() -> Iterator[None]: + """Wipes pipeline local state and deactivates it""" container = Container() if container[PipelineContext].is_active(): container[PipelineContext].deactivate() @@ -148,6 +156,26 @@ def wipe_pipeline() -> Iterator[None]: container[PipelineContext].deactivate() +def data_to_item_format(item_format: TDataItemFormat, data: Union[Iterator[TDataItem], Iterable[TDataItem]]) -> Any: + """Return the given data in the form of pandas, arrow table/batch or json items""" + if item_format == "json": + return data + + import pandas as pd + from dlt.common.libs.pyarrow import pyarrow as pa + + # Make dataframe from the data + df = pd.DataFrame(list(data)) + if item_format == "pandas": + return [df] + elif item_format == "arrow": + return [pa.Table.from_pandas(df)] + elif item_format == "arrow-batch": + return [pa.RecordBatch.from_pandas(df)] + else: + raise ValueError(f"Unknown item format: {item_format}") + + def init_test_logging(c: RunConfiguration = None) -> None: if not c: c = resolve_configuration(RunConfiguration()) @@ -182,6 +210,7 @@ def create_schema_with_name(schema_name) -> Schema: def assert_no_dict_key_starts_with(d: StrAny, key_prefix: str) -> None: assert all(not key.startswith(key_prefix) for key in d.keys()) + def skip_if_not_active(destination: str) -> None: assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown skipped destination {destination}" if destination not in ACTIVE_DESTINATIONS: