diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b597d49e6b..155b429b92 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -45,7 +45,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction --all-extras --with airflow + run: poetry install --no-interaction --all-extras --with airflow --with docs --with providers --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_airflow.yml b/.github/workflows/test_airflow.yml index d78a48e8f7..bbed326344 100644 --- a/.github/workflows/test_airflow.yml +++ b/.github/workflows/test_airflow.yml @@ -41,7 +41,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-airflow-runner - name: Install dependencies - run: poetry install --no-interaction --with airflow -E duckdb -E parquet + run: poetry install --no-interaction --with airflow --with pipeline -E duckdb -E parquet --with sentry-sdk - run: | poetry run pytest tests/helpers/airflow_tests diff --git a/.github/workflows/test_common.yml b/.github/workflows/test_common.yml index 23b6eb9fdd..24c8215c2b 100644 --- a/.github/workflows/test_common.yml +++ b/.github/workflows/test_common.yml @@ -55,40 +55,67 @@ jobs: virtualenvs-in-project: true installer-parallel: true - - name: Load cached venv - id: cached-poetry-dependencies - uses: actions/cache@v3 - with: - # path: ${{ steps.pip-cache.outputs.dir }} - path: .venv - key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} + # NOTE: do not cache. we want to have a clean state each run and we upgrade dependencies later + # - name: Load cached venv + # id: cached-poetry-dependencies + # uses: actions/cache@v3 + # with: + # # path: ${{ steps.pip-cache.outputs.dir }} + # path: .venv + # key: venv-${{ matrix.os }}-${{ matrix.python-version }}-${{ hashFiles('**/poetry.lock') }} + + - name: Install dependencies + run: poetry install --no-interaction --with sentry-sdk + + - run: | + poetry run pytest tests/common tests/normalize tests/reflection tests/sources tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py + if: runner.os != 'Windows' + name: Run common tests with minimum dependencies Linux/MAC + - run: | + poetry run pytest tests/common tests/normalize tests/reflection tests/sources tests/load/test_dummy_client.py tests/extract/test_extract.py tests/extract/test_sources.py tests/pipeline/test_pipeline_state.py -m "not forked" + if: runner.os == 'Windows' + name: Run common tests with minimum dependencies Windows + shell: cmd - - name: Install dependencies + sentry - run: poetry install --no-interaction -E parquet -E pydantic && pip install sentry-sdk + - name: Install duckdb dependencies + run: poetry install --no-interaction -E duckdb --with sentry-sdk - run: | - poetry run pytest tests/common tests/normalize tests/reflection tests/sources + poetry run pytest tests/pipeline/test_pipeline.py if: runner.os != 'Windows' - name: Run tests Linux/MAC + name: Run pipeline smoke tests with minimum deps Linux/MAC - run: | - poetry run pytest tests/common tests/normalize tests/reflection tests/sources -m "not forked" + poetry run pytest tests/pipeline/test_pipeline.py if: runner.os == 'Windows' - name: Run tests Windows + name: Run smoke tests with minimum deps Windows shell: cmd
- - name: Install extra dependencies - run: poetry install --no-interaction -E duckdb -E cli -E parquet -E pydantic + - name: Install pipeline dependencies + run: poetry install --no-interaction -E duckdb -E cli -E parquet --with sentry-sdk --with pipeline - run: | - poetry run pytest tests/extract tests/pipeline tests/cli/common + poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common if: runner.os != 'Windows' - name: Run extra tests Linux/MAC + name: Run extract and pipeline tests Linux/MAC - run: | - poetry run pytest tests/extract tests/pipeline tests/cli/common + poetry run pytest tests/extract tests/pipeline tests/libs tests/cli/common if: runner.os == 'Windows' - name: Run extra tests Windows + name: Run extract tests Windows shell: cmd + # - name: Install Pydantic 1.0 + # run: pip install "pydantic<2" + + # - run: | + # poetry run pytest tests/libs + # if: runner.os != 'Windows' + # name: Run extract and pipeline tests Linux/MAC + # - run: | + # poetry run pytest tests/libs + # if: runner.os == 'Windows' + # name: Run extract tests Windows + # shell: cmd + matrix_job_required_check: name: Common tests needs: run_common diff --git a/.github/workflows/test_dbt_runner.yml b/.github/workflows/test_dbt_runner.yml index db3b53e9fa..1803a53fc1 100644 --- a/.github/workflows/test_dbt_runner.yml +++ b/.github/workflows/test_dbt_runner.yml @@ -68,7 +68,7 @@ jobs: - name: Install dependencies # install dlt with postgres support - run: poetry install --no-interaction -E postgres -E dbt + run: poetry install --no-interaction -E postgres -E dbt --with sentry-sdk - run: | poetry run pytest tests/helpers/dbt_tests -k '(not venv)' diff --git a/.github/workflows/test_destination_athena.yml b/.github/workflows/test_destination_athena.yml index 704e66522b..b849188ddd 100644 --- a/.github/workflows/test_destination_athena.yml +++ b/.github/workflows/test_destination_athena.yml @@ -70,7 +70,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_athena_iceberg.yml b/.github/workflows/test_destination_athena_iceberg.yml index 6892a96bf1..97544f24d1 100644 --- a/.github/workflows/test_destination_athena_iceberg.yml +++ b/.github/workflows/test_destination_athena_iceberg.yml @@ -70,7 +70,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E athena + run: poetry install --no-interaction -E athena --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_bigquery.yml b/.github/workflows/test_destination_bigquery.yml index dcc7e7ba9b..e12d7bd0f0 100644 --- a/.github/workflows/test_destination_bigquery.yml +++ b/.github/workflows/test_destination_bigquery.yml @@ -79,7 +79,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E bigquery --with providers -E parquet + run: poetry install --no-interaction -E bigquery --with providers -E parquet --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_destination_mssql.yml b/.github/workflows/test_destination_mssql.yml index bba44e750d..6eb4427bbf 100644 ---
a/.github/workflows/test_destination_mssql.yml +++ b/.github/workflows/test_destination_mssql.yml @@ -65,7 +65,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet + run: poetry install --no-interaction -E mssql -E s3 -E gs -E az -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py diff --git a/.github/workflows/test_destination_qdrant.yml b/.github/workflows/test_destination_qdrant.yml index 09ded40f59..0ce3e3a3f9 100644 --- a/.github/workflows/test_destination_qdrant.yml +++ b/.github/workflows/test_destination_qdrant.yml @@ -59,7 +59,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E qdrant -E parquet + run: poetry install --no-interaction -E qdrant -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load/ if: runner.os != 'Windows' diff --git a/.github/workflows/test_destination_snowflake.yml b/.github/workflows/test_destination_snowflake.yml index 4aae3ec62e..fe81c6121f 100644 --- a/.github/workflows/test_destination_snowflake.yml +++ b/.github/workflows/test_destination_snowflake.yml @@ -71,7 +71,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az + run: poetry install --no-interaction -E snowflake -E s3 -E gs -E az --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load diff --git a/.github/workflows/test_destination_synapse.yml b/.github/workflows/test_destination_synapse.yml index e86e29ebf6..d0f364c382 100644 --- a/.github/workflows/test_destination_synapse.yml +++ b/.github/workflows/test_destination_synapse.yml @@ -5,9 +5,9 @@ on: branches: - master - devel - + workflow_dispatch: - + env: DESTINATION__SYNAPSE__CREDENTIALS: ${{ secrets.SYNAPSE_CREDENTIALS }} DESTINATION__SYNAPSE__CREDENTIALS__PASSWORD: ${{ secrets.SYNAPSE_PASSWORD }} @@ -42,7 +42,7 @@ jobs: runs-on: ${{ matrix.os }} steps: - + - name: Check out uses: actions/checkout@master @@ -70,7 +70,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E synapse -E s3 -E gs -E az + run: poetry install --no-interaction -E synapse -E s3 -E gs -E az --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load --ignore tests/load/pipeline/test_dbt_helper.py diff --git a/.github/workflows/test_destination_weaviate.yml b/.github/workflows/test_destination_weaviate.yml index 6a7a2e95cd..c771a28204 100644 --- a/.github/workflows/test_destination_weaviate.yml +++ b/.github/workflows/test_destination_weaviate.yml @@ -61,7 +61,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-gcp - name: Install dependencies - run: poetry install --no-interaction -E weaviate -E parquet + run: poetry install --no-interaction -E weaviate -E parquet --with sentry-sdk --with pipeline - run: | poetry run pytest tests/load/ if: runner.os != 'Windows' diff --git a/.github/workflows/test_destinations.yml 
b/.github/workflows/test_destinations.yml index f3f6c492db..f37feb872f 100644 --- a/.github/workflows/test_destinations.yml +++ b/.github/workflows/test_destinations.yml @@ -87,7 +87,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli + run: poetry install --no-interaction -E redshift -E gs -E s3 -E az -E parquet -E duckdb -E cli --with sentry-sdk --with pipeline # - name: Install self # run: poetry install --no-interaction diff --git a/.github/workflows/test_doc_snippets.yml b/.github/workflows/test_doc_snippets.yml index ad7d544219..004bafba05 100644 --- a/.github/workflows/test_doc_snippets.yml +++ b/.github/workflows/test_doc_snippets.yml @@ -59,7 +59,7 @@ jobs: - name: Install dependencies # if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' - run: poetry install --no-interaction -E duckdb -E weaviate -E parquet --with docs --without airflow + run: poetry install --no-interaction -E duckdb -E weaviate -E parquet --with docs --without airflow --with sentry-sdk --with pipeline - name: Run linter and tests run: make test-and-lint-snippets diff --git a/.github/workflows/test_local_destinations.yml b/.github/workflows/test_local_destinations.yml index 6c538d1968..42c3c2d13a 100644 --- a/.github/workflows/test_local_destinations.yml +++ b/.github/workflows/test_local_destinations.yml @@ -84,7 +84,7 @@ jobs: key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}-local-destinations - name: Install dependencies - run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate + run: poetry install --no-interaction -E postgres -E duckdb -E parquet -E filesystem -E cli -E weaviate --with sentry-sdk --with pipeline - run: poetry run pytest tests/load && poetry run pytest tests/cli name: Run tests Linux diff --git a/Makefile b/Makefile index 85f67818ac..66c429743b 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ has-poetry: poetry --version dev: has-poetry - poetry install --all-extras --with airflow --with docs --with providers + poetry install --all-extras --with airflow --with docs --with providers --with pipeline --with sentry-sdk lint: ./check-package.sh @@ -87,3 +87,4 @@ test-build-images: build-library grep `cat compiled_packages.txt` _gen_requirements.txt > compiled_requirements.txt docker build -f deploy/dlt/Dockerfile.airflow --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" . docker build -f deploy/dlt/Dockerfile --build-arg=COMMIT_SHA="$(shell git log -1 --pretty=%h)" --build-arg=IMAGE_VERSION="$(shell poetry version -s)" . 
+ diff --git a/dlt/__init__.py b/dlt/__init__.py index f5dde3f204..728343bdd6 100644 --- a/dlt/__init__.py +++ b/dlt/__init__.py @@ -31,6 +31,7 @@ from dlt.extract.decorators import source, resource, transformer, defer from dlt.pipeline import pipeline as _pipeline, run, attach, Pipeline, dbt, current as _current, mark as _mark from dlt.pipeline import progress +from dlt import destinations pipeline = _pipeline current = _current @@ -64,4 +65,5 @@ "TSecretValue", "TCredentials", "sources", + "destinations", ] diff --git a/dlt/cli/_dlt.py b/dlt/cli/_dlt.py index f719c30de0..dfda2966b9 100644 --- a/dlt/cli/_dlt.py +++ b/dlt/cli/_dlt.py @@ -25,13 +25,22 @@ pass +DEBUG_FLAG = False + + +def on_exception(ex: Exception, info: str) -> None: + click.secho(str(ex), err=True, fg="red") + fmt.note("Please refer to %s for further assistance" % fmt.bold(info)) + if DEBUG_FLAG: + raise ex + + @utils.track_command("init", False, "source_name", "destination_name") def init_command_wrapper(source_name: str, destination_name: str, use_generic_template: bool, repo_location: str, branch: str) -> int: try: init_command(source_name, destination_name, use_generic_template, repo_location, branch) except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_INIT_DOCS_URL)) + on_exception(ex, DLT_INIT_DOCS_URL) return -1 return 0 @@ -41,8 +50,7 @@ def list_verified_sources_command_wrapper(repo_location: str, branch: str) -> in try: list_verified_sources_command(repo_location, branch) except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_INIT_DOCS_URL)) + on_exception(ex, DLT_INIT_DOCS_URL) return -1 return 0 @@ -66,9 +74,8 @@ def deploy_command_wrapper(pipeline_script_path: str, deployment_method: str, re **kwargs ) except (CannotRestorePipelineException, PipelineWasNotRun) as ex: - click.secho(str(ex), err=True, fg="red") fmt.note("You must run the pipeline locally successfully at least once in order to deploy it.") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_DEPLOY_DOCS_URL)) + on_exception(ex, DLT_DEPLOY_DOCS_URL) return -2 except InvalidGitRepositoryError: click.secho( @@ -89,10 +96,8 @@ def deploy_command_wrapper(pipeline_script_path: str, deployment_method: str, re ) return -4 except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_DEPLOY_DOCS_URL)) + on_exception(ex, DLT_DEPLOY_DOCS_URL) return -5 - # TODO: display stack trace if with debug flag return 0 @@ -106,10 +111,10 @@ def pipeline_command_wrapper( except CannotRestorePipelineException as ex: click.secho(str(ex), err=True, fg="red") click.secho("Try command %s to restore the pipeline state from destination" % fmt.bold(f"dlt pipeline {pipeline_name} sync")) - return 1 + return -1 except Exception as ex: - click.secho(str(ex), err=True, fg="red") - return 1 + on_exception(ex, DLT_PIPELINE_COMMAND_DOCS_URL) + return -2 @utils.track_command("schema", False, "operation") @@ -133,8 +138,7 @@ def telemetry_status_command_wrapper() -> int: try: telemetry_status_command() except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_TELEMETRY_DOCS_URL)) + on_exception(ex, DLT_TELEMETRY_DOCS_URL) return -1 return 0 @@ -144,8 +148,7 @@ def telemetry_change_status_command_wrapper(enabled: bool) -> int: try: 
change_telemetry_status_command(enabled) except Exception as ex: - click.secho(str(ex), err=True, fg="red") - fmt.note("Please refer to %s for further assistance" % fmt.bold(DLT_TELEMETRY_DOCS_URL)) + on_exception(ex, DLT_TELEMETRY_DOCS_URL) return -1 return 0 @@ -186,12 +189,28 @@ def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespac fmt.ALWAYS_CHOOSE_DEFAULT = True +class DebugAction(argparse.Action): + def __init__(self, option_strings: Sequence[str], dest: Any = argparse.SUPPRESS, default: Any = argparse.SUPPRESS, help: str = None) -> None: # noqa + super(DebugAction, self).__init__( + option_strings=option_strings, + dest=dest, + default=default, + nargs=0, + help=help + ) + def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Any, option_string: str = None) -> None: + global DEBUG_FLAG + # will show stack traces (and maybe more debug things) + DEBUG_FLAG = True + + def main() -> int: parser = argparse.ArgumentParser(description="Creates, adds, inspects and deploys dlt pipelines.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--version', action="version", version='%(prog)s {version}'.format(version=__version__)) parser.add_argument('--disable-telemetry', action=TelemetryAction, help="Disables telemetry before command is executed") parser.add_argument('--enable-telemetry', action=TelemetryAction, help="Enables telemetry before command is executed") parser.add_argument('--non-interactive', action=NonInteractiveAction, help="Non interactive mode. Default choices are automatically made for confirmations and prompts.") + parser.add_argument('--debug', action=DebugAction, help="Displays full stack traces on exceptions.") subparsers = parser.add_subparsers(dest="command") init_cmd = subparsers.add_parser("init", help="Creates a pipeline project in the current folder by adding existing verified source or creating a new one from template.") @@ -239,8 +258,6 @@ def main() -> int: pipe_cmd.add_argument("pipeline_name", nargs='?', help="Pipeline name") pipe_cmd.add_argument("--pipelines-dir", help="Pipelines working directory", default=None) pipe_cmd.add_argument("--verbose", "-v", action='count', default=0, help="Provides more information for certain commands.", dest="verbosity") - # pipe_cmd.add_argument("--dataset-name", help="Dataset name used to sync destination when local pipeline state is missing.") - # pipe_cmd.add_argument("--destination", help="Destination name used to sync when local pipeline state is missing.") pipeline_subparsers = pipe_cmd.add_subparsers(dest="operation", required=False) @@ -251,6 +268,7 @@ def main() -> int: pipeline_subparsers.add_parser("info", help="Displays state of the pipeline, use -v or -vv for more info") pipeline_subparsers.add_parser("show", help="Generates and launches Streamlit app with the loading status and dataset explorer") pipeline_subparsers.add_parser("failed-jobs", help="Displays information on all the failed loads in all completed packages, failed jobs and associated error messages") + pipeline_subparsers.add_parser("drop-pending-packages", help="Deletes all extracted and normalized packages including those that are partially loaded.") pipeline_subparsers.add_parser( "sync", help="Drops the local state of the pipeline and resets all the schemas and restores it from destination. 
The destination state, data and schemas are left intact.", @@ -290,6 +308,9 @@ def main() -> int: return pipeline_command_wrapper("list", "-", args.pipelines_dir, args.verbosity) else: command_kwargs = dict(args._get_kwargs()) + if not command_kwargs.get("pipeline_name"): + pipe_cmd.print_usage() + return -1 command_kwargs['operation'] = args.operation or "info" del command_kwargs["command"] del command_kwargs["list_pipelines"] diff --git a/dlt/cli/deploy_command.py b/dlt/cli/deploy_command.py index 7634f173b3..a7bdf2e0e7 100644 --- a/dlt/cli/deploy_command.py +++ b/dlt/cli/deploy_command.py @@ -16,7 +16,7 @@ from dlt.version import DLT_PKG_NAME -from dlt.common.destination.reference import DestinationReference +from dlt.common.destination.reference import Destination REQUIREMENTS_GITHUB_ACTION = "requirements_github_action.txt" DLT_DEPLOY_DOCS_URL = "https://dlthub.com/docs/walkthroughs/deploy-a-pipeline" @@ -198,7 +198,7 @@ def __init__( def _generate_workflow(self, *args: Optional[Any]) -> None: self.deployment_method = DeploymentMethods.airflow_composer.value - req_dep = f"{DLT_PKG_NAME}[{DestinationReference.to_name(self.state['destination'])}]" + req_dep = f"{DLT_PKG_NAME}[{Destination.to_name(self.state['destination'])}]" req_dep_line = f"{req_dep}>={pkg_version(DLT_PKG_NAME)}" self.artifacts["requirements_txt"] = req_dep_line diff --git a/dlt/cli/init_command.py b/dlt/cli/init_command.py index c246ac87de..4cec1706b9 100644 --- a/dlt/cli/init_command.py +++ b/dlt/cli/init_command.py @@ -12,7 +12,7 @@ from dlt.common.pipeline import get_dlt_repos_dir from dlt.common.source import _SOURCES from dlt.version import DLT_PKG_NAME, __version__ -from dlt.common.destination import DestinationReference +from dlt.common.destination import Destination from dlt.common.reflection.utils import rewrite_python_script from dlt.common.schema.utils import is_valid_schema_name from dlt.common.schema.exceptions import InvalidSchemaName @@ -160,8 +160,8 @@ def list_verified_sources_command(repo_location: str, branch: str = None) -> Non def init_command(source_name: str, destination_name: str, use_generic_template: bool, repo_location: str, branch: str = None) -> None: # try to import the destination and get config spec - destination_reference = DestinationReference.from_name(destination_name) - destination_spec = destination_reference.spec() + destination_reference = Destination.from_reference(destination_name) + destination_spec = destination_reference.spec fmt.echo("Looking up the init scripts in %s..." 
% fmt.bold(repo_location)) clone_storage = git.get_fresh_repo_files(repo_location, get_dlt_repos_dir(), branch=branch) diff --git a/dlt/cli/pipeline_command.py b/dlt/cli/pipeline_command.py index 52a9c8ffdc..2d705dc1a3 100644 --- a/dlt/cli/pipeline_command.py +++ b/dlt/cli/pipeline_command.py @@ -1,5 +1,5 @@ import yaml -from typing import Any +from typing import Any, Sequence, Tuple import dlt from dlt.cli.exceptions import CliCommandException @@ -9,8 +9,7 @@ from dlt.common.runners import Venv from dlt.common.runners.stdout import iter_stdout from dlt.common.schema.utils import group_tables_by_resource, remove_defaults -from dlt.common.storages.file_storage import FileStorage -from dlt.common.typing import DictStrAny +from dlt.common.storages import FileStorage, LoadStorage from dlt.pipeline.helpers import DropCommand from dlt.pipeline.exceptions import CannotRestorePipelineException @@ -33,6 +32,8 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver return try: + if verbosity > 0: + fmt.echo("Attaching to pipeline %s" % fmt.bold(pipeline_name)) p = dlt.attach(pipeline_name=pipeline_name, pipelines_dir=pipelines_dir) except CannotRestorePipelineException as e: if operation not in {"sync", "drop"}: @@ -52,6 +53,22 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver if operation == "sync": return # No need to sync again + def _display_pending_packages() -> Tuple[Sequence[str], Sequence[str]]: + extracted_files = p.list_extracted_resources() + if extracted_files: + fmt.echo("Has %s extracted files ready to be normalized" % fmt.bold(str(len(extracted_files)))) + norm_packages = p.list_normalized_load_packages() + if norm_packages: + fmt.echo("Has %s load packages ready to be loaded with following load ids:" % fmt.bold(str(len(norm_packages)))) + for load_id in norm_packages: + fmt.echo(load_id) + # load first (oldest) package + first_package_info = p.get_load_package_info(norm_packages[0]) + if LoadStorage.is_package_partially_loaded(first_package_info): + fmt.warning("This package is partially loaded. 
Data in the destination may be modified.") + fmt.echo() + return extracted_files, norm_packages + fmt.echo("Found pipeline %s in %s" % (fmt.bold(p.pipeline_name), fmt.bold(p.pipelines_dir))) if operation == "show": @@ -102,15 +119,7 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver fmt.echo("%s with %s table(s) and %s resource state slot(s)" % (fmt.bold(resource_name), fmt.bold(str(len(tables))), fmt.bold(str(res_state_slots)))) fmt.echo() fmt.echo("Working dir content:") - extracted_files = p.list_extracted_resources() - if extracted_files: - fmt.echo("Has %s extracted files ready to be normalized" % fmt.bold(str(len(extracted_files)))) - norm_packages = p.list_normalized_load_packages() - if norm_packages: - fmt.echo("Has %s load packages ready to be loaded with following load ids:" % fmt.bold(str(len(norm_packages)))) - for load_id in norm_packages: - fmt.echo(load_id) - fmt.echo() + _display_pending_packages() loaded_packages = p.list_completed_load_packages() if loaded_packages: fmt.echo("Has %s completed load packages with following load ids:" % fmt.bold(str(len(loaded_packages)))) @@ -148,6 +157,13 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver else: fmt.echo("No failed jobs found") + if operation == "drop-pending-packages": + extracted_files, norm_packages = _display_pending_packages() + if len(extracted_files) == 0 and len(norm_packages) == 0: + fmt.echo("No pending packages found") + if fmt.confirm("Delete the above packages?", default=False): + p.drop_pending_packages(with_partial_loads=True) + fmt.echo("Pending packages deleted") if operation == "sync": if fmt.confirm("About to drop the local state of the pipeline and reset all the schemas. The destination state, data and schemas are left intact. Proceed?", default=False): @@ -196,7 +212,7 @@ def pipeline_command(operation: str, pipeline_name: str, pipelines_dir: str, ver fmt.warning(warning) return - fmt.echo("About to drop the following data in dataset %s in destination %s:" % (fmt.bold(drop.info["dataset_name"]), fmt.bold(p.destination.__name__))) + fmt.echo("About to drop the following data in dataset %s in destination %s:" % (fmt.bold(drop.info["dataset_name"]), fmt.bold(p.destination.name))) fmt.echo("%s: %s" % (fmt.style("Selected schema", fg="green"), drop.info["schema_name"])) fmt.echo("%s: %s" % (fmt.style("Selected resource(s)", fg="green"), drop.info["resource_names"])) fmt.echo("%s: %s" % (fmt.style("Table(s) to drop", fg="green"), drop.info["tables"])) diff --git a/dlt/common/configuration/inject.py b/dlt/common/configuration/inject.py index 1880727a0f..f50e947011 100644 --- a/dlt/common/configuration/inject.py +++ b/dlt/common/configuration/inject.py @@ -32,7 +32,8 @@ def with_config( sections: Tuple[str, ...] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, - include_defaults: bool = True + include_defaults: bool = True, + accept_partial: bool = False, ) -> TFun: ... @@ -45,7 +46,8 @@ def with_config( sections: Tuple[str, ...] = (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, - include_defaults: bool = True + include_defaults: bool = True, + accept_partial: bool = False, ) -> Callable[[TFun], TFun]: ... @@ -57,7 +59,9 @@ def with_config( sections: Tuple[str, ...] 
= (), sections_merge_style: ConfigSectionContext.TMergeFunc = ConfigSectionContext.prefer_incoming, auto_pipeline_section: bool = False, - include_defaults: bool = True + include_defaults: bool = True, + accept_partial: bool = False, + initial_config: Optional[BaseConfiguration] = None, ) -> Callable[[TFun], TFun]: """Injects values into decorated function arguments following the specification in `spec` or by deriving one from function's signature. @@ -127,7 +131,9 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: curr_sections = sections # if one of arguments is spec the use it as initial value - if spec_arg: + if initial_config: + config = initial_config + elif spec_arg: config = bound_args.arguments.get(spec_arg.name, None) # resolve SPEC, also provide section_context with pipeline_name if pipeline_name_arg: @@ -139,7 +145,7 @@ def _wrap(*args: Any, **kwargs: Any) -> Any: with _RESOLVE_LOCK: with inject_section(section_context): # print(f"RESOLVE CONF in inject: {f.__name__}: {section_context.sections} vs {sections}") - config = resolve_configuration(config or SPEC(), explicit_value=bound_args.arguments) + config = resolve_configuration(config or SPEC(), explicit_value=bound_args.arguments, accept_partial=accept_partial) resolved_params = dict(config) # overwrite or add resolved params for p in sig.parameters.values(): diff --git a/dlt/common/data_writers/buffered.py b/dlt/common/data_writers/buffered.py index 5c93e22bc6..783a3501d2 100644 --- a/dlt/common/data_writers/buffered.py +++ b/dlt/common/data_writers/buffered.py @@ -68,7 +68,7 @@ def __init__( except TypeError: raise InvalidFileNameTemplateException(file_name_template) - def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> None: + def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> int: self._ensure_open() # rotate file if columns changed and writer does not allow for that # as the only allowed change is to add new column (no updates/deletes), we detect the change by comparing lengths @@ -78,21 +78,24 @@ def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> Non # until the first chunk is written we can change the columns schema freely if columns is not None: self._current_columns = dict(columns) + + new_rows_count: int if isinstance(item, List): # items coming in single list will be written together, not matter how many are there self._buffered_items.extend(item) # update row count, if item supports "num_rows" it will be used to count items if len(item) > 0 and hasattr(item[0], "num_rows"): - self._buffered_items_count += sum(tbl.num_rows for tbl in item) + new_rows_count = sum(tbl.num_rows for tbl in item) else: - self._buffered_items_count += len(item) + new_rows_count = len(item) else: self._buffered_items.append(item) # update row count, if item supports "num_rows" it will be used to count items if hasattr(item, "num_rows"): - self._buffered_items_count += item.num_rows + new_rows_count = item.num_rows else: - self._buffered_items_count += 1 + new_rows_count = 1 + self._buffered_items_count += new_rows_count # flush if max buffer exceeded if self._buffered_items_count >= self.buffer_max_items: self._flush_items() @@ -104,6 +107,7 @@ def write_data_item(self, item: TDataItems, columns: TTableSchemaColumns) -> Non # rotate on max items elif self.file_max_items and self._writer.items_count >= self.file_max_items: self._rotate_file() + return new_rows_count def write_empty_file(self, columns: TTableSchemaColumns) -> None: if columns is not None: diff --git 
a/dlt/common/data_writers/writers.py b/dlt/common/data_writers/writers.py index 401f6aafd2..412e732e97 100644 --- a/dlt/common/data_writers/writers.py +++ b/dlt/common/data_writers/writers.py @@ -220,7 +220,7 @@ def __init__(self, self.parquet_row_group_size = row_group_size def _create_writer(self, schema: "pa.Schema") -> "pa.parquet.ParquetWriter": - from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype + from dlt.common.libs.pyarrow import pyarrow return pyarrow.parquet.ParquetWriter(self._f, schema, flavor=self.parquet_flavor, version=self.parquet_version, data_page_size=self.parquet_data_page_size) def write_header(self, columns_schema: TTableSchemaColumns) -> None: diff --git a/dlt/common/destination/__init__.py b/dlt/common/destination/__init__.py index 88b5d5ef06..4857851fa9 100644 --- a/dlt/common/destination/__init__.py +++ b/dlt/common/destination/__init__.py @@ -1,10 +1,11 @@ from dlt.common.destination.capabilities import DestinationCapabilitiesContext, TLoaderFileFormat, ALL_SUPPORTED_FILE_FORMATS -from dlt.common.destination.reference import DestinationReference, TDestinationReferenceArg +from dlt.common.destination.reference import TDestinationReferenceArg, Destination, TDestination __all__ = [ "DestinationCapabilitiesContext", "TLoaderFileFormat", "ALL_SUPPORTED_FILE_FORMATS", - "DestinationReference", "TDestinationReferenceArg", + "Destination", + "TDestination", ] diff --git a/dlt/common/destination/reference.py b/dlt/common/destination/reference.py index 13172b41e9..1c3560cbbd 100644 --- a/dlt/common/destination/reference.py +++ b/dlt/common/destination/reference.py @@ -1,10 +1,11 @@ from abc import ABC, abstractmethod, abstractproperty from importlib import import_module from types import TracebackType, ModuleType -from typing import ClassVar, Final, Optional, NamedTuple, Literal, Sequence, Iterable, Type, Protocol, Union, TYPE_CHECKING, cast, List, ContextManager, Dict, Any +from typing import ClassVar, Final, Optional, NamedTuple, Literal, Sequence, Iterable, Type, Protocol, Union, TYPE_CHECKING, cast, List, ContextManager, Dict, Any, Callable, TypeVar, Generic from contextlib import contextmanager import datetime # noqa: 251 from copy import deepcopy +import inspect from dlt.common import logger from dlt.common.exceptions import IdentifierTooLongException, InvalidDestinationReference, UnknownDestinationModule @@ -12,7 +13,7 @@ from dlt.common.schema.typing import TWriteDisposition from dlt.common.schema.exceptions import InvalidDatasetName from dlt.common.schema.utils import get_write_disposition, get_table_format -from dlt.common.configuration import configspec +from dlt.common.configuration import configspec, with_config, resolve_configuration, known_sections from dlt.common.configuration.specs import BaseConfiguration, CredentialsConfiguration from dlt.common.configuration.accessors import config from dlt.common.destination.capabilities import DestinationCapabilitiesContext @@ -23,7 +24,10 @@ from dlt.common.utils import get_module_name from dlt.common.configuration.specs import GcpCredentials, AwsCredentialsWithoutDefaults + TLoaderReplaceStrategy = Literal["truncate-and-insert", "insert-from-staging", "staging-optimized"] +TDestinationConfig = TypeVar("TDestinationConfig", bound="DestinationClientConfiguration") +TDestinationClient = TypeVar("TDestinationClient", bound="JobClientBase") class StorageSchemaInfo(NamedTuple): @@ -344,59 +348,102 @@ def should_truncate_table_before_load_on_staging_destination(self, table: TTable # the default is to 
truncate the tables on the staging destination... return True -TDestinationReferenceArg = Union["DestinationReference", ModuleType, None, str] +TDestinationReferenceArg = Union[str, "Destination", None] -class DestinationReference(Protocol): - __name__: str - """Name of the destination""" +class Destination(ABC, Generic[TDestinationConfig, TDestinationClient]): + """A destination factory that can be partially pre-configured + with credentials and other config params. + """ + config_params: Optional[Dict[str, Any]] = None + + def __init__(self, **kwargs: Any) -> None: + # Create initial unresolved destination config + # Argument defaults are filtered out here because we only want arguments passed explicitly + # to supersede config from the environment or pipeline args + sig = inspect.signature(self.__class__) + params = sig.parameters + self.config_params = { + k: v for k, v in kwargs.items() + if k not in params or v != params[k].default + } + + @property + @abstractmethod + def spec(self) -> Type[TDestinationConfig]: + """A spec of destination configuration that also contains destination credentials""" + ... + @abstractmethod def capabilities(self) -> DestinationCapabilitiesContext: """Destination capabilities ie. supported loader file formats, identifier name lengths, naming conventions, escape function etc.""" + ... - def client(self, schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> "JobClientBase": - """A job client responsible for starting and resuming load jobs""" + @property + def name(self) -> str: + return self.__class__.__name__ - def spec(self) -> Type[DestinationClientConfiguration]: - """A spec of destination configuration that also contains destination credentials""" + @property + @abstractmethod + def client_class(self) -> Type[TDestinationClient]: + """A job client class responsible for starting and resuming load jobs""" + ... + + def configuration(self, initial_config: TDestinationConfig) -> TDestinationConfig: + """Get a fully resolved destination config from the initial config + """ + return resolve_configuration( + initial_config, + sections=(known_sections.DESTINATION, self.name), + # Already populated values will supersede resolved env config + explicit_value=self.config_params + ) + + @staticmethod + def to_name(ref: TDestinationReferenceArg) -> str: + if ref is None: + raise InvalidDestinationReference(ref) + if isinstance(ref, str): + return ref.rsplit(".", 1)[-1] + return ref.name @staticmethod - def from_name(destination: TDestinationReferenceArg) -> "DestinationReference": - if destination is None: + def from_reference(ref: TDestinationReferenceArg, credentials: Optional[CredentialsConfiguration] = None, **kwargs: Any) -> Optional["Destination[DestinationClientConfiguration, JobClientBase]"]: + """Instantiate destination from str reference. + The ref can be a destination name or import path pointing to a destination class (e.g. `dlt.destinations.postgres`) + """ + if ref is None: return None + if isinstance(ref, Destination): + return ref + if not isinstance(ref, str): + raise InvalidDestinationReference(ref) + try: + if "." in ref: + module_path, attr_name = ref.rsplit(".", 1) + dest_module = import_module(module_path) + else: + from dlt import destinations as dest_module + attr_name = ref + except ModuleNotFoundError as e: + raise UnknownDestinationModule(ref) from e - # if destination is a str, get destination reference by dynamically importing module - if isinstance(destination, str): - try: - if "." 
in destination: - # this is full module name - destination_ref = cast(DestinationReference, import_module(destination)) - else: - # from known location - destination_ref = cast(DestinationReference, import_module(f"dlt.destinations.{destination}")) - except ImportError: - if "." in destination: - raise UnknownDestinationModule(destination) - else: - # allow local external module imported without dot - try: - destination_ref = cast(DestinationReference, import_module(destination)) - except ImportError: - raise UnknownDestinationModule(destination) - else: - destination_ref = cast(DestinationReference, destination) - - # make sure the reference is correct try: - c = destination_ref.spec() - c.credentials - except Exception: - raise InvalidDestinationReference(destination) + factory: Type[Destination[DestinationClientConfiguration, JobClientBase]] = getattr(dest_module, attr_name) + except AttributeError as e: + raise UnknownDestinationModule(ref) from e + if credentials: + kwargs["credentials"] = credentials + try: + dest = factory(**kwargs) + dest.spec + except Exception as e: + raise InvalidDestinationReference(ref) from e + return dest - return destination_ref + def client(self, schema: Schema, initial_config: TDestinationConfig = config.value) -> TDestinationClient: + """Returns a configured instance of the destination's job client""" + return self.client_class(schema, self.configuration(initial_config)) - @staticmethod - def to_name(destination: TDestinationReferenceArg) -> str: - if isinstance(destination, ModuleType): - return get_module_name(destination) - return destination.split(".")[-1] # type: ignore + +TDestination = Destination[DestinationClientConfiguration, JobClientBase] diff --git a/dlt/common/json/__init__.py b/dlt/common/json/__init__.py index edb48643ef..c4acf66c72 100644 --- a/dlt/common/json/__init__.py +++ b/dlt/common/json/__init__.py @@ -181,6 +181,11 @@ def custom_pua_remove(obj: Any) -> Any: return obj +def may_have_pua(line: bytes) -> bool: + """Checks if bytes string contains pua marker""" + return b'\xef\x80' in line + + # pick the right impl json: SupportsJson = None if os.environ.get("DLT_USE_JSON") == "simplejson": diff --git a/dlt/common/libs/pyarrow.py b/dlt/common/libs/pyarrow.py index fb2f5c2e72..585bee0d2f 100644 --- a/dlt/common/libs/pyarrow.py +++ b/dlt/common/libs/pyarrow.py @@ -1,12 +1,14 @@ from typing import Any, Tuple, Optional, Union, Callable, Iterable, Iterator, Sequence, Tuple +from copy import copy + from dlt import version from dlt.common.exceptions import MissingDependencyException -from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.schema.typing import DLT_NAME_PREFIX, TTableSchemaColumns from dlt.common.destination.capabilities import DestinationCapabilitiesContext -from dlt.common.schema.typing import TColumnType, TColumnSchemaBase -from dlt.common.data_types import TDataType -from dlt.common.typing import TFileOrPath +from dlt.common.schema.typing import TColumnType +from dlt.common.typing import StrStr, TFileOrPath +from dlt.common.normalizers.naming import NamingConvention try: import pyarrow @@ -140,23 +142,120 @@ def _get_column_type_from_py_arrow(dtype: pyarrow.DataType) -> TColumnType: def remove_null_columns(item: TAnyArrowItem) -> TAnyArrowItem: - """Remove all columns of datatype pyarrow.null() from the table or record batch - """ + """Remove all columns of datatype pyarrow.null() from the table or record batch""" + return remove_columns(item, [field.name for field in item.schema if 
pyarrow.types.is_null(field.type)]) + + +def remove_columns(item: TAnyArrowItem, columns: Sequence[str]) -> TAnyArrowItem: + """Remove `columns` from Arrow `item`""" + if not columns: + return item + if isinstance(item, pyarrow.Table): - return item.drop([field.name for field in item.schema if pyarrow.types.is_null(field.type)]) + return item.drop(columns) elif isinstance(item, pyarrow.RecordBatch): - null_idx = [i for i, col in enumerate(item.columns) if pyarrow.types.is_null(col.type)] - new_schema = item.schema - for i in reversed(null_idx): - new_schema = new_schema.remove(i) - return pyarrow.RecordBatch.from_arrays( - [col for i, col in enumerate(item.columns) if i not in null_idx], - schema=new_schema - ) + # NOTE: select is available in pyarrow 12 an up + return item.select([n for n in item.schema.names if n not in columns]) # reverse selection else: raise ValueError(item) +def append_column(item: TAnyArrowItem, name: str, data: Any) -> TAnyArrowItem: + """Appends new column to Table or RecordBatch""" + if isinstance(item, pyarrow.Table): + return item.append_column(name, data) + elif isinstance(item, pyarrow.RecordBatch): + new_field = pyarrow.field(name, data.type) + return pyarrow.RecordBatch.from_arrays(item.columns + [data], schema=item.schema.append(new_field)) + else: + raise ValueError(item) + + +def rename_columns(item: TAnyArrowItem, new_column_names: Sequence[str]) -> TAnyArrowItem: + """Rename arrow columns on Table or RecordBatch, returns same data but with renamed schema""" + + if list(item.schema.names) == list(new_column_names): + # No need to rename + return item + + if isinstance(item, pyarrow.Table): + return item.rename_columns(new_column_names) + elif isinstance(item, pyarrow.RecordBatch): + new_fields = [field.with_name(new_name) for new_name, field in zip(new_column_names, item.schema)] + return pyarrow.RecordBatch.from_arrays(item.columns, schema=pyarrow.schema(new_fields)) + else: + raise TypeError(f"Unsupported data item type {type(item)}") + + +def normalize_py_arrow_schema( + item: TAnyArrowItem, + columns: TTableSchemaColumns, + naming: NamingConvention, + caps: DestinationCapabilitiesContext +) -> TAnyArrowItem: + """Normalize arrow `item` schema according to the `columns`. + + 1. arrow schema field names will be normalized according to `naming` + 2. arrows columns will be reordered according to `columns` + 3. empty columns will be inserted if they are missing, types will be generated using `caps` + """ + rename_mapping = get_normalized_arrow_fields_mapping(item, naming) + rev_mapping = {v: k for k, v in rename_mapping.items()} + dlt_table_prefix = naming.normalize_table_identifier(DLT_NAME_PREFIX) + + # remove all columns that are dlt columns but are not present in arrow schema. 
we do not want to add such columns + # that should happen in the normalizer + columns = {name:column for name, column in columns.items() if not name.startswith(dlt_table_prefix) or name in rev_mapping} + + # check if nothing to rename + if list(rename_mapping.keys()) == list(rename_mapping.values()): + # check if nothing to reorder + if list(rename_mapping.keys())[:len(columns)]== list(columns.keys()): + return item + + schema = item.schema + new_fields = [] + new_columns = [] + + for column_name, column in columns.items(): + # get original field name + field_name = rev_mapping.pop(column_name, column_name) + if field_name in rename_mapping: + idx = schema.get_field_index(field_name) + # use renamed field + new_fields.append(schema.field(idx).with_name(column_name)) + new_columns.append(item.column(idx)) + else: + # column does not exist in pyarrow. create empty field and column + new_field = pyarrow.field( + column_name, + get_py_arrow_datatype(column, caps, "UTC"), + nullable=column.get("nullable", True) + ) + new_fields.append(new_field) + new_columns.append(pyarrow.nulls(item.num_rows, type=new_field.type)) + + # add the remaining columns + for column_name, field_name in rev_mapping.items(): + idx = schema.get_field_index(field_name) + # use renamed field + new_fields.append(schema.field(idx).with_name(column_name)) + new_columns.append(item.column(idx)) + + # create desired type + return item.__class__.from_arrays(new_columns, schema=pyarrow.schema(new_fields)) + + +def get_normalized_arrow_fields_mapping(item: TAnyArrowItem, naming: NamingConvention) -> StrStr: + """Normalizes schema field names and returns mapping from original to normalized name. Raises on name clashes""" + norm_f = naming.normalize_identifier + name_mapping = {n.name: norm_f(n.name) for n in item.schema} + # verify if names uniquely normalize + normalized_names = set(name_mapping.values()) + if len(name_mapping) != len(normalized_names): + raise NameNormalizationClash(f"Arrow schema fields normalized from {list(name_mapping.keys())} to {list(normalized_names)}") + return name_mapping + def py_arrow_to_table_schema_columns(schema: pyarrow.Schema) -> TTableSchemaColumns: """Convert a PyArrow schema to a table schema columns dict. @@ -193,9 +292,8 @@ def get_row_count(parquet_file: TFileOrPath) -> int: def is_arrow_item(item: Any) -> bool: return isinstance(item, (pyarrow.Table, pyarrow.RecordBatch)) - -TNewColumns = Sequence[Tuple[pyarrow.Field, Callable[[pyarrow.Table], Iterable[Any]]]] - +TNewColumns = Sequence[Tuple[int, pyarrow.Field, Callable[[pyarrow.Table], Iterable[Any]]]] +"""Sequence of tuples: (field index, field, generating function)""" def pq_stream_with_new_columns( parquet_file: TFileOrPath, columns: TNewColumns, row_groups_per_read: int = 1 @@ -206,7 +304,7 @@ def pq_stream_with_new_columns( Args: parquet_file: path or file object to parquet file - columns: list of columns to add in the form of (`pyarrow.Field`, column_value_callback) + columns: list of columns to add in the form of (insertion index, `pyarrow.Field`, column_value_callback) The callback should accept a `pyarrow.Table` and return an array of values for the column. row_groups_per_read: number of row groups to read at a time. Defaults to 1. 
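For reference, a minimal sketch of how the new three-element column tuples of `pq_stream_with_new_columns` might be used, based only on the signature and tuple format shown above; the parquet path and the added `_load_id` column are hypothetical:

```python
import pyarrow as pa
from dlt.common.libs.pyarrow import pq_stream_with_new_columns

# hypothetical extra column: a constant "_load_id" string appended after the existing columns (index -1)
load_id_field = pa.field("_load_id", pa.string())
new_columns = [
    (-1, load_id_field, lambda tbl: pa.array(["1700000000.123"] * tbl.num_rows)),
]

# stream the file a row group at a time; each yielded pyarrow.Table carries the extra column
for tbl in pq_stream_with_new_columns("items.parquet", new_columns, row_groups_per_read=1):
    ...
```

An index of -1 appends the column, while any other index inserts it at that position via `Table.add_column`.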
@@ -218,6 +316,15 @@ def pq_stream_with_new_columns( # Iterate through n row groups at a time for i in range(0, n_groups, row_groups_per_read): tbl: pyarrow.Table = reader.read_row_groups(range(i, min(i + row_groups_per_read, n_groups))) - for col in columns: - tbl = tbl.append_column(col[0], col[1](tbl)) + for idx, field, gen_ in columns: + if idx == -1: + tbl = tbl.append_column(field, gen_(tbl)) + else: + tbl = tbl.add_column(idx, field, gen_(tbl)) yield tbl + + +class NameNormalizationClash(ValueError): + def __init__(self, reason: str) -> None: + msg = f"Arrow column name clash after input data normalization. {reason}" + super().__init__(msg) diff --git a/dlt/common/libs/pydantic.py b/dlt/common/libs/pydantic.py index c66d67f1f7..1b65fa3a7e 100644 --- a/dlt/common/libs/pydantic.py +++ b/dlt/common/libs/pydantic.py @@ -1,30 +1,68 @@ -from typing import Type, Union, get_type_hints, get_args, Any +from __future__ import annotations +import inspect +from copy import copy +from typing import Dict, Generic, Set, TypedDict, List, Type, Union, TypeVar, get_origin, get_args, Any from dlt.common.exceptions import MissingDependencyException -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.common.data_types import py_type_to_sc_type, TDataType -from dlt.common.typing import is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union +from dlt.common.schema import DataValidationError +from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns +from dlt.common.data_types import py_type_to_sc_type +from dlt.common.typing import TDataItem, TDataItems, extract_union_types, is_optional_type, extract_inner_type, is_list_generic_type, is_dict_generic_type, is_union try: - from pydantic import BaseModel, Field, Json + from pydantic import BaseModel, ValidationError, Json, create_model except ImportError: - raise MissingDependencyException("DLT pydantic Helpers", ["pydantic"], "DLT Helpers for for pydantic.") + raise MissingDependencyException("dlt Pydantic helpers", ["pydantic"], "Both Pydantic 1.x and 2.x are supported") + +_PYDANTIC_2 = False +try: + from pydantic import PydanticDeprecatedSince20 + _PYDANTIC_2 = True + # hide deprecation warning + import warnings + warnings.simplefilter("ignore", category=PydanticDeprecatedSince20) +except ImportError: + pass + +_TPydanticModel = TypeVar("_TPydanticModel", bound=BaseModel) + + +class ListModel(BaseModel, Generic[_TPydanticModel]): + items: List[_TPydanticModel] -def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], skip_complex_types: bool = False) -> TTableSchemaColumns: +class DltConfig(TypedDict, total=False): + """dlt configuration that can be attached to Pydantic model + + Example below removes `nested` field from the resulting dlt schema. + >>> class ItemModel(BaseModel): + >>> b: bool + >>> nested: Dict[str, Any] + >>> dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + """ + skip_complex_types: bool + """If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from dlt schema generated from the model""" + + +def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]]) -> TTableSchemaColumns: """Convert a pydantic model to a table schema columns dict + See also DltConfig for more control over how the schema is created + Args: model: The pydantic model to convert. Can be a class or an instance. 
- skip_complex_types: If True, columns of complex types (`dict`, `list`, `BaseModel`) will be excluded from the result. + Returns: TTableSchemaColumns: table schema columns dict """ + skip_complex_types = False + if hasattr(model, "dlt_config"): + skip_complex_types = model.dlt_config.get("skip_complex_types", False) + result: TTableSchemaColumns = {} - fields = model.__fields__ - for field_name, field in fields.items(): + for field_name, field in model.__fields__.items(): # type: ignore[union-attr] annotation = field.annotation if inner_annotation := getattr(annotation, 'inner_type', None): # This applies to pydantic.Json fields, the inner type is the type after json parsing @@ -49,7 +87,12 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], s inner_type = dict name = field.alias or field_name - data_type = py_type_to_sc_type(inner_type) + try: + data_type = py_type_to_sc_type(inner_type) + except TypeError: + # try to coerce unknown type to text + data_type = "text" + if data_type == 'complex' and skip_complex_types: continue @@ -60,3 +103,195 @@ def pydantic_to_table_schema_columns(model: Union[BaseModel, Type[BaseModel]], s } return result + + +def column_mode_to_extra(column_mode: TSchemaEvolutionMode) -> str: + extra = "forbid" + if column_mode == "evolve": + extra = "allow" + elif column_mode == "discard_value": + extra = "ignore" + return extra + + +def extra_to_column_mode(extra: str) -> TSchemaEvolutionMode: + if extra == "forbid": + return "freeze" + if extra == "allow": + return "evolve" + return "discard_value" + + +def get_extra_from_model(model: Type[BaseModel]) -> str: + default_extra = "ignore" + if _PYDANTIC_2: + default_extra = model.model_config.get("extra", default_extra) + else: + default_extra = model.Config.extra or default_extra # type: ignore[attr-defined] + return default_extra + + +def apply_schema_contract_to_model( + model: Type[_TPydanticModel], + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode = "freeze" +) -> Type[_TPydanticModel]: + """Configures or re-creates `model` so it behaves according to `column_mode` and `data_mode` settings. + + `column_mode` sets the model behavior when unknown field is found. + `data_mode` sets model behavior when known field does not validate. currently `evolve` and `freeze` are supported here. + + `discard_row` is implemented in `validate_item`. + """ + if data_mode == "evolve": + # create a lenient model that accepts any data + model = create_model(model.__name__ + "Any", **{n:(Any, None) for n in model.__fields__}) # type: ignore[call-overload, attr-defined] + elif data_mode == "discard_value": + raise NotImplementedError("data_mode is discard_value. Cannot discard defined fields with validation errors using Pydantic models.") + + extra = column_mode_to_extra(column_mode) + + if extra == get_extra_from_model(model): + # no need to change the model + return model + + if _PYDANTIC_2: + config = copy(model.model_config) + config["extra"] = extra # type: ignore[typeddict-item] + else: + config = copy(model.Config) # type: ignore[attr-defined] + config.extra = extra # type: ignore[attr-defined] + + _child_models: Dict[int, Type[BaseModel]] = {} + + def _process_annotation(t_: Type[Any]) -> Type[Any]: + """Recursively recreates models with applied schema contract """ + if is_list_generic_type(t_): + l_t: Type[Any] = get_args(t_)[0] + try: + return get_origin(t_)[_process_annotation(l_t)] # type: ignore[no-any-return] + except TypeError: + # this is Python3.8 fallback. 
it does not support indexers on types + return List[_process_annotation(l_t)] # type: ignore + elif is_dict_generic_type(t_): + k_t: Type[Any] + v_t: Type[Any] + k_t, v_t = get_args(t_) + try: + return get_origin(t_)[k_t, _process_annotation(v_t)] # type: ignore[no-any-return] + except TypeError: + # this is Python3.8 fallback. it does not support indexers on types + return Dict[k_t, _process_annotation(v_t)] # type: ignore + elif is_union(t_): + u_t_s = tuple(_process_annotation(u_t) for u_t in extract_union_types(t_)) + return Union[u_t_s] # type: ignore[return-value] + elif inspect.isclass(t_) and issubclass(t_, BaseModel): + # types must be same before and after processing + if id(t_) in _child_models: + return _child_models[id(t_)] + else: + _child_models[id(t_)] = child_model = apply_schema_contract_to_model(t_, column_mode, data_mode) + return child_model + return t_ + + new_model: Type[_TPydanticModel] = create_model( # type: ignore[call-overload] + model.__name__ + "Extra" + extra.title(), + __config__ = config, + **{n:(_process_annotation(f.annotation), f) for n, f in model.__fields__.items()} # type: ignore[attr-defined] + ) + # pass dlt config along + dlt_config = getattr(model, "dlt_config", None) + if dlt_config: + new_model.dlt_config = dlt_config # type: ignore[attr-defined] + return new_model + + +def create_list_model(model: Type[_TPydanticModel], data_mode: TSchemaEvolutionMode = "freeze") -> Type[ListModel[_TPydanticModel]]: + """Creates a model from `model` for validating list of items in batch according to `data_mode` + + Currently only freeze is supported. See comments in the code + """ + # TODO: use LenientList to create list model that automatically discards invalid items + # https://github.com/pydantic/pydantic/issues/2274 and https://gist.github.com/dmontagu/7f0cef76e5e0e04198dd608ad7219573 + return create_model( + "List" + __name__, + items=(List[model], ...) # type: ignore[return-value,valid-type] + ) + + +def validate_items( + table_name: str, + list_model: Type[ListModel[_TPydanticModel]], + items: List[TDataItem], + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode +) -> List[_TPydanticModel]: + """Validates list of `item` with `list_model` and returns parsed Pydantic models + + `list_model` should be created with `create_list_model` and have `items` field which this function returns. 
+ """ + try: + return list_model(items=items).items + except ValidationError as e: + deleted: Set[int] = set() + for err in e.errors(): + # TODO: we can get rid of most of the code if we use LenientList as explained above + if len(err["loc"]) >= 2: + err_idx = int(err["loc"][1]) + if err_idx in deleted: + # already dropped + continue + err_item = items[err_idx - len(deleted)] + else: + # top level error which means misalignment of list model and items + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, items) from e + # raise on freeze + if err["type"] == 'extra_forbidden': + if column_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", list_model, {"columns": "freeze"}, err_item) from e + elif column_mode == "discard_row": + # pop at the right index + items.pop(err_idx - len(deleted)) + # store original index so we do not pop again + deleted.add(err_idx) + else: + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + else: + if data_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", list_model, {"data_type": "freeze"}, err_item) from e + elif data_mode == "discard_row": + items.pop(err_idx - len(deleted)) + deleted.add(err_idx) + else: + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + + # validate again with error items removed + return validate_items(table_name, list_model, items, column_mode, data_mode) + + +def validate_item( + table_name: str, + model: Type[_TPydanticModel], + item: TDataItems, + column_mode: TSchemaEvolutionMode, + data_mode: TSchemaEvolutionMode +) -> _TPydanticModel: + """Validates `item` against model `model` and returns an instance of it""" + try: + return model.parse_obj(item) + except ValidationError as e: + for err in e.errors(): + # raise on freeze + if err["type"] == 'extra_forbidden': + if column_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "columns", "freeze", model, {"columns": "freeze"}, item) from e + elif column_mode == "discard_row": + return None + raise NotImplementedError(f"{column_mode} column mode not implemented for Pydantic validation") + else: + if data_mode == "freeze": + raise DataValidationError(None, table_name, str(err["loc"]), "data_type", "freeze", model, {"data_type": "freeze"}, item) from e + elif data_mode == "discard_row": + return None + raise NotImplementedError(f"{data_mode} data mode not implemented for Pydantic validation") + raise AssertionError("unreachable") \ No newline at end of file diff --git a/dlt/common/normalizers/json/__init__.py b/dlt/common/normalizers/json/__init__.py index e1c5c3b846..ab133b36c9 100644 --- a/dlt/common/normalizers/json/__init__.py +++ b/dlt/common/normalizers/json/__init__.py @@ -1,5 +1,5 @@ import abc -from typing import Any, Generic, Type, Iterator, Tuple, Protocol, TYPE_CHECKING, TypeVar +from typing import Any, Generic, Type, Generator, Tuple, Protocol, TYPE_CHECKING, TypeVar from dlt.common.typing import DictStrAny, TDataItem, StrAny if TYPE_CHECKING: @@ -10,7 +10,7 @@ # type definitions for json normalization function # iterator of form ((table_name, parent_table), dict) must be returned from normalization function -TNormalizedRowIterator = Iterator[Tuple[Tuple[str, str], StrAny]] +TNormalizedRowIterator = Generator[Tuple[Tuple[str, str], StrAny], bool, None] # type var for data item normalizer 
config TNormalizerConfig = TypeVar("TNormalizerConfig", bound=Any) diff --git a/dlt/common/normalizers/json/relational.py b/dlt/common/normalizers/json/relational.py index 98b34e298d..c9ce5a9d25 100644 --- a/dlt/common/normalizers/json/relational.py +++ b/dlt/common/normalizers/json/relational.py @@ -48,6 +48,8 @@ class DataItemNormalizer(DataItemNormalizerBase[RelationalNormalizerConfig]): _skip_primary_key: Dict[str, bool] def __init__(self, schema: Schema) -> None: + """This item normalizer works with nested dictionaries. It flattens dictionaries and descends into lists. + It yields row dictionaries at each nesting level.""" self.schema = schema self._reset() @@ -230,7 +232,9 @@ def _normalize_row( extend.update(self._get_propagated_values(table, flattened_row, _r_lvl )) # yield parent table first - yield (table, schema.naming.shorten_fragments(*parent_path)), flattened_row + should_descend = yield (table, schema.naming.shorten_fragments(*parent_path)), flattened_row + if should_descend is False: + return # normalize and yield lists for list_path, list_content in lists.items(): @@ -264,7 +268,7 @@ def extend_schema(self) -> None: def extend_table(self, table_name: str) -> None: # if the table has a merge w_d, add propagation info to normalizer table = self.schema.tables.get(table_name) - if not table.get("parent") and table["write_disposition"] == "merge": + if not table.get("parent") and table.get("write_disposition") == "merge": DataItemNormalizer.update_normalizer_config(self.schema, {"propagation": { "tables": { table_name: { diff --git a/dlt/common/pipeline.py b/dlt/common/pipeline.py index aeb0bdc68a..973abb2451 100644 --- a/dlt/common/pipeline.py +++ b/dlt/common/pipeline.py @@ -14,10 +14,10 @@ from dlt.common.configuration.specs.config_section_context import ConfigSectionContext from dlt.common.configuration.paths import get_dlt_data_dir from dlt.common.configuration.specs import RunConfiguration -from dlt.common.destination import DestinationReference, TDestinationReferenceArg +from dlt.common.destination import Destination, TDestinationReferenceArg, TDestination from dlt.common.exceptions import DestinationHasFailedJobs, PipelineStateNotAvailable, ResourceNameNotAvailable, SourceSectionNotAvailable from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.source import get_current_pipe_name from dlt.common.storages.load_storage import LoadPackageInfo from dlt.common.typing import DictStrAny, REPattern @@ -177,7 +177,7 @@ class SupportsPipeline(Protocol): """Name of the pipeline""" default_schema_name: str """Name of the default schema""" - destination: DestinationReference + destination: TDestination """The destination reference which is ModuleType. `destination.__name__` returns the name string""" dataset_name: str """Name of the dataset to which pipeline will be loaded to""" @@ -212,7 +212,8 @@ def run( columns: Sequence[TColumnSchema] = None, primary_key: TColumnNames = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None + loader_file_format: TLoaderFileFormat = None, + schema_contract: TSchemaContract = None, ) -> LoadInfo: ... 
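A minimal usage sketch of the `schema_contract` argument added to `run()` above. The pipeline name, destination and sample rows are illustrative only, and the concrete `Pipeline.run` is assumed to mirror the extended `SupportsPipeline.run` protocol:

import dlt

pipeline = dlt.pipeline(pipeline_name="contract_demo", destination="duckdb")
# first load uses the default contract ("evolve" for tables, columns and data_type)
pipeline.run([{"id": 1, "name": "a"}], table_name="events")
# subsequent loads can freeze columns: the unknown "age" column below is then
# rejected with DataValidationError instead of evolving the schema
pipeline.run(
    [{"id": 2, "name": "b", "age": 3}],
    table_name="events",
    schema_contract={"columns": "freeze"},
)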
diff --git a/dlt/common/schema/__init__.py b/dlt/common/schema/__init__.py index 1a3b4db223..ac320bef0a 100644 --- a/dlt/common/schema/__init__.py +++ b/dlt/common/schema/__init__.py @@ -1,9 +1,11 @@ -from dlt.common.schema.typing import TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase +from dlt.common.schema.typing import TSchemaContractDict, TSchemaUpdate, TSchemaTables, TTableSchema, TStoredSchema, TTableSchemaColumns, TColumnHint, TColumnSchema, TColumnSchemaBase from dlt.common.schema.typing import COLUMN_HINTS -from dlt.common.schema.schema import Schema +from dlt.common.schema.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE +from dlt.common.schema.exceptions import DataValidationError from dlt.common.schema.utils import verify_schema_hash __all__ = [ "TSchemaUpdate", "TSchemaTables", "TTableSchema", "TStoredSchema", "TTableSchemaColumns", "TColumnHint", - "TColumnSchema", "TColumnSchemaBase", "COLUMN_HINTS", "Schema", "verify_schema_hash" + "TColumnSchema", "TColumnSchemaBase", "COLUMN_HINTS", "Schema", "verify_schema_hash", "TSchemaContractDict", + "DEFAULT_SCHEMA_CONTRACT_MODE", "DataValidationError" ] diff --git a/dlt/common/schema/exceptions.py b/dlt/common/schema/exceptions.py index 5f638a111d..96df6b7418 100644 --- a/dlt/common/schema/exceptions.py +++ b/dlt/common/schema/exceptions.py @@ -2,6 +2,7 @@ from dlt.common.exceptions import DltException from dlt.common.data_types import TDataType +from dlt.common.schema.typing import TSchemaContractDict, TSchemaContractEntities, TSchemaEvolutionMode class SchemaException(DltException): @@ -16,11 +17,6 @@ def __init__(self, name: str) -> None: super().__init__(f"{name} is an invalid schema/source name. The source or schema name must be a valid Python identifier ie. a snake case function name and have maximum {self.MAXIMUM_SCHEMA_NAME_LENGTH} characters. Ideally should contain only small letters, numbers and underscores.") -# class InvalidDatasetName(ValueError, SchemaException): -# def __init__(self, name: str, normalized_name: str) -> None: -# self.name = name -# super().__init__(f"{name} is an invalid dataset name. The dataset name must conform to wide range of destinations and ideally should contain only small letters, numbers and underscores. Try {normalized_name} instead as suggested by current naming module.") - class InvalidDatasetName(ValueError, SchemaException): def __init__(self, destination_name: str) -> None: self.destination_name = destination_name @@ -70,7 +66,47 @@ def __init__(self, schema_name: str, init_engine: int, from_engine: int, to_engi self.to_engine = to_engine super().__init__(f"No engine upgrade path in schema {schema_name} from {init_engine} to {to_engine}, stopped at {from_engine}") + +class DataValidationError(SchemaException): + def __init__( + self, + schema_name: str, + table_name: str, + column_name: str, + contract_entity: TSchemaContractEntities, + contract_mode: TSchemaEvolutionMode, + table_schema: Any, + schema_contract: TSchemaContractDict, + data_item: Any = None, + extended_info: str = None + ) -> None: + """Raised when `data_item` violates `contract_mode` on a `contract_entity` as defined by `table_schema` + + Schema, table and column names are given as a context and full `schema_contract` and causing `data_item` as an evidence. + """ + msg = "" + if schema_name: + msg = f"Schema: {schema_name} " + msg += f"Table: {table_name} " + if column_name: + msg += f"Column: {column_name}" + msg = "In " + msg + f" . 
Contract on {contract_entity} with mode {contract_mode} is violated. " + (extended_info or "") + super().__init__(msg) + self.schema_name = schema_name + self.table_name = table_name + self.column_name = column_name + + # violated contract + self.contract_entity = contract_entity + self.contract_mode = contract_mode + + # some evidence + self.table_schema = table_schema + self.schema_contract = schema_contract + self.data_item = data_item + + class UnknownTableException(SchemaException): def __init__(self, table_name: str) -> None: self.table_name = table_name - super().__init__(f"Trying to access unknown table {table_name}.") \ No newline at end of file + super().__init__(f"Trying to access unknown table {table_name}.") diff --git a/dlt/common/schema/schema.py b/dlt/common/schema/schema.py index 1878cf63d6..f1c798b7c5 100644 --- a/dlt/common/schema/schema.py +++ b/dlt/common/schema/schema.py @@ -1,6 +1,6 @@ import yaml from copy import copy, deepcopy -from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast +from typing import ClassVar, Dict, List, Mapping, Optional, Sequence, Tuple, Any, cast, Literal from dlt.common import json from dlt.common.utils import extend_list_deduplicated @@ -10,13 +10,20 @@ from dlt.common.normalizers.json import DataItemNormalizer, TNormalizedRowIterator from dlt.common.schema import utils from dlt.common.data_types import py_type_to_sc_type, coerce_value, TDataType -from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaSettings, TSimpleRegex, TStoredSchema, - TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections) +from dlt.common.schema.typing import (COLUMN_HINTS, DLT_NAME_PREFIX, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, VERSION_TABLE_NAME, STATE_TABLE_NAME, TPartialTableSchema, TSchemaContractEntities, TSchemaEvolutionMode, TSchemaSettings, TSimpleRegex, TStoredSchema, + TSchemaTables, TTableSchema, TTableSchemaColumns, TColumnSchema, TColumnProp, TColumnHint, TTypeDetections, TSchemaContractDict, TSchemaContract) from dlt.common.schema.exceptions import (CannotCoerceColumnException, CannotCoerceNullException, InvalidSchemaName, ParentTableNotFoundException, SchemaCorruptedException) from dlt.common.validation import validate_dict +from dlt.common.schema.exceptions import DataValidationError +DEFAULT_SCHEMA_CONTRACT_MODE: TSchemaContractDict = { + "tables": "evolve", + "columns": "evolve", + "data_type": "evolve" +} + class Schema: ENGINE_VERSION: ClassVar[int] = SCHEMA_ENGINE_VERSION @@ -61,7 +68,7 @@ def __init__(self, name: str, normalizers: TNormalizersConfig = None) -> None: self._reset_schema(name, normalizers) @classmethod - def from_dict(cls, d: DictStrAny) -> "Schema": + def from_dict(cls, d: DictStrAny, bump_version: bool = True) -> "Schema": # upgrade engine if needed stored_schema = utils.migrate_schema(d, d["engine_version"], cls.ENGINE_VERSION) @@ -71,7 +78,8 @@ def from_dict(cls, d: DictStrAny) -> "Schema": stored_schema = utils.apply_defaults(stored_schema) # bump version if modified - utils.bump_version_if_modified(stored_schema) + if bump_version: + utils.bump_version_if_modified(stored_schema) return cls.from_stored_schema(stored_schema) @classmethod @@ -83,9 +91,10 @@ def from_stored_schema(cls, stored_schema: TStoredSchema) -> "Schema": def replace_schema_content(self, schema: "Schema") -> None: self._reset_schema(schema.name, 
schema._normalizers_config) - self._from_stored_schema(schema.to_dict()) + # do not bump version so hash from `schema` is preserved + self._from_stored_schema(schema.to_dict(bump_version=False)) - def to_dict(self, remove_defaults: bool = False) -> TStoredSchema: + def to_dict(self, remove_defaults: bool = False, bump_version: bool = True) -> TStoredSchema: stored_schema: TStoredSchema = { "version": self._stored_version, "version_hash": self._stored_version_hash, @@ -102,7 +111,8 @@ def to_dict(self, remove_defaults: bool = False) -> TStoredSchema: stored_schema["description"] = self._schema_description # bump version if modified - utils.bump_version_if_modified(stored_schema) + if bump_version: + utils.bump_version_if_modified(stored_schema) # remove defaults after bumping version if remove_defaults: utils.remove_defaults(stored_schema) @@ -190,8 +200,120 @@ def coerce_row(self, table_name: str, parent_table: str, row: StrAny) -> Tuple[D return new_row, updated_table_partial + def apply_schema_contract( + self, + schema_contract: TSchemaContractDict, + partial_table: TPartialTableSchema, + data_item: TDataItem = None, + raise_on_freeze: bool = True + ) -> Tuple[TPartialTableSchema, List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]]]: + """ + Checks if `schema_contract` allows for the `partial_table` to update the schema. It applies the contract dropping + the affected columns or the whole `partial_table`. It generates and returns a set of filters that should be applied to incoming data in order to modify it + so it conforms to the contract. `data_item` is provided only as evidence in case DataValidationError is raised. + + Example `schema_contract`: + { + "tables": "freeze", + "columns": "evolve", + "data_type": "discard_row" + } + + Settings for table affects new tables, settings for column affects new columns and settings for data_type affects new variant columns. Each setting can be set to one of: + * evolve: allow all changes + * freeze: allow no change and fail the load + * discard_row: allow no schema change and filter out the row + * discard_value: allow no schema change and filter out the value but load the rest of the row + + Returns a tuple where a first element is modified partial table and the second is a list of filters. The modified partial may be None in case the + whole table is not allowed. + Each filter is a tuple of (table|columns, entity name, freeze | discard_row | discard_value). + Note: by default `freeze` immediately raises DataValidationError which is convenient in most use cases + + """ + # default settings allow all evolutions, skip all else + if schema_contract == DEFAULT_SCHEMA_CONTRACT_MODE: + return partial_table, [] + + assert partial_table + table_name = partial_table["name"] + existing_table: TTableSchema = self._schema_tables.get(table_name, None) + + # table is new when not yet exist or + is_new_table = not existing_table or self.is_new_table(table_name) + # check case where we have a new table + if is_new_table and schema_contract["tables"] != "evolve": + if raise_on_freeze and schema_contract["tables"] == "freeze": + raise DataValidationError( + self.name, table_name, None, "tables", "freeze", None, schema_contract, data_item, f"Trying to add table {table_name} but new tables are frozen." 
+ ) + # filter tables with name below + return None, [("tables", table_name, schema_contract["tables"])] + + column_mode, data_mode = schema_contract["columns"], schema_contract["data_type"] + # allow to add new columns when table is new or if columns are allowed to evolve once + if is_new_table or existing_table.get("x-normalizer", {}).get("evolve-columns-once", False): # type: ignore[attr-defined] + column_mode = "evolve" + + # check if we should filter any columns, partial table below contains only new columns + filters: List[Tuple[TSchemaContractEntities, str, TSchemaEvolutionMode]] = [] + for column_name, column in list(partial_table["columns"].items()): + # dlt cols may always be added + if column_name.startswith(self._dlt_tables_prefix): + continue + is_variant = column.get("variant", False) + # new column and contract prohibits that + if column_mode != "evolve" and not is_variant: + if raise_on_freeze and column_mode == "freeze": + raise DataValidationError( + self.name, table_name, column_name, "columns", "freeze", existing_table, schema_contract, data_item, f"Trying to add column {column_name} to table {table_name} but columns are frozen." + ) + # filter column with name below + filters.append(("columns", column_name, column_mode)) + # pop the column + partial_table["columns"].pop(column_name) + + # variant (data type evolution) and contract prohibits that + if data_mode != "evolve" and is_variant: + if raise_on_freeze and data_mode == "freeze": + raise DataValidationError( + self.name, table_name, column_name, "data_type", "freeze", existing_table, schema_contract, data_item, f"Trying to create new variant column {column_name} to table {table_name} but data_types are frozen." + ) + # filter column with name below + filters.append(("columns", column_name, data_mode)) + # pop the column + partial_table["columns"].pop(column_name) + + return partial_table, filters + + @staticmethod + def expand_schema_contract_settings(settings: TSchemaContract, default: TSchemaContractDict = None) -> TSchemaContractDict: + """Expand partial or shorthand settings into full settings dictionary using `default` for unset entities""" + if isinstance(settings, str): + settings = TSchemaContractDict(tables=settings, columns=settings, data_type=settings) + return cast(TSchemaContractDict, {**(default or DEFAULT_SCHEMA_CONTRACT_MODE), **(settings or {})}) + + def resolve_contract_settings_for_table(self, table_name: str, new_table_schema: TTableSchema = None) -> TSchemaContractDict: + """Resolve the exact applicable schema contract settings for the table `table_name`. 
`new_table_schema` is added to the tree during the resolution.""" + + settings: TSchemaContract = {} + if not table_name.startswith(self._dlt_tables_prefix): + if new_table_schema: + tables = copy(self._schema_tables) + tables[table_name] = new_table_schema + else: + tables = self._schema_tables + # find root table + try: + table = utils.get_top_level_table(tables, table_name) + settings = table["schema_contract"] + except KeyError: + settings = self._settings.get("schema_contract", {}) + + # expand settings, empty settings will expand into default settings + return Schema.expand_schema_contract_settings(settings) + def update_table(self, partial_table: TPartialTableSchema) -> TPartialTableSchema: - """Update table in this schema""" table_name = partial_table["name"] parent_table_name = partial_table.get("parent") # check if parent table present @@ -218,15 +340,14 @@ def update_schema(self, schema: "Schema") -> None: # update all tables for table in schema.tables.values(): self.update_table(table) - # update normalizer config nondestructively - self.data_item_normalizer.update_normalizer_config(self, self.data_item_normalizer.get_normalizer_config(schema)) - self.update_normalizers() + # pass normalizer config + self._configure_normalizers(schema._normalizers_config) # update and compile settings self._settings = deepcopy(schema.settings) self._compile_settings() - def bump_version(self) -> Tuple[int, str, List[str]]: + def bump_version(self) -> Tuple[int, str]: """Computes schema hash in order to check if schema content was modified. In such case the schema ``stored_version`` and ``stored_version_hash`` are updated. Should not be used in production code. The method ``to_dict`` will generate TStoredSchema with correct value, only once before persisting schema to storage. @@ -234,9 +355,8 @@ def bump_version(self) -> Tuple[int, str, List[str]]: Returns: Tuple[int, str]: Current (``stored_version``, ``stored_version_hash``) tuple """ - version = utils.bump_version_if_modified(self.to_dict()) - self._stored_version, self._stored_version_hash, self._stored_ancestors = version - return version + self._stored_version, self._stored_version_hash, _, _ = utils.bump_version_if_modified(self.to_dict(bump_version=False)) + return self._stored_version, self._stored_version_hash def filter_row_with_hint(self, table_name: str, hint_type: TColumnHint, row: StrAny) -> StrAny: rv_row: DictStrAny = {} @@ -329,6 +449,10 @@ def dlt_tables(self) -> List[TTableSchema]: def get_preferred_type(self, col_name: str) -> Optional[TDataType]: return next((m[1] for m in self._compiled_preferred_types if m[0].search(col_name)), None) + def is_new_table(self, table_name: str) -> bool: + """Returns true if this table does not exist OR is incomplete (has only incomplete columns) and therefore new""" + return (table_name not in self.tables) or (not [c for c in self.tables[table_name]["columns"].values() if utils.is_complete_column(c)]) + @property def version(self) -> int: """Version of the schema content that takes into account changes from the time of schema loading/creation. 
@@ -401,6 +525,12 @@ def update_normalizers(self) -> None: normalizers["json"] = normalizers["json"] or self._normalizers_config["json"] self._configure_normalizers(normalizers) + def set_schema_contract(self, settings: TSchemaContract) -> None: + if not settings: + self._settings.pop("schema_contract", None) + else: + self._settings["schema_contract"] = settings + def add_type_detection(self, detection: TTypeDetections) -> None: """Add type auto detection to the schema.""" if detection not in self.settings["detections"]: @@ -525,7 +655,7 @@ def _configure_normalizers(self, normalizers: TNormalizersConfig) -> None: # name normalization functions self.naming = naming_module - self._dlt_tables_prefix = self.naming.normalize_table_identifier("_dlt") + self._dlt_tables_prefix = self.naming.normalize_table_identifier(DLT_NAME_PREFIX) self.version_table_name = self.naming.normalize_table_identifier(VERSION_TABLE_NAME) self.loads_table_name = self.naming.normalize_table_identifier(LOADS_TABLE_NAME) self.state_table_name = self.naming.normalize_table_identifier(STATE_TABLE_NAME) diff --git a/dlt/common/schema/typing.py b/dlt/common/schema/typing.py index 7d53f4e8a8..e986d951e5 100644 --- a/dlt/common/schema/typing.py +++ b/dlt/common/schema/typing.py @@ -17,6 +17,7 @@ VERSION_TABLE_NAME = "_dlt_version" LOADS_TABLE_NAME = "_dlt_loads" STATE_TABLE_NAME = "_dlt_pipeline_state" +DLT_NAME_PREFIX = "_dlt" TColumnProp = Literal["name", "data_type", "nullable", "partition", "cluster", "primary_key", "foreign_key", "sort", "unique", "merge_key", "root_key"] """Known properties and hints of the column""" @@ -71,17 +72,32 @@ class TColumnSchema(TColumnSchemaBase, total=False): TColumnName = NewType("TColumnName", str) SIMPLE_REGEX_PREFIX = "re:" +TSchemaEvolutionMode = Literal["evolve", "discard_value", "freeze", "discard_row"] +TSchemaContractEntities = Literal["tables", "columns", "data_type"] + +class TSchemaContractDict(TypedDict, total=False): + """TypedDict defining the schema update settings""" + tables: Optional[TSchemaEvolutionMode] + columns: Optional[TSchemaEvolutionMode] + data_type: Optional[TSchemaEvolutionMode] + +TSchemaContract = Union[TSchemaEvolutionMode, TSchemaContractDict] class TRowFilters(TypedDict, total=True): excludes: Optional[List[TSimpleRegex]] includes: Optional[List[TSimpleRegex]] +class NormalizerInfo(TypedDict, total=True): + new_table: bool + +# TypedDict that defines properties of a table class TTableSchema(TypedDict, total=False): """TypedDict that defines properties of a table""" name: Optional[str] description: Optional[str] write_disposition: Optional[TWriteDisposition] + schema_contract: Optional[TSchemaContract] table_sealed: Optional[bool] parent: Optional[str] filters: Optional[TRowFilters] @@ -89,16 +105,15 @@ class TTableSchema(TypedDict, total=False): resource: Optional[str] table_format: Optional[TTableFormat] - class TPartialTableSchema(TTableSchema): pass - TSchemaTables = Dict[str, TTableSchema] TSchemaUpdate = Dict[str, List[TPartialTableSchema]] + class TSchemaSettings(TypedDict, total=False): - schema_sealed: Optional[bool] + schema_contract: Optional[TSchemaContract] detections: Optional[List[TTypeDetections]] default_hints: Optional[Dict[TColumnHint, List[TSimpleRegex]]] preferred_types: Optional[Dict[TSimpleRegex, TDataType]] @@ -116,3 +131,4 @@ class TStoredSchema(TypedDict, total=False): settings: Optional[TSchemaSettings] tables: TSchemaTables normalizers: TNormalizersConfig + diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py 
index 75b7d0dd31..41b0010242 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -16,7 +16,7 @@ from dlt.common.schema import detections from dlt.common.schema.typing import (COLUMN_HINTS, SCHEMA_ENGINE_VERSION, LOADS_TABLE_NAME, SIMPLE_REGEX_PREFIX, VERSION_TABLE_NAME, TColumnName, TPartialTableSchema, TSchemaTables, TSchemaUpdate, TSimpleRegex, TStoredSchema, TTableSchema, TTableSchemaColumns, TColumnSchemaBase, TColumnSchema, TColumnProp, TTableFormat, - TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition) + TColumnHint, TTypeDetectionFunc, TTypeDetections, TWriteDisposition, TSchemaContract, TSchemaContractDict) from dlt.common.schema.exceptions import (CannotCoerceColumnException, ParentTableNotFoundException, SchemaEngineNoUpgradePathException, SchemaException, TablePropertiesConflictException, InvalidSchemaName, UnknownTableException) @@ -134,8 +134,8 @@ def add_column_defaults(column: TColumnSchemaBase) -> TColumnSchema: # return copy(column) # type: ignore -def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, List[str]]: - # if any change to schema document is detected then bump version and write new hash +def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, str, List[str]]: + """Bumps the `stored_schema` version and version hash if content modified, returns (new version, new hash, old hash) tuple""" hash_ = generate_version_hash(stored_schema) previous_hash = stored_schema.get("version_hash") if not previous_hash: @@ -149,7 +149,7 @@ def bump_version_if_modified(stored_schema: TStoredSchema) -> Tuple[int, str, Li stored_schema["ancestors"] = stored_schema["ancestors"][:10] stored_schema["version_hash"] = hash_ - return stored_schema["version"], hash_, stored_schema["ancestors"] + return stored_schema["version"], hash_, previous_hash, stored_schema["ancestors"] def generate_version_hash(stored_schema: TStoredSchema) -> str: @@ -354,8 +354,17 @@ def migrate_filters(group: str, filters: List[str]) -> None: schema_dict["tables"][LOADS_TABLE_NAME] = load_table() from_engine = 6 if from_engine == 6 and to_engine > 6: - schema_dict["ancestors"] = [] + # migrate from sealed properties to schema evolution settings + schema_dict["settings"].pop("schema_sealed", None) + schema_dict["settings"]["schema_contract"] = {} + for table in schema_dict["tables"].values(): + table.pop("table_sealed", None) + if not table.get("parent"): + table["schema_contract"] = {} from_engine = 7 + if from_engine == 7 and to_engine > 7: + schema_dict["ancestors"] = [] + from_engine = 8 schema_dict["engine_version"] = from_engine if from_engine != to_engine: @@ -442,7 +451,6 @@ def diff_tables(tab_a: TTableSchema, tab_b: TPartialTableSchema) -> TPartialTabl continue existing_v = tab_a.get(k) if existing_v != v: - # print(f"{k} ==? {v} ==? 
{existing_v}") partial_table[k] = v # type: ignore # this should not really happen @@ -665,6 +673,7 @@ def new_table( columns: Sequence[TColumnSchema] = None, validate_schema: bool = False, resource: str = None, + schema_contract: TSchemaContract = None, table_format: TTableFormat = None ) -> TTableSchema: @@ -676,10 +685,13 @@ def new_table( table["parent"] = parent_table_name assert write_disposition is None assert resource is None + assert schema_contract is None else: # set write disposition only for root tables table["write_disposition"] = write_disposition or DEFAULT_WRITE_DISPOSITION table["resource"] = resource or table_name + if schema_contract is not None: + table["schema_contract"] = schema_contract if table_format: table["table_format"] = table_format if validate_schema: @@ -708,7 +720,6 @@ def new_column(column_name: str, data_type: TDataType = None, nullable: bool = T return column - def standard_hints() -> Dict[TColumnHint, List[TSimpleRegex]]: return None diff --git a/dlt/common/storages/data_item_storage.py b/dlt/common/storages/data_item_storage.py index 8de95a6f60..6621f07e26 100644 --- a/dlt/common/storages/data_item_storage.py +++ b/dlt/common/storages/data_item_storage.py @@ -24,10 +24,10 @@ def get_writer(self, load_id: str, schema_name: str, table_name: str) -> Buffere self.buffered_writers[writer_id] = writer return writer - def write_data_item(self, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: + def write_data_item(self, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> int: writer = self.get_writer(load_id, schema_name, table_name) # write item(s) - writer.write_data_item(item, columns) + return writer.write_data_item(item, columns) def write_empty_file(self, load_id: str, schema_name: str, table_name: str, columns: TTableSchemaColumns) -> None: writer = self.get_writer(load_id, schema_name, table_name) diff --git a/dlt/common/storages/file_storage.py b/dlt/common/storages/file_storage.py index 006ff4843d..3c5a391200 100644 --- a/dlt/common/storages/file_storage.py +++ b/dlt/common/storages/file_storage.py @@ -125,7 +125,7 @@ def has_folder(self, relative_path: str) -> bool: return os.path.isdir(self.make_full_path(relative_path)) def list_folder_files(self, relative_path: str, to_root: bool = True) -> List[str]: - """List all files in ``relative_path`` folder + """List all files in `relative_path` folder Args: relative_path (str): A path to folder, relative to storage root diff --git a/dlt/common/storages/live_schema_storage.py b/dlt/common/storages/live_schema_storage.py index c482d5e7ea..79aeb22e61 100644 --- a/dlt/common/storages/live_schema_storage.py +++ b/dlt/common/storages/live_schema_storage.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, List from dlt.common.schema.schema import Schema from dlt.common.configuration.accessors import config @@ -18,7 +18,7 @@ def __getitem__(self, name: str) -> Schema: else: # return new schema instance schema = super().load_schema(name) - self._update_live_schema(schema) + self.update_live_schema(schema) return schema @@ -30,7 +30,7 @@ def load_schema(self, name: str) -> Schema: def save_schema(self, schema: Schema) -> str: rv = super().save_schema(schema) # update the live schema with schema being saved, if no live schema exist, create one to be available for a getter - self._update_live_schema(schema) + self.update_live_schema(schema) return rv def remove_schema(self, name: str) -> None: 
@@ -54,12 +54,18 @@ def commit_live_schema(self, name: str) -> Schema: self._save_schema(live_schema) return live_schema - def _update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: + def update_live_schema(self, schema: Schema, can_create_new: bool = True) -> None: + """Will update live schema content without writing to storage. Optionally allows to create a new live schema""" live_schema = self.live_schemas.get(schema.name) if live_schema: - # replace content without replacing instance - # print(f"live schema {live_schema} updated in place") - live_schema.replace_schema_content(schema) + if id(live_schema) != id(schema): + # replace content without replacing instance + # print(f"live schema {live_schema} updated in place") + live_schema.replace_schema_content(schema) elif can_create_new: # print(f"live schema {schema.name} created from schema") self.live_schemas[schema.name] = schema + + def list_schemas(self) -> List[str]: + names = list(set(super().list_schemas()) | set(self.live_schemas.keys())) + return names diff --git a/dlt/common/storages/load_storage.py b/dlt/common/storages/load_storage.py index d034ef239a..d8eee9b8d6 100644 --- a/dlt/common/storages/load_storage.py +++ b/dlt/common/storages/load_storage.py @@ -27,7 +27,7 @@ # folders to manage load jobs in a single load package TJobState = Literal["new_jobs", "failed_jobs", "started_jobs", "completed_jobs"] -WORKING_FOLDERS = set(get_args(TJobState)) +WORKING_FOLDERS: Set[TJobState] = set(get_args(TJobState)) TLoadPackageState = Literal["normalized", "loaded", "aborted"] @@ -193,7 +193,7 @@ def write_temp_job_file(self, load_id: str, table_name: str, table: TTableSchema def load_package_schema(self, load_id: str) -> Schema: # load schema from a load package to be processed - schema_path = join(self.get_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) + schema_path = join(self.get_normalized_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) return self._load_schema(schema_path) def load_temp_schema(self, load_id: str) -> Schema: @@ -211,14 +211,16 @@ def save_temp_schema_updates(self, load_id: str, schema_update: TSchemaTables) - json.dump(schema_update, f) def commit_temp_load_package(self, load_id: str) -> None: - self.storage.rename_tree(load_id, self.get_package_path(load_id)) + self.storage.rename_tree(load_id, self.get_normalized_package_path(load_id)) - def list_packages(self) -> Sequence[str]: + def list_normalized_packages(self) -> Sequence[str]: + """Lists all packages that are normalized and will be loaded or are currently loaded""" loads = self.storage.list_folder_dirs(LoadStorage.NORMALIZED_FOLDER, to_root=False) # start from the oldest packages return sorted(loads) def list_completed_packages(self) -> Sequence[str]: + """List packages that are completely loaded""" loads = self.storage.list_folder_dirs(LoadStorage.LOADED_FOLDER, to_root=False) # start from the oldest packages return sorted(loads) @@ -264,7 +266,7 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: # check if package is completed or in process package_created_at: DateTime = None package_state: TLoadPackageState = "normalized" - package_path = self.get_package_path(load_id) + package_path = self.get_normalized_package_path(load_id) applied_update: TSchemaTables = {} if not self.storage.has_folder(package_path): package_path = self.get_completed_package_path(load_id) @@ -291,7 +293,7 @@ def get_load_package_info(self, load_id: str) -> LoadPackageInfo: return LoadPackageInfo(load_id, 
self.storage.make_full_path(package_path), package_state, schema.name, applied_update, package_created_at, all_jobs) def begin_schema_update(self, load_id: str) -> Optional[TSchemaTables]: - package_path = self.get_package_path(load_id) + package_path = self.get_normalized_package_path(load_id) if not self.storage.has_folder(package_path): raise FileNotFoundError(package_path) schema_update_file = join(package_path, LoadStorage.SCHEMA_UPDATES_FILE_NAME) @@ -303,7 +305,7 @@ def begin_schema_update(self, load_id: str) -> Optional[TSchemaTables]: def commit_schema_update(self, load_id: str, applied_update: TSchemaTables) -> None: """Marks schema update as processed and stores the update that was applied at the destination""" - load_path = self.get_package_path(load_id) + load_path = self.get_normalized_package_path(load_id) schema_update_file = join(load_path, LoadStorage.SCHEMA_UPDATES_FILE_NAME) processed_schema_update_file = join(load_path, LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) # delete initial schema update @@ -344,7 +346,7 @@ def complete_job(self, load_id: str, file_name: str) -> str: return self._move_job(load_id, LoadStorage.STARTED_JOBS_FOLDER, LoadStorage.COMPLETED_JOBS_FOLDER, file_name) def complete_load_package(self, load_id: str, aborted: bool) -> None: - load_path = self.get_package_path(load_id) + load_path = self.get_normalized_package_path(load_id) has_failed_jobs = len(self.list_failed_jobs(load_id)) > 0 # delete completed jobs if self.config.delete_completed_jobs and not has_failed_jobs: @@ -367,7 +369,7 @@ def delete_completed_package(self, load_id: str) -> None: def wipe_normalized_packages(self) -> None: self.storage.delete_folder(self.NORMALIZED_FOLDER, recursively=True) - def get_package_path(self, load_id: str) -> str: + def get_normalized_package_path(self, load_id: str) -> str: return join(LoadStorage.NORMALIZED_FOLDER, load_id) def get_completed_package_path(self, load_id: str) -> str: @@ -378,7 +380,7 @@ def job_elapsed_time_seconds(self, file_path: str, now_ts: float = None) -> floa def _save_schema(self, schema: Schema, load_id: str) -> str: dump = json.dumps(schema.to_dict()) - schema_path = join(self.get_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) + schema_path = join(self.get_normalized_package_path(load_id), LoadStorage.SCHEMA_FILE_NAME) return self.storage.save(schema_path, dump) def _load_schema(self, schema_path: str) -> Schema: @@ -388,14 +390,14 @@ def _load_schema(self, schema_path: str) -> Schema: def _move_job(self, load_id: str, source_folder: TJobState, dest_folder: TJobState, file_name: str, new_file_name: str = None) -> str: # ensure we move file names, not paths assert file_name == FileStorage.get_file_name_from_file_path(file_name) - load_path = self.get_package_path(load_id) + load_path = self.get_normalized_package_path(load_id) dest_path = join(load_path, dest_folder, new_file_name or file_name) self.storage.atomic_rename(join(load_path, source_folder, file_name), dest_path) # print(f"{join(load_path, source_folder, file_name)} -> {dest_path}") return self.storage.make_full_path(dest_path) def _get_job_folder_path(self, load_id: str, folder: TJobState) -> str: - return join(self.get_package_path(load_id), folder) + return join(self.get_normalized_package_path(load_id), folder) def _get_job_file_path(self, load_id: str, folder: TJobState, file_name: str) -> str: return join(self._get_job_folder_path(load_id, folder), file_name) @@ -430,6 +432,15 @@ def build_job_file_name(self, table_name: str, file_id: str, retry_count: int 
= return fn + f".{format_spec.file_extension}" return fn + @staticmethod + def is_package_partially_loaded(package_info: LoadPackageInfo) -> bool: + """Checks if package is partially loaded - has jobs that are not new.""" + if package_info.state == "normalized": + pending_jobs: Sequence[TJobState] = ["new_jobs"] + else: + pending_jobs = ["completed_jobs", "failed_jobs"] + return sum(len(package_info.jobs[job_state]) for job_state in WORKING_FOLDERS if job_state not in pending_jobs) > 0 + @staticmethod def parse_job_file_name(file_name: str) -> ParsedLoadJobFileName: p = Path(file_name) diff --git a/dlt/common/typing.py b/dlt/common/typing.py index b2bd03f7e6..3b3a0d3353 100644 --- a/dlt/common/typing.py +++ b/dlt/common/typing.py @@ -68,41 +68,38 @@ def asstr(self, verbosity: int = 0) -> str: ... +def is_union_type(t: Type[Any]) -> bool: + return get_origin(t) is Union + def is_optional_type(t: Type[Any]) -> bool: return get_origin(t) is Union and type(None) in get_args(t) - def is_final_type(t: Type[Any]) -> bool: return get_origin(t) is Final - -def extract_optional_type(t: Type[Any]) -> Any: - return get_args(t)[0] - +def extract_union_types(t: Type[Any], no_none: bool = False) -> List[Any]: + if no_none: + return [arg for arg in get_args(t) if arg is not type(None)] # noqa: E721 + return list(get_args(t)) def is_literal_type(hint: Type[Any]) -> bool: return get_origin(hint) is Literal - def is_union(hint: Type[Any]) -> bool: return get_origin(hint) is Union - def is_newtype_type(t: Type[Any]) -> bool: return hasattr(t, "__supertype__") - def is_typeddict(t: Type[Any]) -> bool: return isinstance(t, _TypedDict) - def is_list_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Sequence) except TypeError: return False - def is_dict_generic_type(t: Type[Any]) -> bool: try: return issubclass(get_origin(t), C_Mapping) diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 0214bc037a..94c9144086 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -245,6 +245,7 @@ def update_dict_with_prune(dest: DictStrAny, update: StrAny) -> None: def update_dict_nested(dst: TDict, src: StrAny) -> TDict: + """Merges `src` into `dst` key wise. Does not recur into lists. 
Values in `src` overwrite `dst` if both keys exit.""" # based on https://github.com/clarketm/mergedeep/blob/master/mergedeep/mergedeep.py def _is_recursive_merge(a: StrAny, b: StrAny) -> bool: diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 7a313b1b29..791138054a 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -2,7 +2,7 @@ from typing import Callable, Any, Type, get_type_hints, get_args from dlt.common.exceptions import DictValidationException -from dlt.common.typing import StrAny, extract_optional_type, is_literal_type, is_optional_type, is_typeddict, is_list_generic_type, is_dict_generic_type, _TypedDict +from dlt.common.typing import StrAny, is_literal_type, is_optional_type, extract_union_types, is_union_type, is_typeddict, is_list_generic_type, is_dict_generic_type, _TypedDict, is_union TFilterFunc = Callable[[str], bool] @@ -50,10 +50,27 @@ def validate_dict(spec: Type[_TypedDict], doc: StrAny, path: str, filter_f: TFil raise DictValidationException(f"In {path}: following fields are unexpected {unexpected}", path) def verify_prop(pk: str, pv: Any, t: Any) -> None: - if is_optional_type(t): - t = extract_optional_type(t) - - if is_literal_type(t): + # covers none in optional and union types + if is_optional_type(t) and pv is None: + pass + elif is_union_type(t): + # pass if value actually is none + union_types = extract_union_types(t, no_none=True) + # this is the case for optional fields + if len(union_types) == 1: + verify_prop(pk, pv, union_types[0]) + else: + has_passed = False + for ut in union_types: + try: + verify_prop(pk, pv, ut) + has_passed = True + except DictValidationException: + pass + if not has_passed: + type_names = [str(get_args(ut)) if is_literal_type(ut) else ut.__name__ for ut in union_types] + raise DictValidationException(f"In {path}: field {pk} value {pv} has invalid type {type(pv).__name__}. 
One of these types expected: {', '.join(type_names)}.", path, pk, pv) + elif is_literal_type(t): a_l = get_args(t) if pv not in a_l: raise DictValidationException(f"In {path}: field {pk} value {pv} not in allowed {a_l}", path, pk, pv) diff --git a/dlt/destinations/__init__.py b/dlt/destinations/__init__.py index e69de29bb2..980c4ce7f2 100644 --- a/dlt/destinations/__init__.py +++ b/dlt/destinations/__init__.py @@ -0,0 +1,28 @@ +from dlt.destinations.impl.postgres.factory import postgres +from dlt.destinations.impl.snowflake.factory import snowflake +from dlt.destinations.impl.filesystem.factory import filesystem +from dlt.destinations.impl.duckdb.factory import duckdb +from dlt.destinations.impl.dummy.factory import dummy +from dlt.destinations.impl.mssql.factory import mssql +from dlt.destinations.impl.bigquery.factory import bigquery +from dlt.destinations.impl.athena.factory import athena +from dlt.destinations.impl.redshift.factory import redshift +from dlt.destinations.impl.qdrant.factory import qdrant +from dlt.destinations.impl.motherduck.factory import motherduck +from dlt.destinations.impl.weaviate.factory import weaviate + + +__all__ = [ + "postgres", + "snowflake", + "filesystem", + "duckdb", + "dummy", + "mssql", + "bigquery", + "athena", + "redshift", + "qdrant", + "motherduck", + "weaviate", +] diff --git a/dlt/destinations/filesystem/__init__.py b/dlt/destinations/filesystem/__init__.py deleted file mode 100644 index 3dc6c62480..0000000000 --- a/dlt/destinations/filesystem/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientDwhWithStagingConfiguration - -from dlt.destinations.filesystem.configuration import FilesystemDestinationClientConfiguration - - -@with_config(spec=FilesystemDestinationClientConfiguration, sections=(known_sections.DESTINATION, "filesystem",)) -def _configure(config: FilesystemDestinationClientConfiguration = config.value) -> FilesystemDestinationClientConfiguration: - return config - - -def capabilities() -> DestinationCapabilitiesContext: - return DestinationCapabilitiesContext.generic_capabilities("jsonl") - - -def client(schema: Schema, initial_config: DestinationClientDwhWithStagingConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.filesystem.filesystem import FilesystemClient - - return FilesystemClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[FilesystemDestinationClientConfiguration]: - return FilesystemDestinationClientConfiguration diff --git a/dlt/destinations/impl/__init__.py b/dlt/destinations/impl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/dlt/destinations/athena/__init__.py b/dlt/destinations/impl/athena/__init__.py similarity index 55% rename from dlt/destinations/athena/__init__.py rename to dlt/destinations/impl/athena/__init__.py index 1fd7f14d57..9f0b829819 100644 --- a/dlt/destinations/athena/__init__.py +++ b/dlt/destinations/impl/athena/__init__.py @@ -1,18 +1,7 @@ -from typing import Type - from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.configuration import with_config, 
known_sections -from dlt.common.configuration.accessors import config -from dlt.common.schema.schema import Schema from dlt.common.data_writers.escape import escape_athena_identifier from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.athena.configuration import AthenaClientConfiguration -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration - -@with_config(spec=AthenaClientConfiguration, sections=(known_sections.DESTINATION, "athena",)) -def _configure(config: AthenaClientConfiguration = config.value) -> AthenaClientConfiguration: - return config def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -37,15 +26,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.timestamp_precision = 3 caps.supports_truncate_command = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.athena.athena import AthenaClient - return AthenaClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return AthenaClientConfiguration - - diff --git a/dlt/destinations/athena/athena.py b/dlt/destinations/impl/athena/athena.py similarity index 99% rename from dlt/destinations/athena/athena.py rename to dlt/destinations/impl/athena/athena.py index 44d020c127..f675e7a496 100644 --- a/dlt/destinations/athena/athena.py +++ b/dlt/destinations/impl/athena/athena.py @@ -27,11 +27,11 @@ from dlt.destinations.typing import DBApi, DBTransaction from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation, LoadJobTerminalException -from dlt.destinations.athena import capabilities +from dlt.destinations.impl.athena import capabilities from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error from dlt.destinations.typing import DBApiCursor from dlt.destinations.job_client_impl import SqlJobClientWithStaging -from dlt.destinations.athena.configuration import AthenaClientConfiguration +from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration from dlt.destinations.type_mapping import TypeMapper from dlt.destinations import path_utils diff --git a/dlt/destinations/athena/configuration.py b/dlt/destinations/impl/athena/configuration.py similarity index 100% rename from dlt/destinations/athena/configuration.py rename to dlt/destinations/impl/athena/configuration.py diff --git a/dlt/destinations/impl/athena/factory.py b/dlt/destinations/impl/athena/factory.py new file mode 100644 index 0000000000..cc2b027695 --- /dev/null +++ b/dlt/destinations/impl/athena/factory.py @@ -0,0 +1,53 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.athena.configuration import AthenaClientConfiguration +from dlt.common.configuration.specs import AwsCredentials +from dlt.destinations.impl.athena import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.athena.athena import AthenaClient + + +class athena(Destination[AthenaClientConfiguration, "AthenaClient"]): + + spec = AthenaClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + 
def client_class(self) -> t.Type["AthenaClient"]: + from dlt.destinations.impl.athena.athena import AthenaClient + + return AthenaClient + + def __init__( + self, + query_result_bucket: t.Optional[str] = None, + credentials: t.Union[AwsCredentials, t.Dict[str, t.Any], t.Any] = None, + athena_work_group: t.Optional[str] = None, + aws_data_catalog: t.Optional[str] = "awsdatacatalog", + force_iceberg: bool = False, + **kwargs: t.Any, + ) -> None: + """Configure the Athena destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + query_result_bucket: S3 bucket to store query results in + credentials: AWS credentials to connect to the Athena database. + athena_work_group: Athena work group to use + aws_data_catalog: Athena data catalog to use + force_iceberg: Force iceberg tables + **kwargs: Additional arguments passed to the destination config + """ + super().__init__( + query_result_bucket=query_result_bucket, + credentials=credentials, + athena_work_group=athena_work_group, + aws_data_catalog=aws_data_catalog, + force_iceberg=force_iceberg, + **kwargs, + ) diff --git a/dlt/destinations/bigquery/README.md b/dlt/destinations/impl/bigquery/README.md similarity index 100% rename from dlt/destinations/bigquery/README.md rename to dlt/destinations/impl/bigquery/README.md diff --git a/dlt/destinations/bigquery/__init__.py b/dlt/destinations/impl/bigquery/__init__.py similarity index 50% rename from dlt/destinations/bigquery/__init__.py rename to dlt/destinations/impl/bigquery/__init__.py index 3d97e9a929..1304bd72bb 100644 --- a/dlt/destinations/bigquery/__init__.py +++ b/dlt/destinations/impl/bigquery/__init__.py @@ -1,20 +1,7 @@ -from typing import Type from dlt.common.data_writers.escape import escape_bigquery_identifier - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration - - -@with_config(spec=BigQueryClientConfiguration, sections=(known_sections.DESTINATION, "bigquery",)) -def _configure(config: BigQueryClientConfiguration = config.value) -> BigQueryClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -35,14 +22,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.bigquery.bigquery import BigQueryClient - - return BigQueryClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return BigQueryClientConfiguration \ No newline at end of file diff --git a/dlt/destinations/bigquery/bigquery.py b/dlt/destinations/impl/bigquery/bigquery.py similarity index 98% rename from dlt/destinations/bigquery/bigquery.py rename to dlt/destinations/impl/bigquery/bigquery.py index 9cc7591f57..440123e46d 100644 --- 
a/dlt/destinations/bigquery/bigquery.py +++ b/dlt/destinations/impl/bigquery/bigquery.py @@ -17,9 +17,9 @@ from dlt.destinations.job_client_impl import SqlJobClientWithStaging from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate, DestinationTransientException, LoadJobNotExistsException, LoadJobTerminalException -from dlt.destinations.bigquery import capabilities -from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration -from dlt.destinations.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS +from dlt.destinations.impl.bigquery import capabilities +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration +from dlt.destinations.impl.bigquery.sql_client import BigQuerySqlClient, BQ_TERMINAL_REASONS from dlt.destinations.sql_jobs import SqlMergeJob, SqlStagingCopyJob, SqlJobParams from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase diff --git a/dlt/destinations/bigquery/configuration.py b/dlt/destinations/impl/bigquery/configuration.py similarity index 100% rename from dlt/destinations/bigquery/configuration.py rename to dlt/destinations/impl/bigquery/configuration.py diff --git a/dlt/destinations/impl/bigquery/factory.py b/dlt/destinations/impl/bigquery/factory.py new file mode 100644 index 0000000000..ce6ace3bf7 --- /dev/null +++ b/dlt/destinations/impl/bigquery/factory.py @@ -0,0 +1,35 @@ +import typing as t + +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration +from dlt.common.configuration.specs import GcpServiceAccountCredentials +from dlt.destinations.impl.bigquery import capabilities +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +if t.TYPE_CHECKING: + from dlt.destinations.impl.bigquery.bigquery import BigQueryClient + + +class bigquery(Destination[BigQueryClientConfiguration, "BigQueryClient"]): + + spec = BigQueryClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["BigQueryClient"]: + from dlt.destinations.impl.bigquery.bigquery import BigQueryClient + + return BigQueryClient + + def __init__( + self, + credentials: t.Optional[GcpServiceAccountCredentials] = None, + location: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + super().__init__( + credentials=credentials, + location=location, + **kwargs + ) diff --git a/dlt/destinations/bigquery/sql_client.py b/dlt/destinations/impl/bigquery/sql_client.py similarity index 99% rename from dlt/destinations/bigquery/sql_client.py rename to dlt/destinations/impl/bigquery/sql_client.py index 3d6eb19833..4939add0da 100644 --- a/dlt/destinations/bigquery/sql_client.py +++ b/dlt/destinations/impl/bigquery/sql_client.py @@ -17,7 +17,7 @@ from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error -from dlt.destinations.bigquery import capabilities +from dlt.destinations.impl.bigquery import capabilities # terminal reasons as returned in BQ gRPC error response # https://cloud.google.com/bigquery/docs/error-messages diff --git a/dlt/destinations/duckdb/__init__.py b/dlt/destinations/impl/duckdb/__init__.py similarity index 54% rename from dlt/destinations/duckdb/__init__.py rename to dlt/destinations/impl/duckdb/__init__.py index d9882cc0eb..5cbc8dea53 
100644 --- a/dlt/destinations/duckdb/__init__.py +++ b/dlt/destinations/impl/duckdb/__init__.py @@ -1,20 +1,7 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration - - -@with_config(spec=DuckDbClientConfiguration, sections=(known_sections.DESTINATION, "duckdb",)) -def _configure(config: DuckDbClientConfiguration = config.value) -> DuckDbClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -37,14 +24,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_truncate_command = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.duckdb.duck import DuckDbClient - - return DuckDbClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return DuckDbClientConfiguration diff --git a/dlt/destinations/duckdb/configuration.py b/dlt/destinations/impl/duckdb/configuration.py similarity index 94% rename from dlt/destinations/duckdb/configuration.py rename to dlt/destinations/impl/duckdb/configuration.py index 82ee325ed3..a5f77be8fd 100644 --- a/dlt/destinations/duckdb/configuration.py +++ b/dlt/destinations/impl/duckdb/configuration.py @@ -25,6 +25,7 @@ class DuckDbBaseCredentials(ConnectionStringCredentials): read_only: bool = False # open database read/write def borrow_conn(self, read_only: bool) -> Any: + # TODO: Can this be done in sql client instead? 
import duckdb if not hasattr(self, "_conn_lock"): @@ -95,6 +96,13 @@ class DuckDbCredentials(DuckDbBaseCredentials): __config_gen_annotations__: ClassVar[List[str]] = [] + def is_partial(self) -> bool: + partial = super().is_partial() + if partial: + return True + # Wait until pipeline context is set up before resolving + return self.database == ":pipeline:" + def on_resolved(self) -> None: # do not set any paths for external database if self.database == ":external:": @@ -126,8 +134,7 @@ def _path_in_pipeline(self, rel_path: str) -> str: if context.is_active(): # pipeline is active, get the working directory return os.path.join(context.pipeline().working_dir, rel_path) - return None - + raise RuntimeError("Attempting to use special duckdb database :pipeline: outside of pipeline context.") def _path_to_pipeline(self, abspath: str) -> None: from dlt.common.configuration.container import Container @@ -173,6 +180,9 @@ def _path_from_pipeline(self, default_path: str) -> Tuple[str, bool]: return default_path, True + def _conn_str(self) -> str: + return self.database + @configspec class DuckDbClientConfiguration(DestinationClientDwhWithStagingConfiguration): diff --git a/dlt/destinations/duckdb/duck.py b/dlt/destinations/impl/duckdb/duck.py similarity index 96% rename from dlt/destinations/duckdb/duck.py rename to dlt/destinations/impl/duckdb/duck.py index 4a2e54f2b6..6e6ec359fe 100644 --- a/dlt/destinations/duckdb/duck.py +++ b/dlt/destinations/impl/duckdb/duck.py @@ -12,9 +12,9 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.duckdb import capabilities -from dlt.destinations.duckdb.sql_client import DuckDbSqlClient -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration +from dlt.destinations.impl.duckdb import capabilities +from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient +from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/impl/duckdb/factory.py b/dlt/destinations/impl/duckdb/factory.py new file mode 100644 index 0000000000..1b882c52a1 --- /dev/null +++ b/dlt/destinations/impl/duckdb/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.duckdb.configuration import DuckDbCredentials, DuckDbClientConfiguration +from dlt.destinations.impl.duckdb import capabilities + +if t.TYPE_CHECKING: + from duckdb import DuckDBPyConnection + from dlt.destinations.impl.duckdb.duck import DuckDbClient + + +class duckdb(Destination[DuckDbClientConfiguration, "DuckDbClient"]): + + spec = DuckDbClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["DuckDbClient"]: + from dlt.destinations.impl.duckdb.duck import DuckDbClient + + return DuckDbClient + + def __init__( + self, + credentials: t.Union[DuckDbCredentials, t.Dict[str, t.Any], str, "DuckDBPyConnection"] = None, + create_indexes: bool = False, + **kwargs: t.Any, + ) -> None: + """Configure the DuckDB destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the duckdb database. Can be an instance of `DuckDbCredentials` or + a path to a database file. Use `:memory:` to create an in-memory database. 
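The `is_partial` override and the new `RuntimeError` above defer resolution of the special `:pipeline:` database until an active pipeline context exists. As a quick orientation for the factory pattern this PR introduces, here is a minimal usage sketch (it assumes the `duckdb` class is re-exported from `dlt.destinations`; pipeline, dataset and table names are placeholders):

    import dlt
    from dlt.destinations import duckdb  # assumed re-export of the new factory class

    # the factory instance is passed where a destination name string was used before;
    # ":memory:" creates an in-memory database as described in the docstring above
    pipeline = dlt.pipeline(
        pipeline_name="duckdb_factory_demo",
        destination=duckdb(credentials=":memory:", create_indexes=False),
        dataset_name="main",
    )
    pipeline.run([{"id": 1}, {"id": 2}], table_name="items")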
+ create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/duckdb/sql_client.py b/dlt/destinations/impl/duckdb/sql_client.py similarity index 98% rename from dlt/destinations/duckdb/sql_client.py rename to dlt/destinations/impl/duckdb/sql_client.py index cd2160f676..cb4e1678a2 100644 --- a/dlt/destinations/duckdb/sql_client.py +++ b/dlt/destinations/impl/duckdb/sql_client.py @@ -8,8 +8,8 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.sql_client import SqlClientBase, DBApiCursorImpl, raise_database_error, raise_open_connection_error -from dlt.destinations.duckdb import capabilities -from dlt.destinations.duckdb.configuration import DuckDbBaseCredentials +from dlt.destinations.impl.duckdb import capabilities +from dlt.destinations.impl.duckdb.configuration import DuckDbBaseCredentials class DuckDBDBApiCursorImpl(DBApiCursorImpl): diff --git a/dlt/destinations/dummy/__init__.py b/dlt/destinations/impl/dummy/__init__.py similarity index 60% rename from dlt/destinations/dummy/__init__.py rename to dlt/destinations/impl/dummy/__init__.py index 7131f0109a..476523cb8f 100644 --- a/dlt/destinations/dummy/__init__.py +++ b/dlt/destinations/impl/dummy/__init__.py @@ -1,12 +1,8 @@ -from typing import Type - -from dlt.common.schema.schema import Schema from dlt.common.configuration import with_config, known_sections from dlt.common.configuration.accessors import config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration -from dlt.destinations.dummy.configuration import DummyClientConfiguration +from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration @with_config(spec=DummyClientConfiguration, sections=(known_sections.DESTINATION, "dummy",)) @@ -30,14 +26,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.dummy.dummy import DummyClient - - return DummyClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return DummyClientConfiguration diff --git a/dlt/destinations/dummy/configuration.py b/dlt/destinations/impl/dummy/configuration.py similarity index 90% rename from dlt/destinations/dummy/configuration.py rename to dlt/destinations/impl/dummy/configuration.py index 79cbe3e41e..1a8072300c 100644 --- a/dlt/destinations/dummy/configuration.py +++ b/dlt/destinations/impl/dummy/configuration.py @@ -20,6 +20,8 @@ class DummyClientConfiguration(DestinationClientConfiguration): fail_prob: float = 0.0 retry_prob: float = 0.0 completed_prob: float = 0.0 + exception_prob: float = 0.0 + """probability of exception when checking job status""" timeout: float = 10.0 fail_in_init: bool = True @@ -35,6 +37,7 @@ def __init__( fail_prob: float = None, retry_prob: float = None, completed_prob: float = None, + exception_prob: float = None, timeout: float = None, fail_in_init: bool = None, ) -> None: diff --git a/dlt/destinations/dummy/dummy.py b/dlt/destinations/impl/dummy/dummy.py 
similarity index 89% rename from dlt/destinations/dummy/dummy.py rename to dlt/destinations/impl/dummy/dummy.py index c8cac05d3a..0bc061a7dd 100644 --- a/dlt/destinations/dummy/dummy.py +++ b/dlt/destinations/impl/dummy/dummy.py @@ -13,8 +13,8 @@ from dlt.destinations.exceptions import (LoadJobNotExistsException, LoadJobInvalidStateTransitionException, DestinationTerminalException, DestinationTransientException) -from dlt.destinations.dummy import capabilities -from dlt.destinations.dummy.configuration import DummyClientConfiguration +from dlt.destinations.impl.dummy import capabilities +from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration class LoadDummyJob(LoadJob, FollowupJob): @@ -24,17 +24,20 @@ def __init__(self, file_name: str, config: DummyClientConfiguration) -> None: self._exception: str = None self.start_time: float = pendulum.now().timestamp() super().__init__(file_name) - # if config.fail_in_init: - s = self.state() - if s == "failed": - raise DestinationTerminalException(self._exception) - if s == "retry": - raise DestinationTransientException(self._exception) + if config.fail_in_init: + s = self.state() + if s == "failed": + raise DestinationTerminalException(self._exception) + if s == "retry": + raise DestinationTransientException(self._exception) def state(self) -> TLoadJobState: # this should poll the server for a job status, here we simulate various outcomes if self._status == "running": + c_r = random.random() + if self.config.exception_prob >= c_r: + raise DestinationTransientException("Dummy job status raised exception") n = pendulum.now().timestamp() if n - self.start_time > self.config.timeout: self._status = "failed" diff --git a/dlt/destinations/impl/dummy/factory.py b/dlt/destinations/impl/dummy/factory.py new file mode 100644 index 0000000000..265c77b0f4 --- /dev/null +++ b/dlt/destinations/impl/dummy/factory.py @@ -0,0 +1,30 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration, DummyClientCredentials +from dlt.destinations.impl.dummy import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.dummy.dummy import DummyClient + + +class dummy(Destination[DummyClientConfiguration, "DummyClient"]): + + spec = DummyClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["DummyClient"]: + from dlt.destinations.impl.dummy.dummy import DummyClient + + return DummyClient + + def __init__( + self, + credentials: DummyClientCredentials = None, + **kwargs: t.Any, + ) -> None: + super().__init__(credentials=credentials, **kwargs) diff --git a/dlt/destinations/impl/filesystem/__init__.py b/dlt/destinations/impl/filesystem/__init__.py new file mode 100644 index 0000000000..12e83216cf --- /dev/null +++ b/dlt/destinations/impl/filesystem/__init__.py @@ -0,0 +1,5 @@ +from dlt.common.destination import DestinationCapabilitiesContext + + +def capabilities() -> DestinationCapabilitiesContext: + return DestinationCapabilitiesContext.generic_capabilities("jsonl") diff --git a/dlt/destinations/filesystem/configuration.py b/dlt/destinations/impl/filesystem/configuration.py similarity index 100% rename from dlt/destinations/filesystem/configuration.py rename to dlt/destinations/impl/filesystem/configuration.py diff --git a/dlt/destinations/impl/filesystem/factory.py b/dlt/destinations/impl/filesystem/factory.py new file 
mode 100644 index 0000000000..4e2a716d79 --- /dev/null +++ b/dlt/destinations/impl/filesystem/factory.py @@ -0,0 +1,50 @@ +import typing as t + +from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem import capabilities +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.common.storages.configuration import FileSystemCredentials + +if t.TYPE_CHECKING: + from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + +class filesystem(Destination[FilesystemDestinationClientConfiguration, "FilesystemClient"]): + + spec = FilesystemDestinationClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["FilesystemClient"]: + from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + + return FilesystemClient + + def __init__( + self, + bucket_url: str = None, + credentials: t.Union[FileSystemCredentials, t.Dict[str, t.Any], t.Any] = None, + **kwargs: t.Any, + ) -> None: + """Configure the filesystem destination to use in a pipeline and load data to local or remote filesystem. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + The `bucket_url` determines the protocol to be used: + + - Local folder: `file:///path/to/directory` + - AWS S3 (and S3 compatible storages): `s3://bucket-name` + - Azure Blob Storage: `az://container-name` + - Google Cloud Storage: `gs://bucket-name` + - Memory fs: `memory://m` + + Args: + bucket_url: The fsspec compatible bucket url to use for the destination. + credentials: Credentials to connect to the filesystem. The type of credentials should correspond to + the bucket protocol. For example, for AWS S3, the credentials should be an instance of `AwsCredentials`. + A dictionary with the credentials parameters can also be provided.
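A comparable sketch for the new filesystem factory, under the same assumptions (re-export from `dlt.destinations`; the local path and names are placeholders):

    import dlt
    from dlt.destinations import filesystem  # assumed re-export of the factory class

    # a local folder keeps the sketch self-contained; an s3://, az:// or gs:// bucket_url
    # with matching credentials would select the corresponding remote filesystem instead
    pipeline = dlt.pipeline(
        pipeline_name="filesystem_factory_demo",
        destination=filesystem(bucket_url="file:///tmp/dlt_files"),
        dataset_name="raw",
    )
    pipeline.run([{"id": 1}], table_name="events")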
+ **kwargs: Additional arguments passed to the destination config + """ + super().__init__(bucket_url=bucket_url, credentials=credentials, **kwargs) diff --git a/dlt/destinations/filesystem/filesystem.py b/dlt/destinations/impl/filesystem/filesystem.py similarity index 98% rename from dlt/destinations/filesystem/filesystem.py rename to dlt/destinations/impl/filesystem/filesystem.py index 766f384024..fe349aac6b 100644 --- a/dlt/destinations/filesystem/filesystem.py +++ b/dlt/destinations/impl/filesystem/filesystem.py @@ -12,8 +12,8 @@ from dlt.common.destination.reference import NewLoadJob, TLoadJobState, LoadJob, JobClientBase, FollowupJob, WithStagingDataset from dlt.destinations.job_impl import EmptyLoadJob -from dlt.destinations.filesystem import capabilities -from dlt.destinations.filesystem.configuration import FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem import capabilities +from dlt.destinations.impl.filesystem.configuration import FilesystemDestinationClientConfiguration from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations import path_utils diff --git a/dlt/destinations/motherduck/__init__.py b/dlt/destinations/impl/motherduck/__init__.py similarity index 51% rename from dlt/destinations/motherduck/__init__.py rename to dlt/destinations/impl/motherduck/__init__.py index eae67eaa74..74c0e36ef3 100644 --- a/dlt/destinations/motherduck/__init__.py +++ b/dlt/destinations/impl/motherduck/__init__.py @@ -1,20 +1,7 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_postgres_identifier, escape_duckdb_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.motherduck.configuration import MotherDuckClientConfiguration - - -@with_config(spec=MotherDuckClientConfiguration, sections=(known_sections.DESTINATION, "motherduck",)) -def _configure(config: MotherDuckClientConfiguration = config.value) -> MotherDuckClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -35,14 +22,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_truncate_command = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.motherduck.motherduck import MotherDuckClient - - return MotherDuckClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return MotherDuckClientConfiguration diff --git a/dlt/destinations/motherduck/configuration.py b/dlt/destinations/impl/motherduck/configuration.py similarity index 97% rename from dlt/destinations/motherduck/configuration.py rename to dlt/destinations/impl/motherduck/configuration.py index 18d480c945..a376f1a5aa 100644 --- a/dlt/destinations/motherduck/configuration.py +++ b/dlt/destinations/impl/motherduck/configuration.py @@ -7,7 +7,7 @@ from dlt.common.utils import digest128 from dlt.common.configuration.exceptions import 
ConfigurationValueError -from dlt.destinations.duckdb.configuration import DuckDbBaseCredentials +from dlt.destinations.impl.duckdb.configuration import DuckDbBaseCredentials MOTHERDUCK_DRIVERNAME = "md" diff --git a/dlt/destinations/impl/motherduck/factory.py b/dlt/destinations/impl/motherduck/factory.py new file mode 100644 index 0000000000..17cf4a76b4 --- /dev/null +++ b/dlt/destinations/impl/motherduck/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext +from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration +from dlt.destinations.impl.motherduck import capabilities + +if t.TYPE_CHECKING: + from duckdb import DuckDBPyConnection + from dlt.destinations.impl.motherduck.motherduck import MotherDuckClient + + +class motherduck(Destination[MotherDuckClientConfiguration, "MotherDuckClient"]): + + spec = MotherDuckClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["MotherDuckClient"]: + from dlt.destinations.impl.motherduck.motherduck import MotherDuckClient + + return MotherDuckClient + + def __init__( + self, + credentials: t.Union[MotherDuckCredentials, str, t.Dict[str, t.Any], "DuckDBPyConnection"] = None, + create_indexes: bool = False, + **kwargs: t.Any, + ) -> None: + """Configure the MotherDuck destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the MotherDuck database. Can be an instance of `MotherDuckCredentials` or + a connection string in the format `md:///?token=` + create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/motherduck/motherduck.py b/dlt/destinations/impl/motherduck/motherduck.py similarity index 70% rename from dlt/destinations/motherduck/motherduck.py rename to dlt/destinations/impl/motherduck/motherduck.py index 93c0ed163b..9822f2b7b6 100644 --- a/dlt/destinations/motherduck/motherduck.py +++ b/dlt/destinations/impl/motherduck/motherduck.py @@ -4,10 +4,10 @@ from dlt.common.schema import Schema -from dlt.destinations.duckdb.duck import DuckDbClient -from dlt.destinations.motherduck import capabilities -from dlt.destinations.motherduck.sql_client import MotherDuckSqlClient -from dlt.destinations.motherduck.configuration import MotherDuckClientConfiguration +from dlt.destinations.impl.duckdb.duck import DuckDbClient +from dlt.destinations.impl.motherduck import capabilities +from dlt.destinations.impl.motherduck.sql_client import MotherDuckSqlClient +from dlt.destinations.impl.motherduck.configuration import MotherDuckClientConfiguration class MotherDuckClient(DuckDbClient): diff --git a/dlt/destinations/motherduck/sql_client.py b/dlt/destinations/impl/motherduck/sql_client.py similarity index 83% rename from dlt/destinations/motherduck/sql_client.py rename to dlt/destinations/impl/motherduck/sql_client.py index 2fc664a2e8..672c377fd9 100644 --- a/dlt/destinations/motherduck/sql_client.py +++ b/dlt/destinations/impl/motherduck/sql_client.py @@ -8,9 +8,9 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame from dlt.destinations.sql_client import SqlClientBase, 
DBApiCursorImpl, raise_database_error, raise_open_connection_error -from dlt.destinations.duckdb.sql_client import DuckDbSqlClient, DuckDBDBApiCursorImpl -from dlt.destinations.motherduck import capabilities -from dlt.destinations.motherduck.configuration import MotherDuckCredentials +from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient, DuckDBDBApiCursorImpl +from dlt.destinations.impl.motherduck import capabilities +from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials class MotherDuckSqlClient(DuckDbSqlClient): diff --git a/dlt/destinations/mssql/README.md b/dlt/destinations/impl/mssql/README.md similarity index 100% rename from dlt/destinations/mssql/README.md rename to dlt/destinations/impl/mssql/README.md diff --git a/dlt/destinations/mssql/__init__.py b/dlt/destinations/impl/mssql/__init__.py similarity index 57% rename from dlt/destinations/mssql/__init__.py rename to dlt/destinations/impl/mssql/__init__.py index 56051a324e..40e971cacf 100644 --- a/dlt/destinations/mssql/__init__.py +++ b/dlt/destinations/impl/mssql/__init__.py @@ -1,21 +1,8 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_postgres_identifier, escape_mssql_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.wei import EVM_DECIMAL_PRECISION -from dlt.destinations.mssql.configuration import MsSqlClientConfiguration - - -@with_config(spec=MsSqlClientConfiguration, sections=(known_sections.DESTINATION, "mssql",)) -def _configure(config: MsSqlClientConfiguration = config.value) -> MsSqlClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -39,14 +26,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.timestamp_precision = 7 return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.mssql.mssql import MsSqlClient - - return MsSqlClient(schema, _configure(initial_config)) # type: ignore[arg-type] - - -def spec() -> Type[DestinationClientConfiguration]: - return MsSqlClientConfiguration diff --git a/dlt/destinations/mssql/configuration.py b/dlt/destinations/impl/mssql/configuration.py similarity index 100% rename from dlt/destinations/mssql/configuration.py rename to dlt/destinations/impl/mssql/configuration.py diff --git a/dlt/destinations/impl/mssql/factory.py b/dlt/destinations/impl/mssql/factory.py new file mode 100644 index 0000000000..c98531ca79 --- /dev/null +++ b/dlt/destinations/impl/mssql/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials, MsSqlClientConfiguration +from dlt.destinations.impl.mssql import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.mssql.mssql import MsSqlClient + + +class mssql(Destination[MsSqlClientConfiguration, "MsSqlClient"]): + + spec = MsSqlClientConfiguration + + def 
capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["MsSqlClient"]: + from dlt.destinations.impl.mssql.mssql import MsSqlClient + + return MsSqlClient + + def __init__( + self, + credentials: t.Union[MsSqlCredentials, t.Dict[str, t.Any], str] = None, + create_indexes: bool = True, + **kwargs: t.Any, + ) -> None: + """Configure the MsSql destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the mssql database. Can be an instance of `MsSqlCredentials` or + a connection string in the format `mssql://user:password@host:port/database` + create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/mssql/mssql.py b/dlt/destinations/impl/mssql/mssql.py similarity index 97% rename from dlt/destinations/mssql/mssql.py rename to dlt/destinations/impl/mssql/mssql.py index cd999441ff..851122f20c 100644 --- a/dlt/destinations/mssql/mssql.py +++ b/dlt/destinations/impl/mssql/mssql.py @@ -12,9 +12,9 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.mssql import capabilities -from dlt.destinations.mssql.sql_client import PyOdbcMsSqlClient -from dlt.destinations.mssql.configuration import MsSqlClientConfiguration +from dlt.destinations.impl.mssql import capabilities +from dlt.destinations.impl.mssql.sql_client import PyOdbcMsSqlClient +from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/mssql/sql_client.py b/dlt/destinations/impl/mssql/sql_client.py similarity index 97% rename from dlt/destinations/mssql/sql_client.py rename to dlt/destinations/impl/mssql/sql_client.py index 4dd983a334..5372fa3626 100644 --- a/dlt/destinations/mssql/sql_client.py +++ b/dlt/destinations/impl/mssql/sql_client.py @@ -13,8 +13,8 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error -from dlt.destinations.mssql.configuration import MsSqlCredentials -from dlt.destinations.mssql import capabilities +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials +from dlt.destinations.impl.mssql import capabilities def handle_datetimeoffset(dto_value: bytes) -> datetime: diff --git a/dlt/destinations/postgres/README.md b/dlt/destinations/impl/postgres/README.md similarity index 100% rename from dlt/destinations/postgres/README.md rename to dlt/destinations/impl/postgres/README.md diff --git a/dlt/destinations/postgres/__init__.py b/dlt/destinations/impl/postgres/__init__.py similarity index 58% rename from dlt/destinations/postgres/__init__.py rename to dlt/destinations/impl/postgres/__init__.py index e8904c075f..009174ecc9 100644 --- a/dlt/destinations/postgres/__init__.py +++ b/dlt/destinations/impl/postgres/__init__.py @@ -1,20 +1,9 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import 
escape_postgres_identifier, escape_postgres_literal from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE from dlt.common.wei import EVM_DECIMAL_PRECISION -from dlt.destinations.postgres.configuration import PostgresClientConfiguration - - -@with_config(spec=PostgresClientConfiguration, sections=(known_sections.DESTINATION, "postgres",)) -def _configure(config: PostgresClientConfiguration = config.value) -> PostgresClientConfiguration: - return config def capabilities() -> DestinationCapabilitiesContext: @@ -39,12 +28,3 @@ def capabilities() -> DestinationCapabilitiesContext: return caps -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.postgres.postgres import PostgresClient - - return PostgresClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return PostgresClientConfiguration diff --git a/dlt/destinations/postgres/configuration.py b/dlt/destinations/impl/postgres/configuration.py similarity index 100% rename from dlt/destinations/postgres/configuration.py rename to dlt/destinations/impl/postgres/configuration.py diff --git a/dlt/destinations/impl/postgres/factory.py b/dlt/destinations/impl/postgres/factory.py new file mode 100644 index 0000000000..33971eb642 --- /dev/null +++ b/dlt/destinations/impl/postgres/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.postgres.configuration import PostgresCredentials, PostgresClientConfiguration +from dlt.destinations.impl.postgres import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.postgres.postgres import PostgresClient + + +class postgres(Destination[PostgresClientConfiguration, "PostgresClient"]): + + spec = PostgresClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["PostgresClient"]: + from dlt.destinations.impl.postgres.postgres import PostgresClient + + return PostgresClient + + def __init__( + self, + credentials: t.Union[PostgresCredentials, t.Dict[str, t.Any], str] = None, + create_indexes: bool = True, + **kwargs: t.Any, + ) -> None: + """Configure the Postgres destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the postgres database. 
Can be an instance of `PostgresCredentials` or + a connection string in the format `postgres://user:password@host:port/database` + create_indexes: Should unique indexes be created + **kwargs: Additional arguments passed to the destination config + """ + super().__init__(credentials=credentials, create_indexes=create_indexes, **kwargs) diff --git a/dlt/destinations/postgres/postgres.py b/dlt/destinations/impl/postgres/postgres.py similarity index 95% rename from dlt/destinations/postgres/postgres.py rename to dlt/destinations/impl/postgres/postgres.py index 2812d1d4c4..03c42f4d75 100644 --- a/dlt/destinations/postgres/postgres.py +++ b/dlt/destinations/impl/postgres/postgres.py @@ -11,9 +11,9 @@ from dlt.destinations.insert_job_client import InsertValuesJobClient -from dlt.destinations.postgres import capabilities -from dlt.destinations.postgres.sql_client import Psycopg2SqlClient -from dlt.destinations.postgres.configuration import PostgresClientConfiguration +from dlt.destinations.impl.postgres import capabilities +from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient +from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/postgres/sql_client.py b/dlt/destinations/impl/postgres/sql_client.py similarity index 97% rename from dlt/destinations/postgres/sql_client.py rename to dlt/destinations/impl/postgres/sql_client.py index 079a0ae477..b6c4c1a1be 100644 --- a/dlt/destinations/postgres/sql_client.py +++ b/dlt/destinations/impl/postgres/sql_client.py @@ -16,8 +16,8 @@ from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error -from dlt.destinations.postgres.configuration import PostgresCredentials -from dlt.destinations.postgres import capabilities +from dlt.destinations.impl.postgres.configuration import PostgresCredentials +from dlt.destinations.impl.postgres import capabilities class Psycopg2SqlClient(SqlClientBase["psycopg2.connection"], DBTransaction): diff --git a/dlt/destinations/impl/qdrant/__init__.py b/dlt/destinations/impl/qdrant/__init__.py new file mode 100644 index 0000000000..1a2c466b14 --- /dev/null +++ b/dlt/destinations/impl/qdrant/__init__.py @@ -0,0 +1,18 @@ +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter + + +def capabilities() -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + + return caps diff --git a/dlt/destinations/qdrant/configuration.py b/dlt/destinations/impl/qdrant/configuration.py similarity index 100% rename from dlt/destinations/qdrant/configuration.py rename to dlt/destinations/impl/qdrant/configuration.py diff --git a/dlt/destinations/impl/qdrant/factory.py b/dlt/destinations/impl/qdrant/factory.py new file mode 100644 index 0000000000..316b5ae434 --- /dev/null +++ b/dlt/destinations/impl/qdrant/factory.py @@ -0,0 +1,30 @@ 
+import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.qdrant.configuration import QdrantCredentials, QdrantClientConfiguration +from dlt.destinations.impl.qdrant import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient + + +class qdrant(Destination[QdrantClientConfiguration, "QdrantClient"]): + + spec = QdrantClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["QdrantClient"]: + from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient + + return QdrantClient + + def __init__( + self, + credentials: t.Union[QdrantCredentials, t.Dict[str, t.Any]] = None, + **kwargs: t.Any, + ) -> None: + super().__init__(credentials=credentials, **kwargs) diff --git a/dlt/destinations/qdrant/qdrant_adapter.py b/dlt/destinations/impl/qdrant/qdrant_adapter.py similarity index 95% rename from dlt/destinations/qdrant/qdrant_adapter.py rename to dlt/destinations/impl/qdrant/qdrant_adapter.py index ac51bd5f42..f37a1f6cd8 100644 --- a/dlt/destinations/qdrant/qdrant_adapter.py +++ b/dlt/destinations/impl/qdrant/qdrant_adapter.py @@ -1,8 +1,7 @@ from typing import Any from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns -from dlt.extract.decorators import resource as make_resource -from dlt.extract.source import DltResource +from dlt.extract import DltResource, resource as make_resource VECTORIZE_HINT = "x-qdrant-embed" diff --git a/dlt/destinations/qdrant/qdrant_client.py b/dlt/destinations/impl/qdrant/qdrant_client.py similarity index 98% rename from dlt/destinations/qdrant/qdrant_client.py rename to dlt/destinations/impl/qdrant/qdrant_client.py index cba87e9528..029530d624 100644 --- a/dlt/destinations/qdrant/qdrant_client.py +++ b/dlt/destinations/impl/qdrant/qdrant_client.py @@ -11,9 +11,9 @@ from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.job_client_impl import StorageSchemaInfo, StateInfo -from dlt.destinations.qdrant import capabilities -from dlt.destinations.qdrant.configuration import QdrantClientConfiguration -from dlt.destinations.qdrant.qdrant_adapter import VECTORIZE_HINT +from dlt.destinations.impl.qdrant import capabilities +from dlt.destinations.impl.qdrant.configuration import QdrantClientConfiguration +from dlt.destinations.impl.qdrant.qdrant_adapter import VECTORIZE_HINT from qdrant_client import QdrantClient as QC, models from qdrant_client.qdrant_fastembed import uuid @@ -406,4 +406,4 @@ def _collection_exists(self, table_name: str, qualify_table_name: bool = True) - except UnexpectedResponse as e: if e.status_code == 404: return False - raise e \ No newline at end of file + raise e diff --git a/dlt/destinations/redshift/README.md b/dlt/destinations/impl/redshift/README.md similarity index 100% rename from dlt/destinations/redshift/README.md rename to dlt/destinations/impl/redshift/README.md diff --git a/dlt/destinations/redshift/__init__.py b/dlt/destinations/impl/redshift/__init__.py similarity index 52% rename from dlt/destinations/redshift/__init__.py rename to dlt/destinations/impl/redshift/__init__.py index 96741e86cd..8a8cae84b4 100644 --- a/dlt/destinations/redshift/__init__.py +++ b/dlt/destinations/impl/redshift/__init__.py @@ -1,20 +1,7 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from 
dlt.common.configuration.accessors import config from dlt.common.data_writers.escape import escape_redshift_identifier, escape_redshift_literal from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.redshift.configuration import RedshiftClientConfiguration - - -@with_config(spec=RedshiftClientConfiguration, sections=(known_sections.DESTINATION, "redshift",)) -def _configure(config: RedshiftClientConfiguration = config.value) -> RedshiftClientConfiguration: - return config - def capabilities() -> DestinationCapabilitiesContext: caps = DestinationCapabilitiesContext() @@ -36,14 +23,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.alter_add_multi_column = False return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.redshift.redshift import RedshiftClient - - return RedshiftClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return RedshiftClientConfiguration diff --git a/dlt/destinations/redshift/configuration.py b/dlt/destinations/impl/redshift/configuration.py similarity index 88% rename from dlt/destinations/redshift/configuration.py rename to dlt/destinations/impl/redshift/configuration.py index 7cb13b996f..7018445773 100644 --- a/dlt/destinations/redshift/configuration.py +++ b/dlt/destinations/impl/redshift/configuration.py @@ -4,7 +4,7 @@ from dlt.common.configuration import configspec from dlt.common.utils import digest128 -from dlt.destinations.postgres.configuration import PostgresCredentials, PostgresClientConfiguration +from dlt.destinations.impl.postgres.configuration import PostgresCredentials, PostgresClientConfiguration @configspec diff --git a/dlt/destinations/impl/redshift/factory.py b/dlt/destinations/impl/redshift/factory.py new file mode 100644 index 0000000000..7648b35851 --- /dev/null +++ b/dlt/destinations/impl/redshift/factory.py @@ -0,0 +1,45 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.redshift.configuration import RedshiftCredentials, RedshiftClientConfiguration +from dlt.destinations.impl.redshift import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.redshift.redshift import RedshiftClient + + +class redshift(Destination[RedshiftClientConfiguration, "RedshiftClient"]): + + spec = RedshiftClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["RedshiftClient"]: + from dlt.destinations.impl.redshift.redshift import RedshiftClient + + return RedshiftClient + + def __init__( + self, + credentials: t.Union[RedshiftCredentials, t.Dict[str, t.Any], str] = None, + create_indexes: bool = True, + staging_iam_role: t.Optional[str] = None, + **kwargs: t.Any, + ) -> None: + """Configure the Redshift destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the redshift database. 
Can be an instance of `RedshiftCredentials` or + a connection string in the format `redshift://user:password@host:port/database` + create_indexes: Should unique indexes be created + staging_iam_role: IAM role to use for staging data in S3 + **kwargs: Additional arguments passed to the destination config + """ + super().__init__( + credentials=credentials, create_indexes=create_indexes, staging_iam_role=staging_iam_role, **kwargs + ) diff --git a/dlt/destinations/redshift/redshift.py b/dlt/destinations/impl/redshift/redshift.py similarity index 97% rename from dlt/destinations/redshift/redshift.py rename to dlt/destinations/impl/redshift/redshift.py index 888f27ae7c..2124807bc1 100644 --- a/dlt/destinations/redshift/redshift.py +++ b/dlt/destinations/impl/redshift/redshift.py @@ -1,7 +1,7 @@ import platform import os -from dlt.destinations.postgres.sql_client import Psycopg2SqlClient +from dlt.destinations.impl.postgres.sql_client import Psycopg2SqlClient from dlt.common.schema.utils import table_schema_has_type, table_schema_has_type_with_precision if platform.python_implementation() == "PyPy": @@ -25,8 +25,8 @@ from dlt.destinations.exceptions import DatabaseTerminalException, LoadJobTerminalException from dlt.destinations.job_client_impl import CopyRemoteFileLoadJob, LoadJob -from dlt.destinations.redshift import capabilities -from dlt.destinations.redshift.configuration import RedshiftClientConfiguration +from dlt.destinations.impl.redshift import capabilities +from dlt.destinations.impl.redshift.configuration import RedshiftClientConfiguration from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/snowflake/__init__.py b/dlt/destinations/impl/snowflake/__init__.py similarity index 52% rename from dlt/destinations/snowflake/__init__.py rename to dlt/destinations/impl/snowflake/__init__.py index 5d32bc41fd..12e118eeab 100644 --- a/dlt/destinations/snowflake/__init__.py +++ b/dlt/destinations/impl/snowflake/__init__.py @@ -1,20 +1,8 @@ -from typing import Type from dlt.common.data_writers.escape import escape_bigquery_identifier - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import JobClientBase, DestinationClientConfiguration from dlt.common.data_writers.escape import escape_snowflake_identifier from dlt.common.arithmetics import DEFAULT_NUMERIC_PRECISION, DEFAULT_NUMERIC_SCALE -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration - - -@with_config(spec=SnowflakeClientConfiguration, sections=(known_sections.DESTINATION, "snowflake",)) -def _configure(config: SnowflakeClientConfiguration = config.value) -> SnowflakeClientConfiguration: - return config def capabilities() -> DestinationCapabilitiesContext: @@ -35,14 +23,3 @@ def capabilities() -> DestinationCapabilitiesContext: caps.supports_ddl_transactions = True caps.alter_add_multi_column = True return caps - - -def client(schema: Schema, initial_config: DestinationClientConfiguration = config.value) -> JobClientBase: - # import client when creating instance so capabilities and config specs can be accessed without dependencies installed - from dlt.destinations.snowflake.snowflake import SnowflakeClient - - return SnowflakeClient(schema, 
_configure(initial_config)) # type: ignore - - -def spec() -> Type[DestinationClientConfiguration]: - return SnowflakeClientConfiguration diff --git a/dlt/destinations/snowflake/configuration.py b/dlt/destinations/impl/snowflake/configuration.py similarity index 100% rename from dlt/destinations/snowflake/configuration.py rename to dlt/destinations/impl/snowflake/configuration.py diff --git a/dlt/destinations/impl/snowflake/factory.py b/dlt/destinations/impl/snowflake/factory.py new file mode 100644 index 0000000000..1201f406b0 --- /dev/null +++ b/dlt/destinations/impl/snowflake/factory.py @@ -0,0 +1,41 @@ +import typing as t + +from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials, SnowflakeClientConfiguration +from dlt.destinations.impl.snowflake import capabilities +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +if t.TYPE_CHECKING: + from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient + + +class snowflake(Destination[SnowflakeClientConfiguration, "SnowflakeClient"]): + + spec = SnowflakeClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["SnowflakeClient"]: + from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient + + return SnowflakeClient + + def __init__( + self, + credentials: t.Union[SnowflakeCredentials, t.Dict[str, t.Any], str] = None, + stage_name: t.Optional[str] = None, + keep_staged_files: bool = True, + **kwargs: t.Any, + ) -> None: + """Configure the Snowflake destination to use in a pipeline. + + All arguments provided here supersede other configuration sources such as environment variables and dlt config files. + + Args: + credentials: Credentials to connect to the snowflake database. Can be an instance of `SnowflakeCredentials` or + a connection string in the format `snowflake://user:password@host:port/database` + stage_name: Name of an existing stage to use for loading data. 
Default uses implicit stage per table + keep_staged_files: Whether to delete or keep staged files after loading + """ + super().__init__(credentials=credentials, stage_name=stage_name, keep_staged_files=keep_staged_files, **kwargs) diff --git a/dlt/destinations/snowflake/snowflake.py b/dlt/destinations/impl/snowflake/snowflake.py similarity index 97% rename from dlt/destinations/snowflake/snowflake.py rename to dlt/destinations/impl/snowflake/snowflake.py index f433ec7e7d..ead3e810d2 100644 --- a/dlt/destinations/snowflake/snowflake.py +++ b/dlt/destinations/impl/snowflake/snowflake.py @@ -14,11 +14,11 @@ from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.exceptions import LoadJobTerminalException -from dlt.destinations.snowflake import capabilities -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration -from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient +from dlt.destinations.impl.snowflake import capabilities +from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration +from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.sql_jobs import SqlStagingCopyJob, SqlJobParams -from dlt.destinations.snowflake.sql_client import SnowflakeSqlClient +from dlt.destinations.impl.snowflake.sql_client import SnowflakeSqlClient from dlt.destinations.job_impl import NewReferenceJob from dlt.destinations.sql_client import SqlClientBase from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/snowflake/sql_client.py b/dlt/destinations/impl/snowflake/sql_client.py similarity index 98% rename from dlt/destinations/snowflake/sql_client.py rename to dlt/destinations/impl/snowflake/sql_client.py index 40cdc990a0..139a5ebb7a 100644 --- a/dlt/destinations/snowflake/sql_client.py +++ b/dlt/destinations/impl/snowflake/sql_client.py @@ -7,8 +7,8 @@ from dlt.destinations.exceptions import DatabaseTerminalException, DatabaseTransientException, DatabaseUndefinedRelation from dlt.destinations.sql_client import DBApiCursorImpl, SqlClientBase, raise_database_error, raise_open_connection_error from dlt.destinations.typing import DBApi, DBApiCursor, DBTransaction, DataFrame -from dlt.destinations.snowflake.configuration import SnowflakeCredentials -from dlt.destinations.snowflake import capabilities +from dlt.destinations.impl.snowflake.configuration import SnowflakeCredentials +from dlt.destinations.impl.snowflake import capabilities class SnowflakeCursorImpl(DBApiCursorImpl): native_cursor: snowflake_lib.cursor.SnowflakeCursor # type: ignore[assignment] diff --git a/dlt/destinations/weaviate/README.md b/dlt/destinations/impl/weaviate/README.md similarity index 100% rename from dlt/destinations/weaviate/README.md rename to dlt/destinations/impl/weaviate/README.md diff --git a/dlt/destinations/impl/weaviate/__init__.py b/dlt/destinations/impl/weaviate/__init__.py new file mode 100644 index 0000000000..143e0260d2 --- /dev/null +++ b/dlt/destinations/impl/weaviate/__init__.py @@ -0,0 +1,19 @@ +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.destinations.impl.weaviate.weaviate_adapter import weaviate_adapter + + +def capabilities() -> DestinationCapabilitiesContext: + caps = DestinationCapabilitiesContext() + caps.preferred_loader_file_format = "jsonl" + caps.supported_loader_file_formats = ["jsonl"] + + caps.max_identifier_length = 200 + caps.max_column_identifier_length = 1024 + caps.max_query_length = 8 * 1024 * 1024 + 
caps.is_max_query_length_in_bytes = False + caps.max_text_data_type_length = 8 * 1024 * 1024 + caps.is_max_text_data_type_length_in_bytes = False + caps.supports_ddl_transactions = False + caps.naming_convention = "dlt.destinations.impl.weaviate.naming" + + return caps diff --git a/dlt/destinations/weaviate/ci_naming.py b/dlt/destinations/impl/weaviate/ci_naming.py similarity index 100% rename from dlt/destinations/weaviate/ci_naming.py rename to dlt/destinations/impl/weaviate/ci_naming.py diff --git a/dlt/destinations/weaviate/configuration.py b/dlt/destinations/impl/weaviate/configuration.py similarity index 100% rename from dlt/destinations/weaviate/configuration.py rename to dlt/destinations/impl/weaviate/configuration.py diff --git a/dlt/destinations/weaviate/exceptions.py b/dlt/destinations/impl/weaviate/exceptions.py similarity index 100% rename from dlt/destinations/weaviate/exceptions.py rename to dlt/destinations/impl/weaviate/exceptions.py diff --git a/dlt/destinations/impl/weaviate/factory.py b/dlt/destinations/impl/weaviate/factory.py new file mode 100644 index 0000000000..b29d02b1a7 --- /dev/null +++ b/dlt/destinations/impl/weaviate/factory.py @@ -0,0 +1,47 @@ +import typing as t + +from dlt.common.destination import Destination, DestinationCapabilitiesContext + +from dlt.destinations.impl.weaviate.configuration import WeaviateCredentials, WeaviateClientConfiguration +from dlt.destinations.impl.weaviate import capabilities + +if t.TYPE_CHECKING: + from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient + + +class weaviate(Destination[WeaviateClientConfiguration, "WeaviateClient"]): + + spec = WeaviateClientConfiguration + + def capabilities(self) -> DestinationCapabilitiesContext: + return capabilities() + + @property + def client_class(self) -> t.Type["WeaviateClient"]: + from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient + + return WeaviateClient + + def __init__( + self, + credentials: t.Union[WeaviateCredentials, t.Dict[str, t.Any]] = None, + vectorizer: str = None, + module_config: t.Dict[str, t.Dict[str, str]] = None, + **kwargs: t.Any, + ) -> None: + """Configure the Weaviate destination to use in a pipeline. + + All destination config parameters can be provided as arguments here and will supersede other config sources (such as dlt config files and environment variables). 
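A hedged sketch of constructing the weaviate factory programmatically (import path assumed as above; the endpoint, vectorizer name and module configuration are illustrative placeholders, not values prescribed by this diff):

    from dlt.destinations import weaviate  # assumed re-export of the factory class

    dest = weaviate(
        credentials={"url": "http://localhost:8080", "api_key": None},  # placeholder local instance
        vectorizer="text2vec-openai",                                   # placeholder module name
        module_config={"text2vec-openai": {"model": "ada", "modelVersion": "002"}},  # placeholder
    )
    # the instance is then passed as `destination=dest` to dlt.pipeline(...)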
+ + Args: + credentials: Weaviate credentials containing URL, API key and optional headers + vectorizer: The name of the Weaviate vectorizer to use + module_config: The configuration for the Weaviate modules + **kwargs: Additional arguments forwarded to the destination config + """ + super().__init__( + credentials=credentials, + vectorizer=vectorizer, + module_config=module_config, + **kwargs + ) diff --git a/dlt/destinations/weaviate/naming.py b/dlt/destinations/impl/weaviate/naming.py similarity index 100% rename from dlt/destinations/weaviate/naming.py rename to dlt/destinations/impl/weaviate/naming.py diff --git a/dlt/destinations/weaviate/weaviate_adapter.py b/dlt/destinations/impl/weaviate/weaviate_adapter.py similarity index 97% rename from dlt/destinations/weaviate/weaviate_adapter.py rename to dlt/destinations/impl/weaviate/weaviate_adapter.py index 6829197273..bbb3f1c9da 100644 --- a/dlt/destinations/weaviate/weaviate_adapter.py +++ b/dlt/destinations/impl/weaviate/weaviate_adapter.py @@ -1,8 +1,7 @@ from typing import Dict, Any, Literal, Set, get_args from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns -from dlt.extract.decorators import resource as make_resource -from dlt.extract.source import DltResource +from dlt.extract import DltResource, resource as make_resource TTokenizationTMethod = Literal["word", "lowercase", "whitespace", "field"] TOKENIZATION_METHODS: Set[TTokenizationTMethod] = set(get_args(TTokenizationTMethod)) diff --git a/dlt/destinations/weaviate/weaviate_client.py b/dlt/destinations/impl/weaviate/weaviate_client.py similarity index 98% rename from dlt/destinations/weaviate/weaviate_client.py rename to dlt/destinations/impl/weaviate/weaviate_client.py index d47f08ab59..099cdc7368 100644 --- a/dlt/destinations/weaviate/weaviate_client.py +++ b/dlt/destinations/impl/weaviate/weaviate_client.py @@ -41,13 +41,13 @@ from dlt.common.data_types import TDataType from dlt.common.storages import FileStorage -from dlt.destinations.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT +from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations.job_client_impl import StorageSchemaInfo, StateInfo -from dlt.destinations.weaviate import capabilities -from dlt.destinations.weaviate.configuration import WeaviateClientConfiguration -from dlt.destinations.weaviate.exceptions import PropertyNameConflict, WeaviateBatchError +from dlt.destinations.impl.weaviate import capabilities +from dlt.destinations.impl.weaviate.configuration import WeaviateClientConfiguration +from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict, WeaviateBatchError from dlt.destinations.type_mapping import TypeMapper diff --git a/dlt/destinations/qdrant/__init__.py b/dlt/destinations/qdrant/__init__.py deleted file mode 100644 index 7a8619ffcd..0000000000 --- a/dlt/destinations/qdrant/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.destination.reference import ( - JobClientBase, - DestinationClientConfiguration, -) -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.destinations.qdrant.qdrant_adapter import qdrant_adapter - -from dlt.destinations.qdrant.configuration import QdrantClientConfiguration - - -@with_config( - 
spec=QdrantClientConfiguration, - sections=( - known_sections.DESTINATION, - "qdrant", - ), -) -def _configure( - config: QdrantClientConfiguration = config.value, -) -> QdrantClientConfiguration: - return config - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl"] - - caps.max_identifier_length = 200 - caps.max_column_identifier_length = 1024 - caps.max_query_length = 8 * 1024 * 1024 - caps.is_max_query_length_in_bytes = False - caps.max_text_data_type_length = 8 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = False - caps.supports_ddl_transactions = False - - return caps - - -def client( - schema: Schema, initial_config: DestinationClientConfiguration = config.value -) -> JobClientBase: - from dlt.destinations.qdrant.qdrant_client import QdrantClient - return QdrantClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[QdrantClientConfiguration]: - return QdrantClientConfiguration diff --git a/dlt/destinations/weaviate/__init__.py b/dlt/destinations/weaviate/__init__.py deleted file mode 100644 index ebd87aea0c..0000000000 --- a/dlt/destinations/weaviate/__init__.py +++ /dev/null @@ -1,55 +0,0 @@ -from typing import Type - -from dlt.common.schema.schema import Schema -from dlt.common.configuration import with_config, known_sections -from dlt.common.configuration.accessors import config -from dlt.common.destination.reference import ( - JobClientBase, - DestinationClientConfiguration, -) -from dlt.common.destination import DestinationCapabilitiesContext - -from dlt.destinations.weaviate.weaviate_adapter import weaviate_adapter -from dlt.destinations.weaviate.configuration import WeaviateClientConfiguration - - -@with_config( - spec=WeaviateClientConfiguration, - sections=( - known_sections.DESTINATION, - "weaviate", - ), -) -def _configure( - config: WeaviateClientConfiguration = config.value, -) -> WeaviateClientConfiguration: - return config - - -def capabilities() -> DestinationCapabilitiesContext: - caps = DestinationCapabilitiesContext() - caps.preferred_loader_file_format = "jsonl" - caps.supported_loader_file_formats = ["jsonl"] - - caps.max_identifier_length = 200 - caps.max_column_identifier_length = 1024 - caps.max_query_length = 8 * 1024 * 1024 - caps.is_max_query_length_in_bytes = False - caps.max_text_data_type_length = 8 * 1024 * 1024 - caps.is_max_text_data_type_length_in_bytes = False - caps.supports_ddl_transactions = False - caps.naming_convention = "dlt.destinations.weaviate.naming" - - return caps - - -def client( - schema: Schema, initial_config: DestinationClientConfiguration = config.value -) -> JobClientBase: - from dlt.destinations.weaviate.weaviate_client import WeaviateClient - - return WeaviateClient(schema, _configure(initial_config)) # type: ignore - - -def spec() -> Type[WeaviateClientConfiguration]: - return WeaviateClientConfiguration diff --git a/dlt/extract/__init__.py b/dlt/extract/__init__.py index e69de29bb2..cc6ff15759 100644 --- a/dlt/extract/__init__.py +++ b/dlt/extract/__init__.py @@ -0,0 +1,7 @@ +from dlt.extract.resource import DltResource, with_table_name +from dlt.extract.source import DltSource +from dlt.extract.decorators import source, resource, transformer, defer +from dlt.extract.incremental import Incremental +from dlt.extract.wrappers import wrap_additional_type + +__all__ = ["DltResource", "DltSource", "with_table_name", "source", "resource", 
"transformer", "defer", "Incremental", "wrap_additional_type"] diff --git a/dlt/extract/decorators.py b/dlt/extract/decorators.py index fd06df4d16..fbe712fae8 100644 --- a/dlt/extract/decorators.py +++ b/dlt/extract/decorators.py @@ -15,7 +15,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.source import _SOURCES, SourceInfo from dlt.common.schema.schema import Schema -from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat +from dlt.common.schema.typing import TColumnNames, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TSchemaContract, TTableFormat from dlt.extract.utils import ensure_table_schema_columns_hint, simulate_func_call, wrap_compat_transformer, wrap_resource_gen from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages.schema_storage import SchemaStorage @@ -25,7 +25,8 @@ from dlt.extract.incremental import IncrementalResourceWrapper from dlt.extract.typing import TTableHintTemplate -from dlt.extract.source import DltResource, DltSource, TUnboundDltResource +from dlt.extract.source import DltSource +from dlt.extract.resource import DltResource, TUnboundDltResource @configspec @@ -53,9 +54,10 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] -) -> Callable[TSourceFunParams, TDltSourceImpl]: +) -> Callable[TSourceFunParams, DltSource]: ... @overload @@ -67,6 +69,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] ) -> Callable[[Callable[TSourceFunParams, Any]], Callable[TSourceFunParams, TDltSourceImpl]]: @@ -80,6 +83,7 @@ def source( max_table_nesting: int = None, root_key: bool = False, schema: Schema = None, + schema_contract: TSchemaContract = None, spec: Type[BaseConfiguration] = None, _impl_cls: Type[TDltSourceImpl] = DltSource # type: ignore[assignment] ) -> Any: @@ -115,6 +119,8 @@ def source( schema (Schema, optional): An explicit `Schema` instance to be associated with the source. If not present, `dlt` creates a new `Schema` object with provided `name`. If such `Schema` already exists in the same folder as the module containing the decorated function, such schema will be loaded from file. + schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to this resource. + spec (Type[BaseConfiguration], optional): A specification of configuration and secret values required by the source. 
_impl_cls (Type[TDltSourceImpl], optional): A custom implementation of DltSource, may be also used to providing just a typing stub @@ -122,7 +128,6 @@ def source( Returns: `DltSource` instance """ - if name and schema: raise ArgumentsOverloadException("'name' has no effect when `schema` argument is present", source.__name__) @@ -172,6 +177,7 @@ def _wrap(*args: Any, **kwargs: Any) -> TDltSourceImpl: # apply hints if max_table_nesting is not None: s.max_table_nesting = max_table_nesting + s.schema_contract = schema_contract # enable root propagation s.root_key = root_key return s @@ -203,6 +209,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None @@ -219,6 +226,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None @@ -235,6 +243,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, @@ -253,6 +262,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None @@ -269,6 +279,7 @@ def resource( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None, selected: bool = True, spec: Type[BaseConfiguration] = None, @@ -319,6 +330,7 @@ def resource( merge_key (str | Sequence[str]): A column name or a list of column names that define a merge key. Typically used with "merge" write disposition to remove overlapping data ranges ie. to keep a single record for a given day. This argument also accepts a callable that is used to dynamically create tables for stream-like resources yielding many datatypes. + schema_contract (TSchemaContract, optional): Schema contract settings that will be applied to all resources of this source (if not overridden in the resource itself) table_format (Literal["iceberg"], optional): Defines the storage format of the table. Currently only "iceberg" is supported on Athena, other destinations ignore this hint. selected (bool, optional): When `True` `dlt pipeline` will extract and load this resource, if `False`, the resource will be ignored. 
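A rough usage sketch of the `schema_contract` hint these hunks add to both decorators (the resource and field names are hypothetical; the accepted keys and modes come from `TSchemaContract` in `dlt.common.schema.typing`, and the extractor changes further below handle at least "evolve", "discard_row" and "discard_value"):

import dlt

@dlt.resource(
    name="events",
    # per-entity contract dict; TSchemaContract also accepts a single mode string
    schema_contract={"tables": "evolve", "columns": "discard_value"},
)
def events():
    yield {"id": 1, "payload": "ok"}

@dlt.source(schema_contract="evolve")  # shorthand: one mode applied to all entities
def my_source():
    return events
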
@@ -343,6 +355,7 @@ def make_resource(_name: str, _section: str, _data: Any, incremental: Incrementa columns=columns, primary_key=primary_key, merge_key=merge_key, + schema_contract=schema_contract, table_format=table_format ) return DltResource.from_data(_data, _name, _section, table_template, selected, cast(DltResource, data_from), incremental=incremental) diff --git a/dlt/extract/exceptions.py b/dlt/extract/exceptions.py index e540a2468f..351b85a9d8 100644 --- a/dlt/extract/exceptions.py +++ b/dlt/extract/exceptions.py @@ -264,11 +264,3 @@ def __init__(self, source_name: str, schema_name: str) -> None: class IncrementalUnboundError(DltResourceException): def __init__(self, cursor_path: str) -> None: super().__init__("", f"The incremental definition with cursor path {cursor_path} is used without being bound to the resource. This most often happens when you create dynamic resource from a generator function that uses incremental. See https://dlthub.com/docs/general-usage/incremental-loading#incremental-loading-with-last-value for an example.") - - -class ValidationError(ValueError, DltException): - def __init__(self, validator: ValidateItem, data_item: TDataItems, original_exception: Exception) ->None: - self.original_exception = original_exception - self.validator = validator - self.data_item = data_item - super().__init__(f"Extracted data item could not be validated with {validator}. Original message: {original_exception}") diff --git a/dlt/extract/extract.py b/dlt/extract/extract.py index 3f71943579..1276f1b1f5 100644 --- a/dlt/extract/extract.py +++ b/dlt/extract/extract.py @@ -1,258 +1,22 @@ import contextlib -import os -from typing import ClassVar, List, Set, Dict, Type, Any, Sequence, Optional -from collections import defaultdict +from typing import Set, Dict, Optional, Set from dlt.common.configuration.container import Container from dlt.common.configuration.resolve import inject_section -from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.configuration.specs import ConfigSectionContext, known_sections from dlt.common.pipeline import reset_resource_state from dlt.common.data_writers import TLoaderFileFormat -from dlt.common.exceptions import MissingDependencyException from dlt.common.runtime import signals from dlt.common.runtime.collector import Collector, NULL_COLLECTOR -from dlt.common.utils import uniq_id -from dlt.common.typing import TDataItems, TDataItem -from dlt.common.schema import Schema, utils, TSchemaUpdate -from dlt.common.schema.typing import TColumnSchema, TTableSchemaColumns -from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage -from dlt.common.configuration.specs import known_sections +from dlt.common.schema import utils from dlt.extract.decorators import SourceSchemaInjectableContext from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints from dlt.extract.pipe import PipeIterator -from dlt.extract.source import DltResource, DltSource -from dlt.extract.typing import TableNameMeta -try: - from dlt.common.libs import pyarrow - from dlt.common.libs.pyarrow import pyarrow as pa -except MissingDependencyException: - pyarrow = None -try: - import pandas as pd -except ModuleNotFoundError: - pd = None - - -class ExtractorItemStorage(DataItemStorage): - load_file_type: TLoaderFileFormat - - def __init__(self, storage: FileStorage, extract_folder: str="extract") -> None: - # data item storage with jsonl with pua encoding - 
super().__init__(self.load_file_type) - self.extract_folder = extract_folder - self.storage = storage - - - def _get_data_item_path_template(self, load_id: str, schema_name: str, table_name: str) -> str: - template = NormalizeStorage.build_extracted_file_stem(schema_name, table_name, "%s") - return self.storage.make_full_path(os.path.join(self._get_extract_path(load_id), template)) - - def _get_extract_path(self, extract_id: str) -> str: - return os.path.join(self.extract_folder, extract_id) - - -class JsonLExtractorStorage(ExtractorItemStorage): - load_file_type: TLoaderFileFormat = "puae-jsonl" - - -class ArrowExtractorStorage(ExtractorItemStorage): - load_file_type: TLoaderFileFormat = "arrow" - - -class ExtractorStorage(NormalizeStorage): - EXTRACT_FOLDER: ClassVar[str] = "extract" - - """Wrapper around multiple extractor storages with different file formats""" - def __init__(self, C: NormalizeStorageConfiguration) -> None: - super().__init__(True, C) - self._item_storages: Dict[TLoaderFileFormat, ExtractorItemStorage] = { - "puae-jsonl": JsonLExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), - "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER) - } - - def _get_extract_path(self, extract_id: str) -> str: - return os.path.join(self.EXTRACT_FOLDER, extract_id) - - def create_extract_id(self) -> str: - extract_id = uniq_id() - self.storage.create_folder(self._get_extract_path(extract_id)) - return extract_id - - def get_storage(self, loader_file_format: TLoaderFileFormat) -> ExtractorItemStorage: - return self._item_storages[loader_file_format] - - def close_writers(self, extract_id: str) -> None: - for storage in self._item_storages.values(): - storage.close_writers(extract_id) - - def commit_extract_files(self, extract_id: str, with_delete: bool = True) -> None: - extract_path = self._get_extract_path(extract_id) - for file in self.storage.list_folder_files(extract_path, to_root=False): - from_file = os.path.join(extract_path, file) - to_file = os.path.join(NormalizeStorage.EXTRACTED_FOLDER, file) - if with_delete: - self.storage.atomic_rename(from_file, to_file) - else: - # create hardlink which will act as a copy - self.storage.link_hard(from_file, to_file) - if with_delete: - self.storage.delete_folder(extract_path, recursively=True) - - def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: - self.get_storage(file_format).write_data_item(load_id, schema_name, table_name, item, columns) - - - -class Extractor: - file_format: TLoaderFileFormat - dynamic_tables: TSchemaUpdate - def __init__( - self, - extract_id: str, - storage: ExtractorStorage, - schema: Schema, - resources_with_items: Set[str], - dynamic_tables: TSchemaUpdate, - collector: Collector = NULL_COLLECTOR - ) -> None: - self._storage = storage - self.schema = schema - self.dynamic_tables = dynamic_tables - self.collector = collector - self.resources_with_items = resources_with_items - self.extract_id = extract_id - - @property - def storage(self) -> ExtractorItemStorage: - return self._storage.get_storage(self.file_format) - - @staticmethod - def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: - """Detect the loader file format of the data items based on type. - Currently this is either 'arrow' or 'puae-jsonl' - - Returns: - The loader file format or `None` if if can't be detected. 
- """ - for item in items if isinstance(items, list) else [items]: - # Assume all items in list are the same type - if (pyarrow and pyarrow.is_arrow_item(item)) or (pd and isinstance(item, pd.DataFrame)): - return "arrow" - return "puae-jsonl" - return None # Empty list is unknown format - - def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: - if isinstance(meta, TableNameMeta): - table_name = meta.table_name - self._write_static_table(resource, table_name, items) - self._write_item(table_name, resource.name, items) - else: - if resource._table_name_hint_fun: - if isinstance(items, list): - for item in items: - self._write_dynamic_table(resource, item) - else: - self._write_dynamic_table(resource, items) - else: - # write item belonging to table with static name - table_name = resource.table_name # type: ignore[assignment] - self._write_static_table(resource, table_name, items) - self._write_item(table_name, resource.name, items) - - def write_empty_file(self, table_name: str) -> None: - table_name = self.schema.naming.normalize_table_identifier(table_name) - self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) - - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - # normalize table name before writing so the name match the name in schema - # note: normalize function should be cached so there's almost no penalty on frequent calling - # note: column schema is not required for jsonl writer used here - table_name = self.schema.naming.normalize_identifier(table_name) - self.collector.update(table_name) - self.resources_with_items.add(resource_name) - self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) - - def _write_dynamic_table(self, resource: DltResource, item: TDataItem) -> None: - table_name = resource._table_name_hint_fun(item) - existing_table = self.dynamic_tables.get(table_name) - if existing_table is None: - self.dynamic_tables[table_name] = [resource.compute_table_schema(item)] - else: - # quick check if deep table merge is required - if resource._table_has_other_dynamic_hints: - new_table = resource.compute_table_schema(item) - # this merges into existing table in place - utils.merge_tables(existing_table[0], new_table) - else: - # if there are no other dynamic hints besides name then we just leave the existing partial table - pass - # write to storage with inferred table name - self._write_item(table_name, resource.name, item) - - def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: - existing_table = self.dynamic_tables.get(table_name) - if existing_table is None: - static_table = resource.compute_table_schema() - static_table["name"] = table_name - self.dynamic_tables[table_name] = [static_table] - - -class JsonLExtractor(Extractor): - file_format = "puae-jsonl" - - -class ArrowExtractor(Extractor): - file_format = "arrow" - - def _rename_columns(self, items: List[TDataItem], new_column_names: List[str]) -> List[TDataItem]: - """Rename arrow columns to normalized schema column names""" - if not items: - return items - if items[0].schema.names == new_column_names: - # No need to rename - return items - if isinstance(items[0], pyarrow.pyarrow.Table): - return [item.rename_columns(new_column_names) for item in items] - elif isinstance(items[0], pyarrow.pyarrow.RecordBatch): - # Convert the batches to table -> rename -> then back to batches - return 
pa.Table.from_batches(items).rename_columns(new_column_names).to_batches() # type: ignore[no-any-return] - else: - raise TypeError(f"Unsupported data item type {type(items[0])}") - - def write_table(self, resource: DltResource, items: TDataItems, meta: Any) -> None: - items = [ - # 2. Remove null-type columns from the table(s) as they can't be loaded - pyarrow.remove_null_columns(tbl) for tbl in ( - # 1. Convert pandas frame(s) to arrow Table - pyarrow.pyarrow.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item - for item in (items if isinstance(items, list) else [items]) - ) - ] - super().write_table(resource, items, meta) - - def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: - # Note: `items` is always a list here due to the conversion in `write_table` - new_columns = list(self.dynamic_tables[table_name][0]["columns"].keys()) - super()._write_item(table_name, resource_name, self._rename_columns(items, new_columns), self.dynamic_tables[table_name][0]["columns"]) - - def _write_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: - existing_table = self.dynamic_tables.get(table_name) - if existing_table is not None: - return - static_table = resource.compute_table_schema() - if isinstance(items, list): - item = items[0] - else: - item = items - # Merge the columns to include primary_key and other hints that may be set on the resource - arrow_columns = pyarrow.py_arrow_to_table_schema_columns(item.schema) - for key, value in static_table["columns"].items(): - arrow_columns[key] = utils.merge_columns(value, arrow_columns.get(key, {})) - static_table["columns"] = arrow_columns - static_table["name"] = table_name - self.dynamic_tables[table_name] = [self.schema.normalize_table_identifiers(static_table)] +from dlt.extract.source import DltSource +from dlt.extract.storage import ExtractorStorage +from dlt.extract.extractors import JsonLExtractor, ArrowExtractor, Extractor def extract( @@ -264,16 +28,15 @@ def extract( max_parallel_items: int = None, workers: int = None, futures_poll_interval: float = None -) -> TSchemaUpdate: - dynamic_tables: TSchemaUpdate = {} +) -> None: schema = source.schema resources_with_items: Set[str] = set() extractors: Dict[TLoaderFileFormat, Extractor] = { "puae-jsonl": JsonLExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, collector=collector ), "arrow": ArrowExtractor( - extract_id, storage, schema, resources_with_items, dynamic_tables, collector=collector + extract_id, storage, schema, resources_with_items, collector=collector ) } last_item_format: Optional[TLoaderFileFormat] = None @@ -296,7 +59,7 @@ def extract( resource = source.resources[pipe_item.pipe.name] # Fallback to last item's format or default (puae-jsonl) if the current item is an empty list item_format = Extractor.item_format(pipe_item.item) or last_item_format or "puae-jsonl" - extractors[item_format].write_table(resource, pipe_item.item, pipe_item.meta) + extractors[item_format].write_items(resource, pipe_item.item, pipe_item.meta) last_item_format = item_format # find defined resources that did not yield any pipeitems and create empty jobs for them @@ -310,7 +73,7 @@ def extract( for table in tables_by_resources[resource.name]: # we only need to write empty files for the top tables if not table.get("parent", None): - extractors[last_item_format or 
"puae-jsonl"].write_empty_file(table["name"]) + extractors["puae-jsonl"].write_empty_file(table["name"]) if left_gens > 0: # go to 100% @@ -319,21 +82,17 @@ def extract( # flush all buffered writers storage.close_writers(extract_id) - # returns set of partial tables - return dynamic_tables - def extract_with_schema( storage: ExtractorStorage, source: DltSource, - schema: Schema, collector: Collector, max_parallel_items: int, - workers: int + workers: int, ) -> str: # generate extract_id to be able to commit all the sources together later extract_id = storage.create_extract_id() - with Container().injectable_context(SourceSchemaInjectableContext(schema)): + with Container().injectable_context(SourceSchemaInjectableContext(source.schema)): # inject the config section with the current source name with inject_section(ConfigSectionContext(sections=(known_sections.SOURCES, source.section, source.name), source_state_key=source.name)): # reset resource states, the `extracted` list contains all the explicit resources and all their parents @@ -341,11 +100,6 @@ def extract_with_schema( with contextlib.suppress(DataItemRequiredForDynamicTableHints): if resource.write_disposition == "replace": reset_resource_state(resource.name) - - extractor = extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) - # iterate over all items in the pipeline and update the schema if dynamic table hints were present - for _, partials in extractor.items(): - for partial in partials: - schema.update_table(schema.normalize_table_identifiers(partial)) + extract(extract_id, source, storage, collector, max_parallel_items=max_parallel_items, workers=workers) return extract_id diff --git a/dlt/extract/extractors.py b/dlt/extract/extractors.py new file mode 100644 index 0000000000..0ec8aed968 --- /dev/null +++ b/dlt/extract/extractors.py @@ -0,0 +1,246 @@ +from copy import copy +from typing import Set, Dict, Any, Optional, Set + +from dlt.common.configuration.inject import with_config +from dlt.common.configuration.specs import BaseConfiguration, configspec +from dlt.common.destination.capabilities import DestinationCapabilitiesContext +from dlt.common.data_writers import TLoaderFileFormat +from dlt.common.exceptions import MissingDependencyException + +from dlt.common.runtime.collector import Collector, NULL_COLLECTOR +from dlt.common.utils import update_dict_nested +from dlt.common.typing import TDataItems, TDataItem +from dlt.common.schema import Schema, utils +from dlt.common.schema.typing import TSchemaContractDict, TSchemaEvolutionMode, TTableSchema, TTableSchemaColumns, TPartialTableSchema + +from dlt.extract.resource import DltResource +from dlt.extract.typing import TableNameMeta +from dlt.extract.storage import ExtractorStorage, ExtractorItemStorage +try: + from dlt.common.libs import pyarrow + from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem +except MissingDependencyException: + pyarrow = None + +try: + import pandas as pd +except ModuleNotFoundError: + pd = None + + +class Extractor: + file_format: TLoaderFileFormat + + @configspec + class ExtractorConfiguration(BaseConfiguration): + _caps: Optional[DestinationCapabilitiesContext] = None + + @with_config(spec=ExtractorConfiguration) + def __init__( + self, + extract_id: str, + storage: ExtractorStorage, + schema: Schema, + resources_with_items: Set[str], + collector: Collector = NULL_COLLECTOR, + *, + _caps: DestinationCapabilitiesContext = None + ) -> None: + self.schema = schema + self.naming = 
schema.naming + self.collector = collector + self.resources_with_items = resources_with_items + self.extract_id = extract_id + self._table_contracts: Dict[str, TSchemaContractDict] = {} + self._filtered_tables: Set[str] = set() + self._filtered_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] = {} + self._storage = storage + self._caps = _caps or DestinationCapabilitiesContext.generic_capabilities() + + @property + def storage(self) -> ExtractorItemStorage: + return self._storage.get_storage(self.file_format) + + @staticmethod + def item_format(items: TDataItems) -> Optional[TLoaderFileFormat]: + """Detect the loader file format of the data items based on type. + Currently this is either 'arrow' or 'puae-jsonl' + + Returns: + The loader file format or `None` if if can't be detected. + """ + for item in items if isinstance(items, list) else [items]: + # Assume all items in list are the same type + if (pyarrow and pyarrow.is_arrow_item(item)) or (pd and isinstance(item, pd.DataFrame)): + return "arrow" + return "puae-jsonl" + return None # Empty list is unknown format + + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + """Write `items` to `resource` optionally computing table schemas and revalidating/filtering data""" + if table_name := self._get_static_table_name(resource, meta): + # write item belonging to table with static name + self._write_to_static_table(resource, table_name, items) + else: + # table has name or other hints depending on data items + self._write_to_dynamic_table(resource, items) + + def write_empty_file(self, table_name: str) -> None: + table_name = self.naming.normalize_table_identifier(table_name) + self.storage.write_empty_file(self.extract_id, self.schema.name, table_name, None) + + def _get_static_table_name(self, resource: DltResource, meta: Any) -> Optional[str]: + if resource._table_name_hint_fun: + return None + if isinstance(meta, TableNameMeta): + table_name = meta.table_name + else: + table_name = resource.table_name # type: ignore[assignment] + return self.naming.normalize_table_identifier(table_name) + + def _get_dynamic_table_name(self, resource: DltResource, item: TDataItem) -> str: + return self.naming.normalize_table_identifier(resource._table_name_hint_fun(item)) + + def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: + new_rows_count = self.storage.write_data_item(self.extract_id, self.schema.name, table_name, items, columns) + self.collector.update(table_name, inc=new_rows_count) + self.resources_with_items.add(resource_name) + + def _write_to_dynamic_table(self, resource: DltResource, items: TDataItems) -> None: + if not isinstance(items, list): + items = [items] + + for item in items: + table_name = self._get_dynamic_table_name(resource, item) + if table_name in self._filtered_tables: + continue + if table_name not in self._table_contracts or resource._table_has_other_dynamic_hints: + item = self._compute_and_update_table(resource, table_name, item) + # write to storage with inferred table name + if table_name not in self._filtered_tables: + self._write_item(table_name, resource.name, item) + + def _write_to_static_table(self, resource: DltResource, table_name: str, items: TDataItems) -> None: + if table_name not in self._table_contracts: + items = self._compute_and_update_table(resource, table_name, items) + if table_name not in self._filtered_tables: + self._write_item(table_name, resource.name, items) + + def _compute_table(self, 
resource: DltResource, items: TDataItems) -> TTableSchema: + """Computes a schema for a new or dynamic table and normalizes identifiers""" + return self.schema.normalize_table_identifiers( + resource.compute_table_schema(items) + ) + + def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems: + """ + Computes new table and does contract checks, if false is returned, the table may not be created and not items should be written + """ + computed_table = self._compute_table(resource, items) + # overwrite table name (if coming from meta) + computed_table["name"] = table_name + # get or compute contract + schema_contract = self._table_contracts.setdefault( + table_name, + self.schema.resolve_contract_settings_for_table(table_name, computed_table) + ) + + # this is a new table so allow evolve once + if schema_contract["columns"] != "evolve" and self.schema.is_new_table(table_name): + computed_table["x-normalizer"] = {"evolve-columns-once": True} # type: ignore[typeddict-unknown-key] + existing_table = self.schema._schema_tables.get(table_name, None) + if existing_table: + diff_table = utils.diff_tables(existing_table, computed_table) + else: + diff_table = computed_table + + # apply contracts + diff_table, filters = self.schema.apply_schema_contract(schema_contract, diff_table, data_item=items) + + # merge with schema table + if diff_table: + self.schema.update_table(diff_table) + + # process filters + if filters: + for entity, name, mode in filters: + if entity == "tables": + self._filtered_tables.add(name) + elif entity == "columns": + filtered_columns = self._filtered_columns.setdefault(table_name, {}) + filtered_columns[name] = mode + return items + + +class JsonLExtractor(Extractor): + file_format = "puae-jsonl" + + +class ArrowExtractor(Extractor): + file_format = "arrow" + + def write_items(self, resource: DltResource, items: TDataItems, meta: Any) -> None: + static_table_name = self._get_static_table_name(resource, meta) + items = [ + # 3. remove columns and rows in data contract filters + # 2. Remove null-type columns from the table(s) as they can't be loaded + self._apply_contract_filters(pyarrow.remove_null_columns(tbl), resource, static_table_name) for tbl in ( + # 1. 
Convert pandas frame(s) to arrow Table + pa.Table.from_pandas(item) if (pd and isinstance(item, pd.DataFrame)) else item + for item in (items if isinstance(items, list) else [items]) + ) + ] + super().write_items(resource, items, meta) + + def _apply_contract_filters(self, item: "TAnyArrowItem", resource: DltResource, static_table_name: Optional[str]) -> "TAnyArrowItem": + """Removes the columns (discard value) or rows (discard rows) as indicated by contract filters.""" + # convert arrow schema names into normalized names + rename_mapping = pyarrow.get_normalized_arrow_fields_mapping(item, self.naming) + # find matching columns and delete by original name + table_name = static_table_name or self._get_dynamic_table_name(resource, item) + filtered_columns = self._filtered_columns.get(table_name) + if filtered_columns: + # remove rows where columns have non null values + # create a mask where rows will be False if any of the specified columns are non-null + mask = None + rev_mapping = {v: k for k, v in rename_mapping.items()} + for column in [name for name, mode in filtered_columns.items() if mode == "discard_row"]: + is_null = pyarrow.pyarrow.compute.is_null(item[rev_mapping[column]]) + mask = is_null if mask is None else pyarrow.pyarrow.compute.and_(mask, is_null) + # filter the table using the mask + if mask is not None: + item = item.filter(mask) + + # remove value actually removes the whole columns from the table + # NOTE: filtered columns has normalized column names so we need to go through mapping + removed_columns = [name for name in rename_mapping if filtered_columns.get(rename_mapping[name]) is not None] + if removed_columns: + item = pyarrow.remove_columns(item, removed_columns) + + return item + + def _write_item(self, table_name: str, resource_name: str, items: TDataItems, columns: TTableSchemaColumns = None) -> None: + columns = columns or self.schema.tables[table_name]["columns"] + # Note: `items` is always a list here due to the conversion in `write_table` + items = [pyarrow.normalize_py_arrow_schema(item, columns, self.naming, self._caps) for item in items] + super()._write_item(table_name, resource_name, items, columns) + + def _compute_table(self, resource: DltResource, items: TDataItems) -> TPartialTableSchema: + items = items[0] + computed_table = super()._compute_table(resource, items) + + # Merge the columns to include primary_key and other hints that may be set on the resource + arrow_table = copy(computed_table) + arrow_table["columns"] = pyarrow.py_arrow_to_table_schema_columns(items.schema) + # normalize arrow table before merging + arrow_table = self.schema.normalize_table_identifiers(arrow_table) + # we must override the columns to preserve the order in arrow table + arrow_table["columns"] = update_dict_nested(arrow_table["columns"], computed_table["columns"]) + + return arrow_table + + def _compute_and_update_table(self, resource: DltResource, table_name: str, items: TDataItems) -> TDataItems: + items = super()._compute_and_update_table(resource, table_name, items) + # filter data item as filters could be updated in compute table + items = [self._apply_contract_filters(item, resource, table_name) for item in items] + return items diff --git a/dlt/extract/schema.py b/dlt/extract/hints.py similarity index 74% rename from dlt/extract/schema.py rename to dlt/extract/hints.py index c1dfd1f7f5..19d503f970 100644 --- a/dlt/extract/schema.py +++ b/dlt/extract/hints.py @@ -1,9 +1,8 @@ from copy import copy, deepcopy -from collections.abc import Mapping as C_Mapping from 
typing import List, TypedDict, cast, Any from dlt.common.schema.utils import DEFAULT_WRITE_DISPOSITION, merge_columns, new_column, new_table -from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat +from dlt.common.schema.typing import TColumnNames, TColumnProp, TColumnSchema, TPartialTableSchema, TTableSchema, TTableSchemaColumns, TWriteDisposition, TAnySchemaColumns, TTableFormat, TSchemaContract from dlt.common.typing import TDataItem from dlt.common.utils import update_dict_nested from dlt.common.validation import validate_dict_ignoring_xkeys @@ -12,7 +11,7 @@ from dlt.extract.typing import TFunHintTemplate, TTableHintTemplate, ValidateItem from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, TableNameMissing from dlt.extract.utils import ensure_table_schema_columns, ensure_table_schema_columns_hint -from dlt.extract.validation import get_column_validator +from dlt.extract.validation import create_item_validator class TTableSchemaTemplate(TypedDict, total=False): @@ -25,10 +24,12 @@ class TTableSchemaTemplate(TypedDict, total=False): primary_key: TTableHintTemplate[TColumnNames] merge_key: TTableHintTemplate[TColumnNames] incremental: Incremental[Any] + schema_contract: TTableHintTemplate[TSchemaContract] validator: ValidateItem + original_columns: TTableHintTemplate[TAnySchemaColumns] -class DltResourceSchema: +class DltResourceHints: def __init__(self, table_schema_template: TTableSchemaTemplate = None): self.__qualname__ = self.__name__ = self.name self._table_name_hint_fun: TFunHintTemplate[str] = None @@ -70,7 +71,11 @@ def columns(self) -> TTableHintTemplate[TTableSchemaColumns]: return None return self._table_schema_template.get("columns") - def compute_table_schema(self, item: TDataItem = None) -> TPartialTableSchema: + @property + def schema_contract(self) -> TTableHintTemplate[TSchemaContract]: + return self._table_schema_template.get("schema_contract") + + def compute_table_schema(self, item: TDataItem = None) -> TTableSchema: """Computes the table schema based on hints and column definitions passed during resource creation. 
`item` parameter is used to resolve table hints based on data""" if not self._table_schema_template: return new_table(self.name, resource=self.name) @@ -85,13 +90,11 @@ def compute_table_schema(self, item: TDataItem = None) -> TPartialTableSchema: if self._table_name_hint_fun and item is None: raise DataItemRequiredForDynamicTableHints(self.name) # resolve - resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items()} # type: ignore - resolved_template.pop("incremental", None) - resolved_template.pop("validator", None) + resolved_template: TTableSchemaTemplate = {k: self._resolve_hint(item, v) for k, v in table_template.items() if k not in ["incremental", "validator", "original_columns"]} # type: ignore table_schema = self._merge_keys(resolved_template) table_schema["resource"] = self.name validate_dict_ignoring_xkeys( - spec=TPartialTableSchema, + spec=TTableSchema, doc=table_schema, path=f"new_table/{self.name}", ) @@ -105,7 +108,8 @@ def apply_hints( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, - incremental: Incremental[Any] = None + incremental: Incremental[Any] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, ) -> None: """Creates or modifies existing table schema by setting provided hints. Accepts both static and dynamic hints based on data. @@ -122,10 +126,10 @@ def apply_hints( t = None if not self._table_schema_template: # if there's no template yet, create and set new one - t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key) + t = self.new_table_template(table_name, parent_table_name, write_disposition, columns, primary_key, merge_key, schema_contract) else: # set single hints - t = deepcopy(self._table_schema_template) + t = self._clone_table_template(self._table_schema_template) if table_name is not None: if table_name: t["name"] = table_name @@ -139,7 +143,8 @@ def apply_hints( if write_disposition: t["write_disposition"] = write_disposition if columns is not None: - t['validator'] = get_column_validator(columns) + # keep original columns: ie in case it is a Pydantic model + t["original_columns"] = columns # if callable then override existing if callable(columns) or callable(t["columns"]): t["columns"] = ensure_table_schema_columns_hint(columns) @@ -151,7 +156,6 @@ def apply_hints( else: # set to empty columns t["columns"] = ensure_table_schema_columns(columns) - if primary_key is not None: if primary_key: t["primary_key"] = primary_key @@ -162,13 +166,27 @@ def apply_hints( t["merge_key"] = merge_key else: t.pop("merge_key", None) + if schema_contract is not None: + if schema_contract: + t["schema_contract"] = schema_contract + else: + t.pop("schema_contract", None) + # recreate validator if columns definition or contract changed + if schema_contract is not None or columns is not None: + t["validator"], schema_contract = create_item_validator(t.get("original_columns"), t.get("schema_contract")) + if schema_contract is not None: + t["schema_contract"] = schema_contract # set properties that cannot be passed to new_table_template - t["incremental"] = incremental + if incremental is not None: + if incremental is Incremental.EMPTY: + t["incremental"] = None + else: + t["incremental"] = incremental self.set_template(t) def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: - 
DltResourceSchema.validate_dynamic_hints(table_schema_template) + DltResourceHints.validate_dynamic_hints(table_schema_template) # if "name" is callable in the template then the table schema requires actual data item to be inferred name_hint = table_schema_template.get("name") if callable(name_hint): @@ -179,13 +197,21 @@ def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: self._table_has_other_dynamic_hints = any(callable(v) for k, v in table_schema_template.items() if k != "name") self._table_schema_template = table_schema_template + @staticmethod + def _clone_table_template(template: TTableSchemaTemplate) -> TTableSchemaTemplate: + t_ = copy(template) + t_["columns"] = deepcopy(template["columns"]) + if "schema_contract" in template: + t_["schema_contract"] = deepcopy(template["schema_contract"]) + return t_ + @staticmethod def _resolve_hint(item: TDataItem, hint: TTableHintTemplate[Any]) -> Any: - """Calls each dynamic hint passing a data item""" - if callable(hint): - return hint(item) - else: - return hint + """Calls each dynamic hint passing a data item""" + if callable(hint): + return hint(item) + else: + return hint @staticmethod def _merge_key(hint: TColumnProp, keys: TColumnNames, partial: TPartialTableSchema) -> None: @@ -205,9 +231,9 @@ def _merge_keys(t_: TTableSchemaTemplate) -> TPartialTableSchema: # assert not callable(t_["merge_key"]) # assert not callable(t_["primary_key"]) if "primary_key" in t_: - DltResourceSchema._merge_key("primary_key", t_.pop("primary_key"), partial) # type: ignore + DltResourceHints._merge_key("primary_key", t_.pop("primary_key"), partial) # type: ignore if "merge_key" in t_: - DltResourceSchema._merge_key("merge_key", t_.pop("merge_key"), partial) # type: ignore + DltResourceHints._merge_key("merge_key", t_.pop("merge_key"), partial) # type: ignore return partial @@ -219,21 +245,29 @@ def new_table_template( columns: TTableHintTemplate[TAnySchemaColumns] = None, primary_key: TTableHintTemplate[TColumnNames] = None, merge_key: TTableHintTemplate[TColumnNames] = None, + schema_contract: TTableHintTemplate[TSchemaContract] = None, table_format: TTableHintTemplate[TTableFormat] = None - ) -> TTableSchemaTemplate: + ) -> TTableSchemaTemplate: + validator, schema_contract = create_item_validator(columns, schema_contract) + clean_columns = columns if columns is not None: - validator = get_column_validator(columns) - columns = ensure_table_schema_columns_hint(columns) - if not callable(columns): - columns = columns.values() # type: ignore - else: - validator = None + clean_columns = ensure_table_schema_columns_hint(columns) + if not callable(clean_columns): + clean_columns = clean_columns.values() # type: ignore # create a table schema template where hints can be functions taking TDataItem new_template: TTableSchemaTemplate = new_table( - table_name, parent_table_name, write_disposition=write_disposition, columns=columns, table_format=table_format # type: ignore + table_name, # type: ignore + parent_table_name, # type: ignore + write_disposition=write_disposition, # type: ignore + columns=clean_columns, # type: ignore + schema_contract=schema_contract, # type: ignore + table_format=table_format # type: ignore ) if not table_name: new_template.pop("name") + # remember original columns + if columns is not None: + new_template["original_columns"] = columns # always remove resource new_template.pop("resource", None) # type: ignore if primary_key: @@ -242,12 +276,12 @@ def new_table_template( new_template["merge_key"] = merge_key if 
validator: new_template["validator"] = validator - DltResourceSchema.validate_dynamic_hints(new_template) + DltResourceHints.validate_dynamic_hints(new_template) return new_template @staticmethod def validate_dynamic_hints(template: TTableSchemaTemplate) -> None: table_name = template.get("name") # if any of the hints is a function then name must be as well - if any(callable(v) for k, v in template.items() if k not in ["name", "incremental", "validator"]) and not callable(table_name): + if any(callable(v) for k, v in template.items() if k not in ["name", "incremental", "validator", "original_columns"]) and not callable(table_name): raise InconsistentTableTemplate(f"Table name {table_name} must be a function if any other table hint is a function") diff --git a/dlt/extract/incremental/__init__.py b/dlt/extract/incremental/__init__.py index 6d042aa15d..1c5fa7ab38 100644 --- a/dlt/extract/incremental/__init__.py +++ b/dlt/extract/incremental/__init__.py @@ -1,5 +1,5 @@ import os -from typing import Generic, Any, Optional, get_args, get_origin, Type, Dict +from typing import Generic, ClassVar, Any, Optional, get_args, get_origin, Type, Dict import inspect from functools import wraps @@ -69,11 +69,15 @@ class Incremental(ItemTransform[TDataItem], BaseConfiguration, Generic[TCursorVa The values passed explicitly to Incremental will be ignored. Note that if logical "end date" is present then also "end_value" will be set which means that resource state is not used and exactly this range of date will be loaded """ + # this is config/dataclass so declare members cursor_path: str = None # TODO: Support typevar here initial_value: Optional[Any] = None end_value: Optional[Any] = None + # incremental acting as empty + EMPTY: ClassVar["Incremental[Any]"] = None + def __init__( self, cursor_path: str = dlt.config.value, @@ -336,6 +340,8 @@ def __call__(self, rows: TDataItems, meta: Any = None) -> Optional[TDataItems]: return [item for item in (self._transform_item(transformer, row) for row in rows) if item is not None] return self._transform_item(transformer, rows) +Incremental.EMPTY = Incremental[Any]("") + class IncrementalResourceWrapper(ItemTransform[TDataItem]): _incremental: Optional[Incremental[Any]] = None diff --git a/dlt/extract/incremental/transform.py b/dlt/extract/incremental/transform.py index af45736da4..44538aa3f5 100644 --- a/dlt/extract/incremental/transform.py +++ b/dlt/extract/incremental/transform.py @@ -23,9 +23,11 @@ from dlt.extract.typing import TTableHintTemplate from dlt.common.schema.typing import TColumnNames try: + from dlt.common.libs import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa, TAnyArrowItem except MissingDependencyException: pa = None + pyarrow = None class IncrementalTransform: @@ -182,24 +184,7 @@ def _deduplicate(self, tbl: "pa.Table", unique_columns: Optional[List[str]], agg """Creates unique index if necessary.""" # create unique index if necessary if self._dlt_index not in tbl.schema.names: - tbl = tbl.append_column(self._dlt_index, pa.array(np.arange(tbl.num_rows))) - # code below deduplicates groups that include the cursor column in the group id. 
that was just artifact of - # json incremental and there's no need to duplicate it here - - # if unique_columns is None: - # return tbl - # group_cols = unique_columns + [cursor_path] - # try: - # tbl = tbl.filter( - # pa.compute.is_in( - # tbl[self._dlt_index], - # tbl.group_by(group_cols).aggregate( - # [(self._dlt_index, "one"), (cursor_path, aggregate)] - # )[f'{self._dlt_index}_one'] - # ) - # ) - # except KeyError as e: - # raise IncrementalPrimaryKeyMissing(self.resource_name, unique_columns[0], tbl) from e + tbl = pyarrow.append_column(tbl, self._dlt_index, pa.array(np.arange(tbl.num_rows))) return tbl def __call__( @@ -225,7 +210,7 @@ def __call__( if isinstance(primary_key, str): self._dlt_index = primary_key elif primary_key is None: - unique_columns = tbl.column_names + unique_columns = tbl.schema.names else: # deduplicating is disabled unique_columns = None @@ -312,7 +297,7 @@ def __call__( if len(tbl) == 0: return None, start_out_of_range, end_out_of_range try: - tbl = tbl.drop(["_dlt_index"]) + tbl = pyarrow.remove_columns(tbl, ["_dlt_index"]) except KeyError: pass if is_pandas: diff --git a/dlt/extract/resource.py b/dlt/extract/resource.py new file mode 100644 index 0000000000..2c3018e77d --- /dev/null +++ b/dlt/extract/resource.py @@ -0,0 +1,494 @@ +from copy import deepcopy +import inspect +from typing import AsyncIterable, AsyncIterator, ClassVar, Callable, Iterable, Iterator, Union, Any, Optional + +from dlt.common.configuration.resolve import inject_section +from dlt.common.configuration.specs import known_sections +from dlt.common.configuration.specs.config_section_context import ConfigSectionContext +from dlt.common.typing import AnyFun, DictStrAny, StrAny, TDataItem, TDataItems, NoneType +from dlt.common.configuration.container import Container +from dlt.common.pipeline import PipelineContext, StateInjectableContext, resource_state, pipeline_state +from dlt.common.utils import flatten_list_or_items, get_callable_name, uniq_id + +from dlt.extract.typing import (DataItemWithMeta, ItemTransformFunc, ItemTransformFunctionWithMeta, TableNameMeta, + FilterItem, MapItem, YieldMapItem, ValidateItem) +from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep +from dlt.extract.hints import DltResourceHints, TTableSchemaTemplate +from dlt.extract.incremental import Incremental, IncrementalResourceWrapper +from dlt.extract.exceptions import ( + InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, + InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, + InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, + InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer +) +from dlt.extract.wrappers import wrap_additional_type + + +def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: + """Marks `item` to be dispatched to table `table_name` when yielded from resource function.""" + return DataItemWithMeta(TableNameMeta(table_name), item) + + +class DltResource(Iterable[TDataItem], DltResourceHints): + """Implements dlt resource. 
Contains a data pipe that wraps a generating item and table schema that can be adjusted""" + Empty: ClassVar["DltResource"] = None + source_name: str + """Name of the source that contains this instance of the source, set when added to DltResourcesDict""" + section: str + """A config section name""" + + def __init__( + self, + pipe: Pipe, + table_schema_template: TTableSchemaTemplate, + selected: bool, + incremental: IncrementalResourceWrapper = None, + section: str = None, + args_bound: bool = False + ) -> None: + self.section = section + self.selected = selected + self._pipe = pipe + self._args_bound = args_bound + self._explicit_args: DictStrAny = None + if incremental and not self.incremental: + self.add_step(incremental) + self.source_name = None + super().__init__(table_schema_template) + + @classmethod + def from_data( + cls, + data: Any, + name: str = None, + section: str = None, + table_schema_template: TTableSchemaTemplate = None, + selected: bool = True, + data_from: Union["DltResource", Pipe] = None, + incremental: IncrementalResourceWrapper = None + ) -> "DltResource": + if data is None: + raise InvalidResourceDataTypeIsNone(name, data, NoneType) # type: ignore + + if isinstance(data, DltResource): + return data + + if isinstance(data, Pipe): + return cls(data, table_schema_template, selected, incremental=incremental, section=section) + + if callable(data): + name = name or get_callable_name(data) + + # if generator, take name from it + if inspect.isgenerator(data): + name = name or get_callable_name(data) # type: ignore + + # name is mandatory + if not name: + raise ResourceNameMissing() + + # wrap additional types + data = wrap_additional_type(data) + + # several iterable types are not allowed and must be excluded right away + if isinstance(data, (AsyncIterator, AsyncIterable)): + raise InvalidResourceDataTypeAsync(name, data, type(data)) + if isinstance(data, (str, dict)): + raise InvalidResourceDataTypeBasic(name, data, type(data)) + + # check if depends_on is a valid resource + parent_pipe: Pipe = None + if data_from is not None: + DltResource._ensure_valid_transformer_resource(name, data) + parent_pipe = DltResource._get_parent_pipe(name, data_from) + + # create resource from iterator, iterable or generator function + if isinstance(data, (Iterable, Iterator)) or callable(data): + pipe = Pipe.from_data(name, data, parent=parent_pipe) + return cls(pipe, table_schema_template, selected, incremental=incremental, section=section, args_bound=not callable(data)) + else: + # some other data type that is not supported + raise InvalidResourceDataType(name, data, type(data), f"The data type of supplied type is {type(data).__name__}") + + @property + def name(self) -> str: + """Resource name inherited from the pipe""" + return self._pipe.name + + def with_name(self, new_name: str) -> "DltResource": + """Clones the resource with a new name. 
Such resource keeps separate state and loads data to `new_name` table by default.""" + return self._clone(new_name=new_name, with_parent=True) + + @property + def is_transformer(self) -> bool: + """Checks if the resource is a transformer that takes data from another resource""" + return self._pipe.has_parent + + @property + def requires_args(self) -> bool: + """Checks if resource has unbound arguments""" + try: + self._pipe.ensure_gen_bound() + return False + except (TypeError, ParametrizedResourceUnbound): + return True + + @property + def incremental(self) -> IncrementalResourceWrapper: + """Gets incremental transform if it is in the pipe""" + incremental: IncrementalResourceWrapper = None + step_no = self._pipe.find(IncrementalResourceWrapper, Incremental) + if step_no >= 0: + incremental = self._pipe.steps[step_no] # type: ignore + return incremental + + @property + def validator(self) -> Optional[ValidateItem]: + """Gets validator transform if it is in the pipe""" + validator: ValidateItem = None + step_no = self._pipe.find(ValidateItem) + if step_no >= 0: + validator = self._pipe.steps[step_no] # type: ignore[assignment] + return validator + + @validator.setter + def validator(self, validator: Optional[ValidateItem]) -> None: + """Add/remove or replace the validator in pipe""" + step_no = self._pipe.find(ValidateItem) + if step_no >= 0: + self._pipe.remove_step(step_no) + if validator: + self.add_step(validator, insert_at=step_no if step_no >= 0 else None) + + def pipe_data_from(self, data_from: Union["DltResource", Pipe]) -> None: + """Replaces the parent in the transformer resource pipe from which the data is piped.""" + if self.is_transformer: + DltResource._ensure_valid_transformer_resource(self.name, self._pipe.gen) + else: + raise ResourceNotATransformer(self.name, "Cannot pipe data into resource that is not a transformer.") + parent_pipe = self._get_parent_pipe(self.name, data_from) + self._pipe.parent = parent_pipe + + def add_pipe(self, data: Any) -> None: + """Creates additional pipe for the resource from the specified data""" + # TODO: (1) self resource cannot be a transformer (2) if data is resource both self must and it must be selected/unselected + cannot be tranformer + raise InvalidResourceDataTypeMultiplePipes(self.name, data, type(data)) + + def select_tables(self, *table_names: Iterable[str]) -> "DltResource": + """For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. + + Both `with_table_name` marker and data-based (function) table name hints are supported. + """ + def _filter(item: TDataItem, meta: Any = None) -> bool: + is_in_meta = isinstance(meta, TableNameMeta) and meta.table_name in table_names + is_in_dyn = self._table_name_hint_fun and self._table_name_hint_fun(item) in table_names + return is_in_meta or is_in_dyn + + # add filtering function at the end of pipe + self.add_filter(_filter) + return self + + def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` + + `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically + + Args: + item_map (ItemTransformFunc[TDataItem]): A function taking a single data item and optional meta argument. Returns transformed data item. + insert_at (int, optional): At which step in pipe to insert the mapping. 
Defaults to None which inserts after last step + + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(MapItem(item_map)) + else: + self._pipe.insert_step(MapItem(item_map), insert_at) + return self + + def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` + + `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. It may yield 0 or more data items and be used to + ie. pivot an item into sequence of rows. + + Args: + item_map (ItemTransformFunc[Iterator[TDataItem]]): A function taking a single data item and optional meta argument. Yields 0 or more data items. + insert_at (int, optional): At which step in pipe to insert the generator. Defaults to None which inserts after last step + + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(YieldMapItem(item_map)) + else: + self._pipe.insert_step(YieldMapItem(item_map), insert_at) + return self + + def add_filter(self, item_filter: ItemTransformFunc[bool], insert_at: int = None) -> "DltResource": # noqa: A003 + """Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` + + `item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically + + Args: + item_filter (ItemTransformFunc[bool]): A function taking a single data item and optional meta argument. Returns bool. If True, item is kept + insert_at (int, optional): At which step in pipe to insert the filter. Defaults to None which inserts after last step + Returns: + "DltResource": returns self + """ + if insert_at is None: + self._pipe.append_step(FilterItem(item_filter)) + else: + self._pipe.insert_step(FilterItem(item_filter), insert_at) + return self + + def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 + """Adds a limit `max_items` to the resource pipe + + This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. It is + a no-op for transformers. Those should be limited by their input data. 
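# Illustrative sketch (editor's example, not part of the diff; the dlt.resource decorator is
# assumed and not shown here): add_map/add_filter append steps and return self, so they chain;
# add_limit wraps the underlying generator, so the cap applies before the map/filter steps run.
import dlt

@dlt.resource
def events():
    for i in range(100):
        yield {"id": i}

capped = events().add_limit(5)                                   # only ids 0..4 are generated
capped.add_map(lambda item: {**item, "double": item["id"] * 2})  # enrich each item
capped.add_filter(lambda item: item["id"] % 2 == 0)              # keep even ids: 0, 2, 4
print(list(capped))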
+ + Args: + max_items (int): The maximum number of items to yield + Returns: + "DltResource": returns self + """ + def _gen_wrap(gen: TPipeStep) -> TPipeStep: + """Wrap a generator to take the first `max_items` records""" + nonlocal max_items + count = 0 + if inspect.isfunction(gen): + gen = gen() + try: + for i in gen: # type: ignore # TODO: help me fix this later + yield i + count += 1 + if count == max_items: + return + finally: + if inspect.isgenerator(gen): + gen.close() + return + # transformers should be limited by their input, so we only limit non-transformers + if not self.is_transformer: + self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) + return self + + def add_step(self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None) -> "DltResource": # noqa: A003 + if insert_at is None: + self._pipe.append_step(item_transform) + else: + self._pipe.insert_step(item_transform, insert_at) + return self + + def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: + super().set_template(table_schema_template) + incremental = self.incremental + # try to late assign incremental + if table_schema_template.get("incremental") is not None: + if incremental: + incremental._incremental = table_schema_template["incremental"] + else: + # if there's no wrapper add incremental as a transform + incremental = table_schema_template["incremental"] # type: ignore + self.add_step(incremental) + + if incremental: + primary_key = table_schema_template.get("primary_key", incremental.primary_key) + if primary_key is not None: + incremental.primary_key = primary_key + + if table_schema_template.get('validator') is not None: + self.validator = table_schema_template['validator'] + + def bind(self, *args: Any, **kwargs: Any) -> "DltResource": + """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" + if self._args_bound: + raise TypeError(f"Parametrized resource {self.name} is not callable") + orig_gen = self._pipe.gen + gen = self._pipe.bind_gen(*args, **kwargs) + if isinstance(gen, DltResource): + # the resource returned resource: update in place + old_pipe = self._pipe + self.__dict__.clear() + self.__dict__.update(gen.__dict__) + # keep old pipe instance + self._pipe = old_pipe + self._pipe.__dict__.clear() + # write props from new pipe instance + self._pipe.__dict__.update(gen._pipe.__dict__) + elif isinstance(gen, Pipe): + # the resource returned pipe: just replace pipe + self._pipe.__dict__.clear() + # write props from new pipe instance + self._pipe.__dict__.update(gen.__dict__) + else: + self._args_bound = True + self._set_explicit_args(orig_gen, None, *args, **kwargs) # type: ignore + return self + + @property + def explicit_args(self) -> StrAny: + """Returns a dictionary of arguments used to parametrize the resource. Does not include defaults and injected args.""" + if not self._args_bound: + raise TypeError(f"Resource {self.name} is not yet parametrized") + return self._explicit_args + + @property + def state(self) -> StrAny: + """Gets resource-scoped state from the active pipeline. PipelineStateNotAvailable is raised if pipeline context is not available""" + with inject_section(self._get_config_section_context()): + return resource_state(self.name) + + def __call__(self, *args: Any, **kwargs: Any) -> "DltResource": + """Binds the parametrized resources to passed arguments. Creates and returns a bound resource. 
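# Illustrative sketch (editor's example, not part of the diff; dlt.resource decorator assumed):
# a resource with parameters stays parametrized until it is called. __call__ clones the
# resource and bind() fixes the arguments; explicit_args then exposes what was passed, and
# binding a second time raises TypeError.
import dlt

@dlt.resource
def games(year: int, month: int):
    yield {"year": year, "month": month}

print(games.requires_args)          # True - the generator still has unbound arguments
bound = games(2023, month=11)       # clone + bind
print(bound.explicit_args)          # expected: {'year': 2023, 'month': 11}
# bound(2024, 1)                    # would raise TypeError: resource is no longer callable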
Generators and iterators are not evaluated.""" + if self._args_bound: + raise TypeError(f"Parametrized resource {self.name} is not callable") + r = self._clone() + return r.bind(*args, **kwargs) + + def __or__(self, transform: Union["DltResource", AnyFun]) -> "DltResource": + """Allows to pipe data from across resources and transform functions with | operator""" + # print(f"{resource.name} | {self.name} -> {resource.name}[{resource.is_transformer}]") + if isinstance(transform, DltResource): + transform.pipe_data_from(self) + # return transformed resource for chaining + return transform + else: + # map or yield map + if inspect.isgeneratorfunction(inspect.unwrap(transform)): + return self.add_yield_map(transform) + else: + return self.add_map(transform) + + def __iter__(self) -> Iterator[TDataItem]: + """Opens iterator that yields the data items from the resources in the same order as in Pipeline class. + + A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. + """ + # use the same state dict when opening iterator and when iterator is iterated + container = Container() + state, _ = pipeline_state(container, {}) + state_context = StateInjectableContext(state=state) + section_context = self._get_config_section_context() + + # managed pipe iterator will set the context on each call to __next__ + with inject_section(section_context), Container().injectable_context(state_context): + pipe_iterator: ManagedPipeIterator = ManagedPipeIterator.from_pipes([self._pipe]) # type: ignore + + pipe_iterator.set_context([state_context, section_context]) + _iter = map(lambda item: item.item, pipe_iterator) + return flatten_list_or_items(_iter) + + def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any) -> None: + try: + sig = sig or inspect.signature(f) + self._explicit_args = sig.bind_partial(*args, **kwargs).arguments + except Exception: + pass + + def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResource": + """Creates a deep copy of a current resource, optionally renaming the resource. 
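# Illustrative sketch (editor's example, not part of the diff; dlt.resource decorator assumed):
# piping with `|` as implemented in __or__ above - a DltResource on the right is re-parented
# via pipe_data_from, a plain callable is attached with add_map, and a generator function
# with add_yield_map.
import dlt

@dlt.resource
def letters():
    yield from "abc"

def upper(item):        # plain function -> add_map
    return item.upper()

def repeat(item):       # generator function -> add_yield_map
    yield item
    yield item

print(list(letters() | upper | repeat))   # expected: ['A', 'A', 'B', 'B', 'C', 'C']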
The clone will not be part of the source + """ + pipe = self._pipe + if self._pipe and not self._pipe.is_empty: + pipe = pipe._clone(new_name=new_name, with_parent=with_parent) + # incremental and parent are already in the pipe (if any) + return DltResource( + pipe, + deepcopy(self._table_schema_template), + selected=self.selected, + section=self.section + ) + + def _get_config_section_context(self) -> ConfigSectionContext: + container = Container() + proxy = container[PipelineContext] + pipeline = None if not proxy.is_active() else proxy.pipeline() + if pipeline: + pipeline_name = pipeline.pipeline_name + else: + pipeline_name = None + if pipeline: + default_schema_name = pipeline.default_schema_name + else: + default_schema_name = None + if not default_schema_name and pipeline_name: + default_schema_name = pipeline._make_schema_with_default_name().name + return ConfigSectionContext( + pipeline_name=pipeline_name, + # do not emit middle config section to not overwrite the resource section + # only sources emit middle config section + sections=(known_sections.SOURCES, "", self.source_name or default_schema_name or self.name), + source_state_key=self.source_name or default_schema_name or self.section or uniq_id() + ) + + def __str__(self) -> str: + info = f"DltResource [{self.name}]" + if self.section: + info += f" in section [{self.section}]" + if self.source_name: + info += f" added to source [{self.source_name}]:" + else: + info += ":" + + if self.is_transformer: + info += f"\nThis resource is a transformer and takes data items from {self._pipe.parent.name}" + else: + if self._pipe.is_data_bound: + if self.requires_args: + head_sig = inspect.signature(self._pipe.gen) # type: ignore + info += f"\nThis resource is parametrized and takes the following arguments {head_sig}. You must call this resource before loading." + else: + info += "\nIf you want to see the data items in the resource you must iterate it or convert to list ie. list(resource). Note that, like any iterator, you can iterate the resource only once." 
+ else: + info += "\nThis resource is not bound to the data" + info += f"\nInstance: info: (data pipe id:{id(self._pipe)}) at {id(self)}" + return info + + @staticmethod + def _ensure_valid_transformer_resource(name: str, data: Any) -> None: + # resource must be a callable with single argument + if callable(data): + valid_code = DltResource.validate_transformer_generator_function(data) + if valid_code != 0: + raise InvalidTransformerGeneratorFunction(name, get_callable_name(data), inspect.signature(data), valid_code) + else: + raise InvalidTransformerDataTypeGeneratorFunctionRequired(name, data, type(data)) + + @staticmethod + def _get_parent_pipe(name: str, data_from: Union["DltResource", Pipe]) -> Pipe: + # parent resource + if isinstance(data_from, Pipe): + return data_from + elif isinstance(data_from, DltResource): + return data_from._pipe + else: + # if this is generator function provide nicer exception + if callable(data_from): + raise InvalidParentResourceIsAFunction(name, get_callable_name(data_from)) + else: + raise InvalidParentResourceDataType(name, data_from, type(data_from)) + + @staticmethod + def validate_transformer_generator_function(f: AnyFun) -> int: + sig = inspect.signature(f) + if len(sig.parameters) == 0: + return 1 + # transformer may take only one positional only argument + pos_only_len = sum(1 for p in sig.parameters.values() if p.kind == p.POSITIONAL_ONLY) + if pos_only_len > 1: + return 2 + first_ar = next(iter(sig.parameters.values())) + # and pos only must be first + if pos_only_len == 1 and first_ar.kind != first_ar.POSITIONAL_ONLY: + return 2 + # first arg must be positional or kw_pos + if first_ar.kind not in (first_ar.POSITIONAL_ONLY, first_ar.POSITIONAL_OR_KEYWORD): + return 3 + return 0 + + +# produce Empty resource singleton +DltResource.Empty = DltResource(Pipe(None), None, False) +TUnboundDltResource = Callable[..., DltResource] diff --git a/dlt/extract/source.py b/dlt/extract/source.py index db4f4db454..ace76cee2b 100644 --- a/dlt/extract/source.py +++ b/dlt/extract/source.py @@ -1,502 +1,27 @@ import warnings import contextlib -from copy import copy, deepcopy +from copy import copy import makefun import inspect -from typing import AsyncIterable, AsyncIterator, ClassVar, Callable, Dict, Iterable, Iterator, List, Sequence, Tuple, Union, Any, Optional +from typing import Dict, Iterable, Iterator, List, Sequence, Tuple, Any from typing_extensions import Self from dlt.common.configuration.resolve import inject_section from dlt.common.configuration.specs import known_sections from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer, RelationalNormalizerConfigPropagation +from dlt.common.normalizers.json.relational import DataItemNormalizer as RelationalNormalizer from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnName -from dlt.common.typing import AnyFun, DictStrAny, StrAny, TDataItem, TDataItems, NoneType +from dlt.common.schema.typing import TColumnName, TSchemaContract +from dlt.common.typing import StrAny, TDataItem from dlt.common.configuration.container import Container -from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, resource_state, source_state, pipeline_state -from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, get_callable_name, graph_edges_to_nodes, multi_context_manager, uniq_id - -from dlt.extract.typing import 
(DataItemWithMeta, ItemTransformFunc, ItemTransformFunctionWithMeta, TDecompositionStrategy, TableNameMeta, - FilterItem, MapItem, YieldMapItem, ValidateItem) -from dlt.extract.pipe import Pipe, ManagedPipeIterator, TPipeStep -from dlt.extract.schema import DltResourceSchema, TTableSchemaTemplate -from dlt.extract.incremental import Incremental, IncrementalResourceWrapper -from dlt.extract.exceptions import ( - InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, InvalidResourceDataType, InvalidResourceDataTypeIsNone, InvalidTransformerGeneratorFunction, - DataItemRequiredForDynamicTableHints, InvalidResourceDataTypeAsync, InvalidResourceDataTypeBasic, - InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, ResourceNameMissing, ResourceNotATransformer, ResourcesNotFoundError, DeletingResourcesNotSupported) -from dlt.extract.wrappers import wrap_additional_type - - -def with_table_name(item: TDataItems, table_name: str) -> DataItemWithMeta: - """Marks `item` to be dispatched to table `table_name` when yielded from resource function.""" - return DataItemWithMeta(TableNameMeta(table_name), item) - - -class DltResource(Iterable[TDataItem], DltResourceSchema): - """Implements dlt resource. Contains a data pipe that wraps a generating item and table schema that can be adjusted""" - Empty: ClassVar["DltResource"] = None - source_name: str - """Name of the source that contains this instance of the source, set when added to DltResourcesDict""" - section: str - """A config section name""" - - def __init__( - self, - pipe: Pipe, - table_schema_template: TTableSchemaTemplate, - selected: bool, - incremental: IncrementalResourceWrapper = None, - section: str = None, - args_bound: bool = False - ) -> None: - self.section = section - self.selected = selected - self._pipe = pipe - self._args_bound = args_bound - self._explicit_args: DictStrAny = None - if incremental and not self.incremental: - self.add_step(incremental) - self.source_name = None - super().__init__(table_schema_template) - - @classmethod - def from_data( - cls, - data: Any, - name: str = None, - section: str = None, - table_schema_template: TTableSchemaTemplate = None, - selected: bool = True, - data_from: Union["DltResource", Pipe] = None, - incremental: IncrementalResourceWrapper = None - ) -> "DltResource": - if data is None: - raise InvalidResourceDataTypeIsNone(name, data, NoneType) # type: ignore - - if isinstance(data, DltResource): - return data - - if isinstance(data, Pipe): - return cls(data, table_schema_template, selected, incremental=incremental, section=section) - - if callable(data): - name = name or get_callable_name(data) - - # if generator, take name from it - if inspect.isgenerator(data): - name = name or get_callable_name(data) # type: ignore - - # name is mandatory - if not name: - raise ResourceNameMissing() - - # wrap additional types - data = wrap_additional_type(data) - - # several iterable types are not allowed and must be excluded right away - if isinstance(data, (AsyncIterator, AsyncIterable)): - raise InvalidResourceDataTypeAsync(name, data, type(data)) - if isinstance(data, (str, dict)): - raise InvalidResourceDataTypeBasic(name, data, type(data)) - - # check if depends_on is a valid resource - parent_pipe: Pipe = None - if data_from is not None: - DltResource._ensure_valid_transformer_resource(name, data) - parent_pipe = DltResource._get_parent_pipe(name, data_from) - - # create resource from iterator, iterable or generator 
function - if isinstance(data, (Iterable, Iterator)) or callable(data): - pipe = Pipe.from_data(name, data, parent=parent_pipe) - return cls(pipe, table_schema_template, selected, incremental=incremental, section=section, args_bound=not callable(data)) - else: - # some other data type that is not supported - raise InvalidResourceDataType(name, data, type(data), f"The data type of supplied type is {type(data).__name__}") - - @property - def name(self) -> str: - """Resource name inherited from the pipe""" - return self._pipe.name - - def with_name(self, new_name: str) -> "DltResource": - """Clones the resource with a new name. Such resource keeps separate state and loads data to `new_name` table by default.""" - return self._clone(new_name=new_name, with_parent=True) - - @property - def is_transformer(self) -> bool: - """Checks if the resource is a transformer that takes data from another resource""" - return self._pipe.has_parent - - @property - def requires_args(self) -> bool: - """Checks if resource has unbound arguments""" - try: - self._pipe.ensure_gen_bound() - return False - except (TypeError, ParametrizedResourceUnbound): - return True - - @property - def incremental(self) -> IncrementalResourceWrapper: - """Gets incremental transform if it is in the pipe""" - incremental: IncrementalResourceWrapper = None - step_no = self._pipe.find(IncrementalResourceWrapper, Incremental) - if step_no >= 0: - incremental = self._pipe.steps[step_no] # type: ignore - return incremental - - @property - def validator(self) -> Optional[ValidateItem]: - """Gets validator transform if it is in the pipe""" - validator: ValidateItem = None - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - validator = self._pipe.steps[step_no] # type: ignore[assignment] - return validator - - @validator.setter - def validator(self, validator: Optional[ValidateItem]) -> None: - """Add/remove or replace the validator in pipe""" - step_no = self._pipe.find(ValidateItem) - if step_no >= 0: - self._pipe.remove_step(step_no) - if validator: - self.add_step(validator, insert_at=step_no if step_no >= 0 else None) - - def pipe_data_from(self, data_from: Union["DltResource", Pipe]) -> None: - """Replaces the parent in the transformer resource pipe from which the data is piped.""" - if self.is_transformer: - DltResource._ensure_valid_transformer_resource(self.name, self._pipe.gen) - else: - raise ResourceNotATransformer(self.name, "Cannot pipe data into resource that is not a transformer.") - parent_pipe = self._get_parent_pipe(self.name, data_from) - self._pipe.parent = parent_pipe - - def add_pipe(self, data: Any) -> None: - """Creates additional pipe for the resource from the specified data""" - # TODO: (1) self resource cannot be a transformer (2) if data is resource both self must and it must be selected/unselected + cannot be tranformer - raise InvalidResourceDataTypeMultiplePipes(self.name, data, type(data)) - - def select_tables(self, *table_names: Iterable[str]) -> "DltResource": - """For resources that dynamically dispatch data to several tables allows to select tables that will receive data, effectively filtering out other data items. - - Both `with_table_name` marker and data-based (function) table name hints are supported. 
- """ - def _filter(item: TDataItem, meta: Any = None) -> bool: - is_in_meta = isinstance(meta, TableNameMeta) and meta.table_name in table_names - is_in_dyn = self._table_name_hint_fun and self._table_name_hint_fun(item) in table_names - return is_in_meta or is_in_dyn - - # add filtering function at the end of pipe - self.add_filter(_filter) - return self - - def add_map(self, item_map: ItemTransformFunc[TDataItem], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds mapping function defined in `item_map` to the resource pipe at position `inserted_at` - - `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically - - Args: - item_map (ItemTransformFunc[TDataItem]): A function taking a single data item and optional meta argument. Returns transformed data item. - insert_at (int, optional): At which step in pipe to insert the mapping. Defaults to None which inserts after last step - - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(MapItem(item_map)) - else: - self._pipe.insert_step(MapItem(item_map), insert_at) - return self - - def add_yield_map(self, item_map: ItemTransformFunc[Iterator[TDataItem]], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds generating function defined in `item_map` to the resource pipe at position `inserted_at` - - `item_map` receives single data items, `dlt` will enumerate any lists of data items automatically. It may yield 0 or more data items and be used to - ie. pivot an item into sequence of rows. - - Args: - item_map (ItemTransformFunc[Iterator[TDataItem]]): A function taking a single data item and optional meta argument. Yields 0 or more data items. - insert_at (int, optional): At which step in pipe to insert the generator. Defaults to None which inserts after last step - - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(YieldMapItem(item_map)) - else: - self._pipe.insert_step(YieldMapItem(item_map), insert_at) - return self - - def add_filter(self, item_filter: ItemTransformFunc[bool], insert_at: int = None) -> "DltResource": # noqa: A003 - """Adds filter defined in `item_filter` to the resource pipe at position `inserted_at` - - `item_filter` receives single data items, `dlt` will enumerate any lists of data items automatically - - Args: - item_filter (ItemTransformFunc[bool]): A function taking a single data item and optional meta argument. Returns bool. If True, item is kept - insert_at (int, optional): At which step in pipe to insert the filter. Defaults to None which inserts after last step - Returns: - "DltResource": returns self - """ - if insert_at is None: - self._pipe.append_step(FilterItem(item_filter)) - else: - self._pipe.insert_step(FilterItem(item_filter), insert_at) - return self - - def add_limit(self, max_items: int) -> "DltResource": # noqa: A003 - """Adds a limit `max_items` to the resource pipe - - This mutates the encapsulated generator to stop after `max_items` items are yielded. This is useful for testing and debugging. It is - a no-op for transformers. Those should be limited by their input data. 
- - Args: - max_items (int): The maximum number of items to yield - Returns: - "DltResource": returns self - """ - def _gen_wrap(gen: TPipeStep) -> TPipeStep: - """Wrap a generator to take the first `max_items` records""" - nonlocal max_items - count = 0 - if inspect.isfunction(gen): - gen = gen() - try: - for i in gen: # type: ignore # TODO: help me fix this later - yield i - count += 1 - if count == max_items: - return - finally: - if inspect.isgenerator(gen): - gen.close() - return - # transformers should be limited by their input, so we only limit non-transformers - if not self.is_transformer: - self._pipe.replace_gen(_gen_wrap(self._pipe.gen)) - return self - - def add_step(self, item_transform: ItemTransformFunctionWithMeta[TDataItems], insert_at: int = None) -> "DltResource": # noqa: A003 - if insert_at is None: - self._pipe.append_step(item_transform) - else: - self._pipe.insert_step(item_transform, insert_at) - return self - - def set_template(self, table_schema_template: TTableSchemaTemplate) -> None: - super().set_template(table_schema_template) - incremental = self.incremental - # try to late assign incremental - if table_schema_template.get("incremental") is not None: - if incremental: - incremental._incremental = table_schema_template["incremental"] - else: - # if there's no wrapper add incremental as a transform - incremental = table_schema_template["incremental"] # type: ignore - self.add_step(incremental) - - if incremental: - primary_key = table_schema_template.get("primary_key", incremental.primary_key) - if primary_key is not None: - incremental.primary_key = primary_key - - if table_schema_template.get('validator') is not None: - self.validator = table_schema_template['validator'] - - def bind(self, *args: Any, **kwargs: Any) -> "DltResource": - """Binds the parametrized resource to passed arguments. Modifies resource pipe in place. Does not evaluate generators or iterators.""" - if self._args_bound: - raise TypeError(f"Parametrized resource {self.name} is not callable") - orig_gen = self._pipe.gen - gen = self._pipe.bind_gen(*args, **kwargs) - if isinstance(gen, DltResource): - # the resource returned resource: update in place - old_pipe = self._pipe - self.__dict__.clear() - self.__dict__.update(gen.__dict__) - # keep old pipe instance - self._pipe = old_pipe - self._pipe.__dict__.clear() - # write props from new pipe instance - self._pipe.__dict__.update(gen._pipe.__dict__) - elif isinstance(gen, Pipe): - # the resource returned pipe: just replace pipe - self._pipe.__dict__.clear() - # write props from new pipe instance - self._pipe.__dict__.update(gen.__dict__) - else: - self._args_bound = True - self._set_explicit_args(orig_gen, None, *args, **kwargs) # type: ignore - return self - - @property - def explicit_args(self) -> StrAny: - """Returns a dictionary of arguments used to parametrize the resource. Does not include defaults and injected args.""" - if not self._args_bound: - raise TypeError(f"Resource {self.name} is not yet parametrized") - return self._explicit_args - - @property - def state(self) -> StrAny: - """Gets resource-scoped state from the active pipeline. PipelineStateNotAvailable is raised if pipeline context is not available""" - with inject_section(self._get_config_section_context()): - return resource_state(self.name) - - def __call__(self, *args: Any, **kwargs: Any) -> "DltResource": - """Binds the parametrized resources to passed arguments. Creates and returns a bound resource. 
Generators and iterators are not evaluated.""" - if self._args_bound: - raise TypeError(f"Parametrized resource {self.name} is not callable") - r = self._clone() - return r.bind(*args, **kwargs) - - def __or__(self, transform: Union["DltResource", AnyFun]) -> "DltResource": - """Allows to pipe data from across resources and transform functions with | operator""" - # print(f"{resource.name} | {self.name} -> {resource.name}[{resource.is_transformer}]") - if isinstance(transform, DltResource): - transform.pipe_data_from(self) - # return transformed resource for chaining - return transform - else: - # map or yield map - if inspect.isgeneratorfunction(inspect.unwrap(transform)): - return self.add_yield_map(transform) - else: - return self.add_map(transform) - - def __iter__(self) -> Iterator[TDataItem]: - """Opens iterator that yields the data items from the resources in the same order as in Pipeline class. - - A read-only state is provided, initialized from active pipeline state. The state is discarded after the iterator is closed. - """ - # use the same state dict when opening iterator and when iterator is iterated - container = Container() - state, _ = pipeline_state(container, {}) - state_context = StateInjectableContext(state=state) - section_context = self._get_config_section_context() - - # managed pipe iterator will set the context on each call to __next__ - with inject_section(section_context), Container().injectable_context(state_context): - pipe_iterator: ManagedPipeIterator = ManagedPipeIterator.from_pipes([self._pipe]) # type: ignore - - pipe_iterator.set_context([state_context, section_context]) - _iter = map(lambda item: item.item, pipe_iterator) - return flatten_list_or_items(_iter) - - def _set_explicit_args(self, f: AnyFun, sig: inspect.Signature = None, *args: Any, **kwargs: Any) -> None: - try: - sig = sig or inspect.signature(f) - self._explicit_args = sig.bind_partial(*args, **kwargs).arguments - except Exception: - pass - - def _clone(self, new_name: str = None, with_parent: bool = False) -> "DltResource": - """Creates a deep copy of a current resource, optionally renaming the resource. 
The clone will not be part of the source - """ - pipe = self._pipe - if self._pipe and not self._pipe.is_empty: - pipe = pipe._clone(new_name=new_name, with_parent=with_parent) - # incremental and parent are already in the pipe (if any) - return DltResource( - pipe, - deepcopy(self._table_schema_template), - selected=self.selected, - section=self.section - ) - - def _get_config_section_context(self) -> ConfigSectionContext: - container = Container() - proxy = container[PipelineContext] - pipeline = None if not proxy.is_active() else proxy.pipeline() - if pipeline: - pipeline_name = pipeline.pipeline_name - else: - pipeline_name = None - if pipeline: - default_schema_name = pipeline.default_schema_name - else: - default_schema_name = None - if not default_schema_name and pipeline_name: - default_schema_name = pipeline._make_schema_with_default_name().name - return ConfigSectionContext( - pipeline_name=pipeline_name, - # do not emit middle config section to not overwrite the resource section - # only sources emit middle config section - sections=(known_sections.SOURCES, "", self.source_name or default_schema_name or self.name), - source_state_key=self.source_name or default_schema_name or self.section or uniq_id() - ) +from dlt.common.pipeline import PipelineContext, StateInjectableContext, SupportsPipelineRun, source_state, pipeline_state +from dlt.common.utils import graph_find_scc_nodes, flatten_list_or_items, graph_edges_to_nodes - def __str__(self) -> str: - info = f"DltResource [{self.name}]" - if self.section: - info += f" in section [{self.section}]" - if self.source_name: - info += f" added to source [{self.source_name}]:" - else: - info += ":" - - if self.is_transformer: - info += f"\nThis resource is a transformer and takes data items from {self._pipe.parent.name}" - else: - if self._pipe.is_data_bound: - if self.requires_args: - head_sig = inspect.signature(self._pipe.gen) # type: ignore - info += f"\nThis resource is parametrized and takes the following arguments {head_sig}. You must call this resource before loading." - else: - info += "\nIf you want to see the data items in the resource you must iterate it or convert to list ie. list(resource). Note that, like any iterator, you can iterate the resource only once." 
- else: - info += "\nThis resource is not bound to the data" - info += f"\nInstance: info: (data pipe id:{id(self._pipe)}) at {id(self)}" - return info - - @staticmethod - def _ensure_valid_transformer_resource(name: str, data: Any) -> None: - # resource must be a callable with single argument - if callable(data): - valid_code = DltResource.validate_transformer_generator_function(data) - if valid_code != 0: - raise InvalidTransformerGeneratorFunction(name, get_callable_name(data), inspect.signature(data), valid_code) - else: - raise InvalidTransformerDataTypeGeneratorFunctionRequired(name, data, type(data)) - - @staticmethod - def _get_parent_pipe(name: str, data_from: Union["DltResource", Pipe]) -> Pipe: - # parent resource - if isinstance(data_from, Pipe): - return data_from - elif isinstance(data_from, DltResource): - return data_from._pipe - else: - # if this is generator function provide nicer exception - if callable(data_from): - raise InvalidParentResourceIsAFunction(name, get_callable_name(data_from)) - else: - raise InvalidParentResourceDataType(name, data_from, type(data_from)) - - @staticmethod - def validate_transformer_generator_function(f: AnyFun) -> int: - sig = inspect.signature(f) - if len(sig.parameters) == 0: - return 1 - # transformer may take only one positional only argument - pos_only_len = sum(1 for p in sig.parameters.values() if p.kind == p.POSITIONAL_ONLY) - if pos_only_len > 1: - return 2 - first_ar = next(iter(sig.parameters.values())) - # and pos only must be first - if pos_only_len == 1 and first_ar.kind != first_ar.POSITIONAL_ONLY: - return 2 - # first arg must be positional or kw_pos - if first_ar.kind not in (first_ar.POSITIONAL_ONLY, first_ar.POSITIONAL_OR_KEYWORD): - return 3 - return 0 - - -# produce Empty resource singleton -DltResource.Empty = DltResource(Pipe(None), None, False) -TUnboundDltResource = Callable[..., DltResource] +from dlt.extract.typing import TDecompositionStrategy +from dlt.extract.pipe import Pipe, ManagedPipeIterator +from dlt.extract.hints import DltResourceHints +from dlt.extract.resource import DltResource +from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, ResourcesNotFoundError, DeletingResourcesNotSupported class DltResourceDict(Dict[str, DltResource]): @@ -529,7 +54,7 @@ def extracted(self) -> Dict[str, DltResource]: resource = self[pipe.name] except KeyError: # resource for pipe not found: return mock resource - mock_template = DltResourceSchema.new_table_template( + mock_template = DltResourceHints.new_table_template( pipe.name, write_disposition=resource.write_disposition ) @@ -681,6 +206,14 @@ def max_table_nesting(self) -> int: def max_table_nesting(self, value: int) -> None: RelationalNormalizer.update_normalizer_config(self._schema, {"max_nesting": value}) + @property + def schema_contract(self) -> TSchemaContract: + return self.schema.settings["schema_contract"] + + @schema_contract.setter + def schema_contract(self, settings: TSchemaContract) -> None: + self.schema.set_schema_contract(settings) + @property def exhausted(self) -> bool: """check all selected pipes wether one of them has started. 
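# Illustrative sketch (editor's example, not part of the diff; dlt.source/dlt.resource
# decorators assumed): the schema_contract property added above. The entity names and modes
# (tables/columns/data_type, evolve/freeze/discard_row/discard_value) are the ones used
# elsewhere in this diff; "freeze" raises on violations, the discard modes drop data silently.
import dlt

@dlt.source
def my_source():
    @dlt.resource
    def items():
        yield {"id": 1, "unexpected_column": "value"}
    return items

source = my_source()
# let new tables evolve, silently drop values for unknown columns,
# and raise when a value does not fit the declared data type
source.schema_contract = {"tables": "evolve", "columns": "discard_value", "data_type": "freeze"}
print(source.schema_contract)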
if so, the source is exhausted.""" diff --git a/dlt/extract/storage.py b/dlt/extract/storage.py new file mode 100644 index 0000000000..ddda064aa4 --- /dev/null +++ b/dlt/extract/storage.py @@ -0,0 +1,78 @@ +import os +from typing import ClassVar, Dict + +from dlt.common.data_writers import TLoaderFileFormat + +from dlt.common.utils import uniq_id +from dlt.common.typing import TDataItems +from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.storages import NormalizeStorageConfiguration, NormalizeStorage, DataItemStorage, FileStorage + + +class ExtractorItemStorage(DataItemStorage): + load_file_type: TLoaderFileFormat + + def __init__(self, storage: FileStorage, extract_folder: str="extract") -> None: + # data item storage with jsonl with pua encoding + super().__init__(self.load_file_type) + self.extract_folder = extract_folder + self.storage = storage + + + def _get_data_item_path_template(self, load_id: str, schema_name: str, table_name: str) -> str: + template = NormalizeStorage.build_extracted_file_stem(schema_name, table_name, "%s") + return self.storage.make_full_path(os.path.join(self._get_extract_path(load_id), template)) + + def _get_extract_path(self, extract_id: str) -> str: + return os.path.join(self.extract_folder, extract_id) + + +class JsonLExtractorStorage(ExtractorItemStorage): + load_file_type: TLoaderFileFormat = "puae-jsonl" + + +class ArrowExtractorStorage(ExtractorItemStorage): + load_file_type: TLoaderFileFormat = "arrow" + + +class ExtractorStorage(NormalizeStorage): + EXTRACT_FOLDER: ClassVar[str] = "extract" + + """Wrapper around multiple extractor storages with different file formats""" + def __init__(self, C: NormalizeStorageConfiguration) -> None: + super().__init__(True, C) + self._item_storages: Dict[TLoaderFileFormat, ExtractorItemStorage] = { + "puae-jsonl": JsonLExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER), + "arrow": ArrowExtractorStorage(self.storage, extract_folder=self.EXTRACT_FOLDER) + } + + def _get_extract_path(self, extract_id: str) -> str: + return os.path.join(self.EXTRACT_FOLDER, extract_id) + + def create_extract_id(self) -> str: + extract_id = uniq_id() + self.storage.create_folder(self._get_extract_path(extract_id)) + return extract_id + + def get_storage(self, loader_file_format: TLoaderFileFormat) -> ExtractorItemStorage: + return self._item_storages[loader_file_format] + + def close_writers(self, extract_id: str) -> None: + for storage in self._item_storages.values(): + storage.close_writers(extract_id) + + def commit_extract_files(self, extract_id: str, with_delete: bool = True) -> None: + extract_path = self._get_extract_path(extract_id) + for file in self.storage.list_folder_files(extract_path, to_root=False): + from_file = os.path.join(extract_path, file) + to_file = os.path.join(NormalizeStorage.EXTRACTED_FOLDER, file) + if with_delete: + self.storage.atomic_rename(from_file, to_file) + else: + # create hardlink which will act as a copy + self.storage.link_hard(from_file, to_file) + if with_delete: + self.storage.delete_folder(extract_path, recursively=True) + + def write_data_item(self, file_format: TLoaderFileFormat, load_id: str, schema_name: str, table_name: str, item: TDataItems, columns: TTableSchemaColumns) -> None: + self.get_storage(file_format).write_data_item(load_id, schema_name, table_name, item, columns) diff --git a/dlt/extract/typing.py b/dlt/extract/typing.py index ad4e23b84f..646267c539 100644 --- a/dlt/extract/typing.py +++ b/dlt/extract/typing.py @@ -138,3 +138,8 @@ class 
ValidateItem(ItemTransform[TDataItem]): Subclass should implement the `__call__` method to either return the data item(s) or raise `extract.exceptions.ValidationError`. See `PydanticValidator` for possible implementation. """ + table_name: str + + def bind(self, pipe: SupportsPipe) -> ItemTransform[TDataItem]: + self.table_name = pipe.name + return self diff --git a/dlt/extract/validation.py b/dlt/extract/validation.py index c8e30d0eb2..8bd6c7afb9 100644 --- a/dlt/extract/validation.py +++ b/dlt/extract/validation.py @@ -1,13 +1,13 @@ -from typing import Optional, Protocol, TypeVar, Generic, Type, Union, Any, List +from typing import Optional, Tuple, TypeVar, Generic, Type, Union, Any, List +from dlt.common.schema.schema import Schema try: - from pydantic import BaseModel as PydanticBaseModel, ValidationError as PydanticValidationError, create_model + from pydantic import BaseModel as PydanticBaseModel except ModuleNotFoundError: - PydanticBaseModel = None # type: ignore[misc] + PydanticBaseModel = Any # type: ignore[misc, assignment] -from dlt.extract.exceptions import ValidationError from dlt.common.typing import TDataItems -from dlt.common.schema.typing import TAnySchemaColumns +from dlt.common.schema.typing import TAnySchemaColumns, TSchemaContract, TSchemaEvolutionMode from dlt.extract.typing import TTableHintTemplate, ValidateItem @@ -16,31 +16,54 @@ class PydanticValidator(ValidateItem, Generic[_TPydanticModel]): model: Type[_TPydanticModel] - def __init__(self, model: Type[_TPydanticModel]) -> None: - self.model = model - # Create a model for validating list of items in batch - self.list_model = create_model( - "List" + model.__name__, - items=(List[model], ...) # type: ignore[valid-type] - ) + def __init__(self, model: Type[_TPydanticModel], column_mode: TSchemaEvolutionMode, data_mode: TSchemaEvolutionMode) -> None: + from dlt.common.libs.pydantic import apply_schema_contract_to_model, create_list_model + + self.column_mode: TSchemaEvolutionMode = column_mode + self.data_mode: TSchemaEvolutionMode = data_mode + self.model = apply_schema_contract_to_model(model, column_mode, data_mode) + self.list_model = create_list_model(self.model, data_mode) def __call__(self, item: TDataItems, meta: Any = None) -> Union[_TPydanticModel, List[_TPydanticModel]]: """Validate a data item against the pydantic model""" if item is None: return None - try: - if isinstance(item, list): - return self.list_model(items=item).items # type: ignore[attr-defined, no-any-return] - return self.model.parse_obj(item) - except PydanticValidationError as e: - raise ValidationError(self, item, e) from e + + from dlt.common.libs.pydantic import validate_item, validate_items + + if isinstance(item, list): + return validate_items(self.table_name, self.list_model, item, self.column_mode, self.data_mode) + return validate_item(self.table_name, self.model, item, self.column_mode, self.data_mode) def __str__(self, *args: Any, **kwargs: Any) -> str: return f"PydanticValidator(model={self.model.__qualname__})" -def get_column_validator(columns: TTableHintTemplate[TAnySchemaColumns]) -> Optional[ValidateItem]: +def create_item_validator( + columns: TTableHintTemplate[TAnySchemaColumns], + schema_contract: TTableHintTemplate[TSchemaContract] = None +) -> Tuple[Optional[ValidateItem], TTableHintTemplate[TSchemaContract]]: + """Creates item validator for a `columns` definition and a `schema_contract` + + Returns a tuple (validator, schema contract). If validator could not be created, returns None at first position. 
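# Illustrative sketch (editor's example, not part of the diff): what create_item_validator
# above returns for a Pydantic model. The default contract derived from the model's `extra`
# setting comes from the dlt.common.libs.pydantic helpers referenced in this diff, so the
# exact column mode printed below depends on that mapping.
from typing import Optional
from pydantic import BaseModel

from dlt.extract.validation import create_item_validator

class User(BaseModel):
    id: int
    name: Optional[str] = None

validator, contract = create_item_validator(User)
print(type(validator).__name__)   # PydanticValidator
print(contract)                   # expanded contract, e.g. data_type defaulting to "freeze"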
+ If schema_contract was not specified a default schema contract for given validator will be returned + """ if PydanticBaseModel is not None and isinstance(columns, type) and issubclass(columns, PydanticBaseModel): - return PydanticValidator(columns) - return None + assert not callable(schema_contract), "schema_contract cannot be dynamic for Pydantic item validator" + + from dlt.common.libs.pydantic import extra_to_column_mode, get_extra_from_model + # freeze the columns if we have a fully defined table and no other explicit contract + expanded_schema_contract = Schema.expand_schema_contract_settings( + schema_contract, + # corresponds to default Pydantic behavior + default={"tables": "evolve", "columns": extra_to_column_mode(get_extra_from_model(columns)), "data_type": "freeze"} + ) + return (PydanticValidator( + columns, + expanded_schema_contract["columns"], + expanded_schema_contract["data_type"] + ), + schema_contract or expanded_schema_contract + ) + return None, schema_contract diff --git a/dlt/helpers/airflow_helper.py b/dlt/helpers/airflow_helper.py index 2a9c76cc76..e0329d583c 100644 --- a/dlt/helpers/airflow_helper.py +++ b/dlt/helpers/airflow_helper.py @@ -25,7 +25,7 @@ from dlt.common.configuration.specs.config_providers_context import ConfigProvidersContext from dlt.common.runtime.collector import NULL_COLLECTOR -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.pipeline.helpers import retry_load from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.progress import log diff --git a/dlt/helpers/streamlit_helper.py b/dlt/helpers/streamlit_helper.py index 7921e4e2e1..e43e794bf6 100644 --- a/dlt/helpers/streamlit_helper.py +++ b/dlt/helpers/streamlit_helper.py @@ -120,7 +120,7 @@ def _query_data_live(query: str, schema_name: str = None) -> pd.DataFrame: schema_names = ", ".join(sorted(pipeline.schema_names)) st.markdown(f""" * pipeline name: **{pipeline.pipeline_name}** - * destination: **{str(credentials)}** in **{pipeline.destination.__name__}** + * destination: **{str(credentials)}** in **{pipeline.destination.name}** * dataset name: **{pipeline.dataset_name}** * default schema name: **{pipeline.default_schema_name}** * all schema names: **{schema_names}** diff --git a/dlt/load/load.py b/dlt/load/load.py index d27274ffb1..725f8589f5 100644 --- a/dlt/load/load.py +++ b/dlt/load/load.py @@ -20,7 +20,7 @@ from dlt.common.schema import Schema, TSchemaTables from dlt.common.schema.typing import TTableSchema, TWriteDisposition from dlt.common.storages import LoadStorage -from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, DestinationReference, LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination +from dlt.common.destination.reference import DestinationClientDwhConfiguration, FollowupJob, JobClientBase, WithStagingDataset, Destination, LoadJob, NewLoadJob, TLoadJobState, DestinationClientConfiguration, SupportsStagingDestination, TDestination from dlt.destinations.job_impl import EmptyLoadJob @@ -34,8 +34,8 @@ class Load(Runnable[Executor]): @with_config(spec=LoaderConfiguration, sections=(known_sections.LOAD,)) def __init__( self, - destination: DestinationReference, - staging_destination: DestinationReference = None, + destination: TDestination, + staging_destination: TDestination = None, collector: Collector = NULL_COLLECTOR, is_storage_owner: bool = False, config: LoaderConfiguration = config.value, @@ -54,7 +54,6 @@ def 
__init__( self._processed_load_ids: Dict[str, str] = {} """Load ids to dataset name""" - def create_storage(self, is_storage_owner: bool) -> LoadStorage: supported_file_formats = self.capabilities.supported_loader_file_formats if self.staging_destination: @@ -387,7 +386,7 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: logger.info("Running file loading") # get list of loads and order by name ASC to execute schema updates - loads = self.load_storage.list_packages() + loads = self.load_storage.list_normalized_packages() logger.info(f"Found {len(loads)} load packages") if len(loads) == 0: return TRunMetrics(True, 0) @@ -404,7 +403,7 @@ def run(self, pool: Optional[Executor]) -> TRunMetrics: with self.collector(f"Load {schema.name} in {load_id}"): self.load_single_package(load_id, schema) - return TRunMetrics(False, len(self.load_storage.list_packages())) + return TRunMetrics(False, len(self.load_storage.list_normalized_packages())) def get_load_info(self, pipeline: SupportsPipeline, started_at: datetime.datetime = None) -> LoadInfo: # TODO: LoadInfo should hold many datasets diff --git a/dlt/normalize/configuration.py b/dlt/normalize/configuration.py index f34f8b6fdc..13b408945c 100644 --- a/dlt/normalize/configuration.py +++ b/dlt/normalize/configuration.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from dlt.common.configuration import configspec from dlt.common.configuration.specs import BaseConfiguration @@ -6,6 +6,7 @@ from dlt.common.runners.configuration import PoolRunnerConfiguration, TPoolType from dlt.common.storages import LoadStorageConfiguration, NormalizeStorageConfiguration, SchemaStorageConfiguration + @configspec class ItemsNormalizerConfiguration(BaseConfiguration): add_dlt_id: bool = False diff --git a/dlt/normalize/exceptions.py b/dlt/normalize/exceptions.py index e69de29bb2..79da16b925 100644 --- a/dlt/normalize/exceptions.py +++ b/dlt/normalize/exceptions.py @@ -0,0 +1,5 @@ +from dlt.common.exceptions import DltException + +class NormalizeException(DltException): + def __init__(self, msg: str) -> None: + super().__init__(msg) diff --git a/dlt/normalize/items_normalizers.py b/dlt/normalize/items_normalizers.py index b9bd5468dc..6146d864b6 100644 --- a/dlt/normalize/items_normalizers.py +++ b/dlt/normalize/items_normalizers.py @@ -1,20 +1,20 @@ import os -from typing import List, Dict, Tuple, Protocol, Any -from pathlib import Path +from typing import List, Dict, Set, Tuple, Any from abc import abstractmethod from dlt.common import json, logger -from dlt.common.json import custom_pua_decode +from dlt.common.json import custom_pua_decode, may_have_pua from dlt.common.runtime import signals -from dlt.common.schema.typing import TTableSchemaColumns -from dlt.common.storages import NormalizeStorage, LoadStorage, NormalizeStorageConfiguration, FileStorage -from dlt.common.typing import TDataItem +from dlt.common.schema.typing import TSchemaEvolutionMode, TTableSchemaColumns, TSchemaContractDict +from dlt.common.storages import NormalizeStorage, LoadStorage, FileStorage +from dlt.common.typing import DictStrAny, TDataItem from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.utils import TRowCount, merge_row_count, increase_row_count -from dlt.normalize.configuration import NormalizeConfiguration from dlt.common.exceptions import MissingDependencyException from dlt.common.normalizers.utils import generate_dlt_ids +from dlt.normalize.configuration import NormalizeConfiguration + try: from dlt.common.libs 
import pyarrow from dlt.common.libs.pyarrow import pyarrow as pa @@ -44,54 +44,136 @@ def __call__(self, extracted_items_file: str, root_table_name: str) -> Tuple[Lis class JsonLItemsNormalizer(ItemsNormalizer): - def _normalize_chunk(self, root_table_name: str, items: List[TDataItem]) -> Tuple[TSchemaUpdate, int, TRowCount]: - column_schemas: Dict[ - str, TTableSchemaColumns - ] = {} # quick access to column schema for writers below + def __init__( + self, + load_storage: LoadStorage, + normalize_storage: NormalizeStorage, + schema: Schema, + load_id: str, + config: NormalizeConfiguration + ) -> None: + super().__init__(load_storage, normalize_storage, schema, load_id, config) + self._table_contracts: Dict[str, TSchemaContractDict] = {} + self._filtered_tables: Set[str] = set() + self._filtered_tables_columns: Dict[str, Dict[str, TSchemaEvolutionMode]] = {} + # quick access to column schema for writers below + self._column_schemas: Dict[str, TTableSchemaColumns] = {} + + def _filter_columns(self, filtered_columns: Dict[str, TSchemaEvolutionMode], row: DictStrAny) -> DictStrAny: + for name, mode in filtered_columns.items(): + if name in row: + if mode == "discard_row": + return None + elif mode == "discard_value": + row.pop(name) + return row + + def _normalize_chunk(self, root_table_name: str, items: List[TDataItem], may_have_pua: bool) -> Tuple[TSchemaUpdate, int, TRowCount]: + column_schemas = self._column_schemas schema_update: TSchemaUpdate = {} schema = self.schema schema_name = schema.name items_count = 0 row_counts: TRowCount = {} + normalize_data_fun = self.schema.normalize_data_item for item in items: - for (table_name, parent_table), row in self.schema.normalize_data_item( - item, self.load_id, root_table_name - ): - # filter row, may eliminate some or all fields - row = schema.filter_row(table_name, row) - # do not process empty rows - if row: + items_gen = normalize_data_fun(item, self.load_id, root_table_name) + try: + should_descend: bool = None + # use send to prevent descending into child rows when row was discarded + while row_info := items_gen.send(should_descend): + should_descend = True + (table_name, parent_table), row = row_info + + # rows belonging to filtered out tables are skipped + if table_name in self._filtered_tables: + # stop descending into further rows + should_descend = False + continue + + # filter row, may eliminate some or all fields + row = schema.filter_row(table_name, row) + # do not process empty rows + if not row: + should_descend = False + continue + + # filter columns or full rows if schema contract said so + # do it before schema inference in `coerce_row` to not trigger costly migration code + filtered_columns = self._filtered_tables_columns.get(table_name, None) + if filtered_columns: + row = self._filter_columns(filtered_columns, row) # type: ignore[arg-type] + # if whole row got dropped + if not row: + should_descend = False + continue + # decode pua types - for k, v in row.items(): - row[k] = custom_pua_decode(v) # type: ignore + if may_have_pua: + for k, v in row.items(): + row[k] = custom_pua_decode(v) # type: ignore + # coerce row of values into schema table, generating partial table with new columns if any row, partial_table = schema.coerce_row( table_name, parent_table, row ) - # theres a new table or new columns in existing table + + # if we detect a migration, check schema contract if partial_table: + schema_contract = self._table_contracts.setdefault( + table_name, + schema.resolve_contract_settings_for_table(parent_table or 
table_name) # parent_table, if present, exists in the schema + ) + partial_table, filters = schema.apply_schema_contract(schema_contract, partial_table, data_item=row) + if filters: + for entity, name, mode in filters: + if entity == "tables": + self._filtered_tables.add(name) + elif entity == "columns": + filtered_columns = self._filtered_tables_columns.setdefault(table_name, {}) + filtered_columns[name] = mode + + if partial_table is None: + # discard migration and row + should_descend = False + continue + # theres a new table or new columns in existing table # update schema and save the change schema.update_table(partial_table) table_updates = schema_update.setdefault(table_name, []) table_updates.append(partial_table) + # update our columns column_schemas[table_name] = schema.get_table_columns( table_name ) + + # apply new filters + if filtered_columns and filters: + row = self._filter_columns(filtered_columns, row) + # do not continue if new filters skipped the full row + if not row: + should_descend = False + continue + # get current columns schema columns = column_schemas.get(table_name) if not columns: columns = schema.get_table_columns(table_name) column_schemas[table_name] = columns # store row - # TODO: it is possible to write to single file from many processes using this: https://gitlab.com/warsaw/flufl.lock + # TODO: store all rows for particular items all together after item is fully completed + # will be useful if we implement bad data sending to a table self.load_storage.write_data_item( self.load_id, schema_name, table_name, row, columns ) # count total items + # TODO: take counts and bytes from buffered file writers instead of taking those here items_count += 1 increase_row_count(row_counts, table_name, 1) + except StopIteration: + pass signals.raise_if_signalled() return schema_update, items_count, row_counts @@ -102,12 +184,13 @@ def __call__( ) -> Tuple[List[TSchemaUpdate], int, TRowCount]: schema_updates: List[TSchemaUpdate] = [] row_counts: TRowCount = {} - with self.normalize_storage.storage.open_file(extracted_items_file) as f: + with self.normalize_storage.storage.open_file(extracted_items_file, "rb") as f: # enumerate jsonl file line by line items_count = 0 + line: bytes for line_no, line in enumerate(f): - items: List[TDataItem] = json.loads(line) - partial_update, items_count, r_counts = self._normalize_chunk(root_table_name, items) + items: List[TDataItem] = json.loadb(line) + partial_update, items_count, r_counts = self._normalize_chunk(root_table_name, items, may_have_pua(line)) schema_updates.append(partial_update) merge_row_count(row_counts, r_counts) logger.debug( @@ -134,6 +217,7 @@ def _write_with_dlt_columns( table_updates.append(table_update) load_id_type = pa.dictionary(pa.int8(), pa.string()) new_columns.append(( + -1, pa.field("_dlt_load_id", load_id_type, nullable=False), lambda batch: pa.array([load_id] * batch.num_rows, type=load_id_type) )) @@ -143,6 +227,7 @@ def _write_with_dlt_columns( table_updates = schema_update.setdefault(root_table_name, []) table_updates.append(table_update) new_columns.append(( + -1, pa.field("_dlt_id", pyarrow.pyarrow.string(), nullable=False), lambda batch: pa.array(generate_dlt_ids(batch.num_rows)) )) @@ -186,7 +271,6 @@ def __call__( self, extracted_items_file: str, root_table_name: str ) -> Tuple[List[TSchemaUpdate], int, TRowCount]: base_schema_update = self._fix_schema_precisions(root_table_name) - import pyarrow as pa add_dlt_id = self.config.parquet_normalizer.add_dlt_id add_dlt_load_id = 
self.config.parquet_normalizer.add_dlt_load_id diff --git a/dlt/normalize/normalize.py b/dlt/normalize/normalize.py index d60ea05965..ab87a5a2a1 100644 --- a/dlt/normalize/normalize.py +++ b/dlt/normalize/normalize.py @@ -14,7 +14,6 @@ from dlt.common.schema.utils import merge_schema_updates from dlt.common.storages.exceptions import SchemaNotFoundError from dlt.common.storages import NormalizeStorage, SchemaStorage, LoadStorage, LoadStorageConfiguration, NormalizeStorageConfiguration -from dlt.common.typing import TDataItem from dlt.common.schema import TSchemaUpdate, Schema from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.pipeline import NormalizeInfo @@ -52,7 +51,12 @@ def create_storages(self) -> None: # pass initial normalize storage config embedded in normalize config self.normalize_storage = NormalizeStorage(True, config=self.config._normalize_storage_config) # normalize saves in preferred format but can read all supported formats - self.load_storage = LoadStorage(True, self.config.destination_capabilities.preferred_loader_file_format, LoadStorage.ALL_SUPPORTED_FILE_FORMATS, config=self.config._load_storage_config) + self.load_storage = LoadStorage( + True, + self.config.destination_capabilities.preferred_loader_file_format, + LoadStorage.ALL_SUPPORTED_FILE_FORMATS, + config=self.config._load_storage_config + ) @staticmethod def load_or_create_schema(schema_storage: SchemaStorage, schema_name: str) -> Schema: @@ -237,7 +241,7 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMap self.load_storage.config, schema.to_dict(), load_id, - files, + files ) self.update_table(schema, result[0]) self.collector.update("Files", len(result[2])) @@ -246,14 +250,14 @@ def map_single(self, schema: Schema, load_id: str, files: Sequence[str]) -> TMap def spool_files(self, schema_name: str, load_id: str, map_f: TMapFuncType, files: Sequence[str]) -> None: schema = Normalize.load_or_create_schema(self.schema_storage, schema_name) - # process files in parallel or in single thread, depending on map_f schema_updates, row_counts = map_f(schema, load_id, files) - # logger.metrics("Normalize metrics", extra=get_logging_extras([self.schema_version_gauge.labels(schema_name)])) - if len(schema_updates) > 0: - logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") - # schema is updated, save it to schema volume - self.schema_storage.save_schema(schema) + # remove normalizer specific info + for table in schema.tables.values(): + table.pop("x-normalizer", None) # type: ignore[typeddict-item] + logger.info(f"Saving schema {schema_name} with version {schema.version}, writing manifest files") + # schema is updated, save it to schema volume + self.schema_storage.save_schema(schema) # save schema to temp load folder self.load_storage.save_temp_schema(schema, load_id) # save schema updates even if empty diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 71c37c40ba..0f173307a0 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -1,18 +1,19 @@ from typing import Sequence, cast, overload from dlt.common.schema import Schema -from dlt.common.schema.typing import TColumnSchema, TWriteDisposition +from dlt.common.schema.typing import TColumnSchema, TWriteDisposition, TSchemaContract from dlt.common.typing import TSecretValue, Any from dlt.common.configuration import with_config from dlt.common.configuration.container import Container from dlt.common.configuration.inject import 
get_orig_args, last_config -from dlt.common.destination.reference import DestinationReference, TDestinationReferenceArg +from dlt.common.destination import TLoaderFileFormat, Destination, TDestinationReferenceArg from dlt.common.pipeline import LoadInfo, PipelineContext, get_dlt_pipelines_dir from dlt.pipeline.configuration import PipelineConfiguration, ensure_correct_pipeline_kwargs from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.progress import _from_name as collector_from_name, TCollectorArg, _NULL_COLLECTOR +from dlt.pipeline.deprecations import credentials_argument_deprecated @overload @@ -104,6 +105,8 @@ def pipeline( # is any of the arguments different from defaults has_arguments = bool(orig_args[0]) or any(orig_args[1].values()) + credentials_argument_deprecated("pipeline", credentials, destination) + if not has_arguments: context = Container()[PipelineContext] # if pipeline instance is already active then return it, otherwise create a new one @@ -116,8 +119,8 @@ def pipeline( if not pipelines_dir: pipelines_dir = get_dlt_pipelines_dir() - destination = DestinationReference.from_name(destination or kwargs["destination_name"]) - staging = DestinationReference.from_name(staging or kwargs.get("staging_name", None)) if staging is not None else None + destination = Destination.from_reference(destination or kwargs["destination_name"]) + staging = Destination.from_reference(staging or kwargs.get("staging_name", None)) if staging is not None else None progress = collector_from_name(progress) # create new pipeline instance @@ -174,7 +177,9 @@ def run( table_name: str = None, write_disposition: TWriteDisposition = None, columns: Sequence[TColumnSchema] = None, - schema: Schema = None + schema: Schema = None, + loader_file_format: TLoaderFileFormat = None, + schema_contract: TSchemaContract = None, ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -224,7 +229,7 @@ def run( Returns: LoadInfo: Information on loaded data including the list of package ids and failed job statuses. Please not that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo. """ - destination = DestinationReference.from_name(destination) + destination = Destination.from_reference(destination, credentials=credentials) return pipeline().run( data, destination=destination, @@ -234,7 +239,9 @@ def run( table_name=table_name, write_disposition=write_disposition, columns=columns, - schema=schema + schema=schema, + loader_file_format=loader_file_format, + schema_contract=schema_contract ) # plug default tracking module diff --git a/dlt/pipeline/deprecations.py b/dlt/pipeline/deprecations.py new file mode 100644 index 0000000000..138167c8d3 --- /dev/null +++ b/dlt/pipeline/deprecations.py @@ -0,0 +1,20 @@ +import typing as t +import warnings + +from dlt.common.destination import Destination, TDestinationReferenceArg + + +def credentials_argument_deprecated( + caller_name: str, credentials: t.Optional[t.Any], destination: TDestinationReferenceArg = None +) -> None: + if credentials is None: + return + + dest_name = Destination.to_name(destination) if destination else "postgres" + + warnings.warn( + f"The `credentials argument` to {caller_name} is deprecated and will be removed in a future version. " + f"Pass the same credentials to the `destination` instance instead, e.g. 
{caller_name}(destination=dlt.destinations.{dest_name}(credentials=...))", + DeprecationWarning, + stacklevel=2, + ) diff --git a/dlt/pipeline/exceptions.py b/dlt/pipeline/exceptions.py index 4b283a17e7..0289c07158 100644 --- a/dlt/pipeline/exceptions.py +++ b/dlt/pipeline/exceptions.py @@ -55,6 +55,14 @@ def __init__(self, pipeline_name: str, pipelines_dir: str) -> None: ) super().__init__(pipeline_name, msg) +class PipelineNeverRan(PipelineException): + def __init__(self, pipeline_name: str, pipelines_dir: str) -> None: + msg = ( + f" Operation failed because pipeline with name {pipeline_name} in working directory {pipelines_dir} was never run or never synced with destination. " + "Use `dlt pipeline sync` to synchronize." + ) + super().__init__(pipeline_name, msg) + class PipelineNotActive(PipelineException): def __init__(self, pipeline_name: str) -> None: diff --git a/dlt/pipeline/helpers.py b/dlt/pipeline/helpers.py index ef4fe70664..ebb85f5e23 100644 --- a/dlt/pipeline/helpers.py +++ b/dlt/pipeline/helpers.py @@ -11,7 +11,7 @@ from dlt.common.destination.reference import WithStagingDataset from dlt.destinations.exceptions import DatabaseUndefinedRelation -from dlt.pipeline.exceptions import PipelineStepFailed, PipelineHasPendingDataException +from dlt.pipeline.exceptions import PipelineNeverRan, PipelineStepFailed, PipelineHasPendingDataException from dlt.pipeline.typing import TPipelineStep from dlt.pipeline import Pipeline @@ -71,6 +71,9 @@ def __init__( if isinstance(state_paths, str): state_paths = [state_paths] + if not pipeline.default_schema_name: + raise PipelineNeverRan(pipeline.pipeline_name, pipeline.pipelines_dir) + self.schema = pipeline.schemas[schema_name or pipeline.default_schema_name].clone() self.schema_tables = self.schema.tables self.drop_tables = not state_only diff --git a/dlt/pipeline/mark.py b/dlt/pipeline/mark.py index 5f880d8711..14a7108683 100644 --- a/dlt/pipeline/mark.py +++ b/dlt/pipeline/mark.py @@ -1,2 +1,2 @@ """Module with market functions that make data to be specially processed""" -from dlt.extract.source import with_table_name \ No newline at end of file +from dlt.extract import with_table_name \ No newline at end of file diff --git a/dlt/pipeline/pipeline.py b/dlt/pipeline/pipeline.py index a719c0ce18..b9eb958027 100644 --- a/dlt/pipeline/pipeline.py +++ b/dlt/pipeline/pipeline.py @@ -5,7 +5,6 @@ from functools import wraps from collections.abc import Sequence as C_Sequence from typing import Any, Callable, ClassVar, List, Iterator, Optional, Sequence, Tuple, cast, get_type_hints, ContextManager -from concurrent.futures import Executor from dlt import version from dlt.common import json, logger, pendulum @@ -19,13 +18,14 @@ MissingDependencyException, DestinationUndefinedEntity, DestinationIncompatibleLoaderFileFormatException) from dlt.common.normalizers import explicit_normalizers, import_normalizers from dlt.common.runtime import signals, initialize_runtime -from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns +from dlt.common.schema.typing import TColumnNames, TColumnSchema, TSchemaTables, TWriteDisposition, TAnySchemaColumns, TSchemaContract +from dlt.common.schema.utils import normalize_schema_name from dlt.common.storages.load_storage import LoadJobInfo, LoadPackageInfo from dlt.common.typing import TFun, TSecretValue, is_optional_type from dlt.common.runners import pool_runner as runner from dlt.common.storages import LiveSchemaStorage, NormalizeStorage, LoadStorage, 
SchemaStorage, FileStorage, NormalizeStorageConfiguration, SchemaStorageConfiguration, LoadStorageConfiguration -from dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.reference import (DestinationClientDwhConfiguration, WithStateSync, DestinationReference, JobClientBase, DestinationClientConfiguration, +from dlt.common.destination import DestinationCapabilitiesContext, TDestination +from dlt.common.destination.reference import (DestinationClientDwhConfiguration, WithStateSync, Destination, JobClientBase, DestinationClientConfiguration, TDestinationReferenceArg, DestinationClientStagingConfiguration, DestinationClientStagingConfiguration, DestinationClientDwhWithStagingConfiguration) from dlt.common.destination.capabilities import INTERNAL_LOADER_FILE_FORMATS @@ -34,9 +34,9 @@ from dlt.common.utils import is_interactive from dlt.common.data_writers import TLoaderFileFormat +from dlt.extract import DltResource, DltSource from dlt.extract.exceptions import SourceExhausted from dlt.extract.extract import ExtractorStorage, extract_with_schema -from dlt.extract.source import DltResource, DltSource from dlt.normalize import Normalize from dlt.normalize.configuration import NormalizeConfiguration from dlt.destinations.sql_client import SqlClientBase @@ -50,8 +50,7 @@ from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace, load_trace, merge_traces, start_trace, start_trace_step, end_trace_step, end_trace, describe_extract_data from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.state_sync import STATE_ENGINE_VERSION, load_state_from_destination, merge_state_if_changed, migrate_state, state_resource, json_encode_state, json_decode_state - -from dlt.common.schema.utils import normalize_schema_name +from dlt.pipeline.deprecations import credentials_argument_deprecated def with_state_sync(may_extract_state: bool = False) -> Callable[[TFun], TFun]: @@ -81,8 +80,11 @@ def _wrap(self: "Pipeline", *args: Any, **kwargs: Any) -> Any: # refresh live schemas in storage or import schema path self._schema_storage.commit_live_schema(name) rv = f(self, *args, **kwargs) + # save modified live schemas + for name in self._schema_storage.live_schemas: + self._schema_storage.commit_live_schema(name) # refresh list of schemas if any new schemas are added - self.schema_names = self._schema_storage.list_schemas() + self.schema_names = self._list_schemas_sorted() return rv return _wrap # type: ignore @@ -166,9 +168,9 @@ class Pipeline(SupportsPipeline): """A directory where the pipelines' working directories are created""" working_dir: str """A working directory of the pipeline""" - destination: DestinationReference = None - staging: DestinationReference = None - """The destination reference which is ModuleType. `destination.__name__` returns the name string""" + destination: TDestination = None + staging: TDestination = None + """The destination reference which is ModuleType. 
`destination.name` returns the name string""" dataset_name: str = None """Name of the dataset to which pipeline will be loaded to""" credentials: Any = None @@ -183,8 +185,8 @@ def __init__( pipeline_name: str, pipelines_dir: str, pipeline_salt: TSecretValue, - destination: DestinationReference, - staging: DestinationReference, + destination: TDestination, + staging: TDestination, dataset_name: str, credentials: Any, import_schema_path: str, @@ -267,7 +269,8 @@ def extract( primary_key: TColumnNames = None, schema: Schema = None, max_parallel_items: int = None, - workers: int = None + workers: int = None, + schema_contract: TSchemaContract = None ) -> ExtractInfo: """Extracts the `data` and prepare it for the normalization. Does not require destination or credentials to be configured. See `run` method for the arguments' description.""" # create extract storage to which all the sources will be extracted @@ -276,7 +279,7 @@ def extract( try: with self._maybe_destination_capabilities(): # extract all sources - for source in self._data_to_sources(data, schema, table_name, parent_table_name, write_disposition, columns, primary_key): + for source in self._data_to_sources(data, schema, table_name, parent_table_name, write_disposition, columns, primary_key, schema_contract): if source.exhausted: raise SourceExhausted(source.name) # TODO: merge infos for all the sources @@ -287,6 +290,7 @@ def extract( # TODO: if we fail here we should probably wipe out the whole extract folder for extract_id in extract_ids: storage.commit_extract_files(extract_id) + return ExtractInfo(describe_extract_data(data)) except Exception as exc: # TODO: provide metrics from extractor @@ -342,6 +346,9 @@ def load( # set destination and default dataset if provided self._set_destinations(destination, None) self._set_dataset_name(dataset_name) + + credentials_argument_deprecated("pipeline.load", credentials, destination) + self.credentials = credentials or self.credentials # check if any schema is present, if not then no data was extracted @@ -390,7 +397,8 @@ def run( columns: TAnySchemaColumns = None, primary_key: TColumnNames = None, schema: Schema = None, - loader_file_format: TLoaderFileFormat = None + loader_file_format: TLoaderFileFormat = None, + schema_contract: TSchemaContract = None ) -> LoadInfo: """Loads the data from `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. @@ -440,6 +448,8 @@ def run( loader_file_format (Literal["jsonl", "insert_values", "parquet"], optional). The file format the loader will use to create the load package. Not all file_formats are compatible with all destinations. Defaults to the preferred file format of the selected destination. + schema_contract (TSchemaContract, optional): On override for the schema contract settings, this will replace the schema contract settings for all tables in the schema. Defaults to None. + Raises: PipelineStepFailed when a problem happened during `extract`, `normalize` or `load` steps. 
Returns: @@ -449,6 +459,8 @@ def run( self._set_destinations(destination, staging) self._set_dataset_name(dataset_name) + credentials_argument_deprecated("pipeline.run", credentials, self.destination) + # sync state with destination if self.config.restore_from_destination and not self.full_refresh and not self._state_restored and (self.destination or destination): self.sync_destination(destination, staging, dataset_name) @@ -464,9 +476,10 @@ def run( logger.warn("The pipeline `run` method will now load the pending load packages. The data you passed to the run function will not be loaded. In order to do that you must run the pipeline again") return self.load(destination, dataset_name, credentials=credentials) + # extract from the source if data is not None: - self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema) + self.extract(data, table_name=table_name, write_disposition=write_disposition, columns=columns, primary_key=primary_key, schema=schema, schema_contract=schema_contract) self.normalize(loader_file_format=loader_file_format) return self.load(destination, dataset_name, credentials=credentials) else: @@ -595,7 +608,7 @@ def has_data(self) -> bool: @property def has_pending_data(self) -> bool: """Tells if the pipeline contains any extracted files or pending load packages""" - return bool(self.list_normalized_load_packages() or self.list_extracted_resources()) + return len(self.list_normalized_load_packages()) > 0 or len(self.list_extracted_resources()) > 0 @property def schemas(self) -> SchemaStorage: @@ -623,7 +636,7 @@ def list_extracted_resources(self) -> Sequence[str]: def list_normalized_load_packages(self) -> Sequence[str]: """Returns a list of all load packages ids that are or will be loaded.""" - return self._get_load_storage().list_packages() + return self._get_load_storage().list_normalized_packages() def list_completed_load_packages(self) -> Sequence[str]: """Returns a list of all load package ids that are completely loaded""" @@ -637,6 +650,20 @@ def list_failed_jobs_in_package(self, load_id: str) -> Sequence[LoadJobInfo]: """List all failed jobs and associated error messages for a specified `load_id`""" return self._get_load_storage().get_load_package_info(load_id).jobs.get("failed_jobs", []) + def drop_pending_packages(self, with_partial_loads: bool = True) -> None: + """Deletes all extracted and normalized packages, including those that are partially loaded by default""" + # delete normalized packages + load_storage = self._get_load_storage() + for load_id in load_storage.list_normalized_packages(): + package_info = load_storage.get_load_package_info(load_id) + if LoadStorage.is_package_partially_loaded(package_info) and not with_partial_loads: + continue + package_path = load_storage.get_normalized_package_path(load_id) + load_storage.storage.delete_folder(package_path, recursively=True) + # delete extracted files + normalize_storage = self._get_normalize_storage() + normalize_storage.delete_extracted_files(normalize_storage.list_files_to_normalize_sorted()) + @with_schemas_sync def sync_schema(self, schema_name: str = None, credentials: Any = None) -> TSchemaTables: """Synchronizes the schema `schema_name` with the destination. 
If no name is provided, the default schema will be synchronized.""" @@ -718,7 +745,7 @@ def _sql_job_client(self, schema: Schema, credentials: Any = None) -> SqlJobClie if isinstance(client, SqlJobClientBase): return client else: - raise SqlClientNotAvailable(self.pipeline_name, self.destination.__name__) + raise SqlClientNotAvailable(self.pipeline_name, self.destination.name) def _get_normalize_storage(self) -> NormalizeStorage: return NormalizeStorage(True, self._normalize_storage_config) @@ -788,21 +815,34 @@ def _data_to_sources(self, parent_table_name: str = None, write_disposition: TWriteDisposition = None, columns: TAnySchemaColumns = None, - primary_key: TColumnNames = None + primary_key: TColumnNames = None, + schema_contract: TSchemaContract = None ) -> List[DltSource]: def apply_hint_args(resource: DltResource) -> None: - # apply hints only if any of the hints is present, table_name must be always present - if table_name or parent_table_name or write_disposition or columns or primary_key: - resource.apply_hints(table_name or resource.table_name or resource.name, parent_table_name, write_disposition, columns, primary_key) + resource.apply_hints( + table_name, + parent_table_name, + write_disposition, + columns, + primary_key, + schema_contract=schema_contract + ) + + def apply_settings(source_: DltSource) -> None: + # apply schema contract settings + if schema_contract: + source_.schema_contract = schema_contract def choose_schema() -> Schema: """Except of explicitly passed schema, use a clone that will get discarded if extraction fails""" if schema: - return schema - if self.default_schema_name: - return self.default_schema.clone() - return self._make_schema_with_default_name() + schema_ = schema + elif self.default_schema_name: + schema_ = self.default_schema.clone() + else: + schema_ = self._make_schema_with_default_name() + return schema_ effective_schema = choose_schema() @@ -815,26 +855,19 @@ def append_data(data_item: Any) -> None: # if schema is explicit then override source schema if schema: data_item.schema = schema - # try to apply hints to resources - _resources = data_item.resources.values() - for r in _resources: - apply_hint_args(r) sources.append(data_item) elif isinstance(data_item, DltResource): - # apply hints - apply_hint_args(data_item) # do not set section to prevent source that represent a standalone resource # to overwrite other standalone resources (ie. 
parents) in that source sources.append( - DltSource("", effective_schema, [data_item]) + DltSource(effective_schema.name, "", effective_schema, [data_item]) ) else: # iterator/iterable/generator # create resource first without table template - resource = DltResource.from_data(data_item, name=table_name, section=self.pipeline_name) - # apply hints - apply_hint_args(resource) - resources.append(resource) + resources.append( + DltResource.from_data(data_item, name=table_name, section=self.pipeline_name) + ) if isinstance(data, C_Sequence) and len(data) > 0: # if first element is source or resource @@ -846,40 +879,46 @@ def append_data(data_item: Any) -> None: else: append_data(data) + # add all the appended resources in one source if resources: - # add all the appended resources in one source - sources.append(DltSource( self.pipeline_name, effective_schema, resources)) + sources.append(DltSource(effective_schema.name, self.pipeline_name, effective_schema, resources)) + + # apply hints and settings + for source in sources: + apply_settings(source) + for resource in source.selected_resources.values(): + apply_hint_args(resource) return sources def _extract_source(self, storage: ExtractorStorage, source: DltSource, max_parallel_items: int, workers: int) -> str: - # discover the schema from source - source_schema = source.schema - source_schema.update_normalizers() + # discover the existing pipeline schema + if source.schema.name in self.schemas: + # use clone until extraction complete + pipeline_schema = self.schemas[source.schema.name].clone() + # apply all changes in the source schema to pipeline schema + # NOTE: we do not apply contracts to changes done programmatically + pipeline_schema.update_schema(source.schema) + # replace schema in the source + source.schema = pipeline_schema # extract into pipeline schema - extract_id = extract_with_schema(storage, source, source_schema, self.collector, max_parallel_items, workers) + extract_id = extract_with_schema(storage, source, self.collector, max_parallel_items, workers) # save import with fully discovered schema - self._schema_storage.save_import_schema_if_not_exists(source_schema) + self._schema_storage.save_import_schema_if_not_exists(source.schema) - # if source schema does not exist in the pipeline - if source_schema.name not in self._schema_storage: - # create new schema - self._schema_storage.save_schema(source_schema) - - # update pipeline schema (do contract checks here) - pipeline_schema = self._schema_storage[source_schema.name] - pipeline_schema.update_schema(source_schema) + # update live schema but not update the store yet + self._schema_storage.update_live_schema(source.schema) # set as default if this is first schema in pipeline if not self.default_schema_name: # this performs additional validations as schema contains the naming module - self._set_default_schema_name(pipeline_schema) + self._set_default_schema_name(source.schema) return extract_id - def _get_destination_client_initial_config(self, destination: DestinationReference = None, credentials: Any = None, as_staging: bool = False) -> DestinationClientConfiguration: + def _get_destination_client_initial_config(self, destination: TDestination = None, credentials: Any = None, as_staging: bool = False) -> DestinationClientConfiguration: destination = destination or self.destination if not destination: raise PipelineConfigMissing( @@ -889,7 +928,7 @@ def _get_destination_client_initial_config(self, destination: DestinationReferen "Please provide `destination` argument to 
`pipeline`, `run` or `load` method directly or via .dlt config.toml file or environment variable." ) # create initial destination client config - client_spec = destination.spec() + client_spec = destination.spec # initialize explicit credentials if not as_staging: # explicit credentials passed to dlt.pipeline should not be applied to staging @@ -985,17 +1024,19 @@ def _set_context(self, is_active: bool) -> None: del self._container[DestinationCapabilitiesContext] def _set_destinations(self, destination: TDestinationReferenceArg, staging: TDestinationReferenceArg) -> None: - destination_mod = DestinationReference.from_name(destination) - self.destination = destination_mod or self.destination + # destination_mod = DestinationReference.from_name(destination) + if destination: + self.destination = Destination.from_reference(destination) if destination and not self.destination.capabilities().supported_loader_file_formats and not staging: - logger.warning(f"The destination {destination_mod.__name__} requires the filesystem staging destination to be set, but it was not provided. Setting it to 'filesystem'.") + logger.warning(f"The destination {self.destination.name} requires the filesystem staging destination to be set, but it was not provided. Setting it to 'filesystem'.") staging = "filesystem" if staging: - staging_module = DestinationReference.from_name(staging) - if staging_module and not issubclass(staging_module.spec(), DestinationClientStagingConfiguration): - raise DestinationNoStagingMode(staging_module.__name__) + # staging_module = DestinationReference.from_name(staging) + staging_module = Destination.from_reference(staging) + if staging_module and not issubclass(staging_module.spec, DestinationClientStagingConfiguration): + raise DestinationNoStagingMode(staging_module.name) self.staging = staging_module or self.staging with self._maybe_destination_capabilities(): @@ -1014,8 +1055,10 @@ def _maybe_destination_capabilities(self, loader_file_format: TLoaderFileFormat caps = injected_caps.__enter__() caps.preferred_loader_file_format = self._resolve_loader_file_format( - DestinationReference.to_name(self.destination), - DestinationReference.to_name(self.staging) if self.staging else None, + self.destination.name, + # DestinationReference.to_name(self.destination), + self.staging.name if self.staging else None, + # DestinationReference.to_name(self.staging) if self.staging else None, destination_caps, stage_caps, loader_file_format) caps.supported_loader_file_formats = ( destination_caps.supported_staging_file_formats if stage_caps else None @@ -1143,12 +1186,12 @@ def _restore_state_from_destination(self) -> Optional[TPipelineState]: if isinstance(job_client, WithStateSync): state = load_state_from_destination(self.pipeline_name, job_client) if state is None: - logger.info(f"The state was not found in the destination {self.destination.__name__}:{dataset_name}") + logger.info(f"The state was not found in the destination {self.destination.name}:{dataset_name}") else: - logger.info(f"The state was restored from the destination {self.destination.__name__}:{dataset_name}") + logger.info(f"The state was restored from the destination {self.destination.name}:{dataset_name}") else: state = None - logger.info(f"Destination does not support metadata storage {self.destination.__name__}:{dataset_name}") + logger.info(f"Destination does not support metadata storage {self.destination.name}:{dataset_name}") return state finally: # restore the use_single_dataset option @@ -1163,17 +1206,17 @@ def 
_get_schemas_from_destination(self, schema_names: Sequence[str], always_down if not self._schema_storage.has_schema(schema.name) or always_download: with self._get_destination_clients(schema)[0] as job_client: if not isinstance(job_client, WithStateSync): - logger.info(f"Destination does not support metadata storage {self.destination.__name__}") + logger.info(f"Destination does not support metadata storage {self.destination.name}") return restored_schemas schema_info = job_client.get_stored_schema() if schema_info is None: - logger.info(f"The schema {schema.name} was not found in the destination {self.destination.__name__}:{self.dataset_name}") + logger.info(f"The schema {schema.name} was not found in the destination {self.destination.name}:{self.dataset_name}") # try to import schema with contextlib.suppress(FileNotFoundError): self._schema_storage.load_schema(schema.name) else: schema = Schema.from_dict(json.loads(schema_info.schema)) - logger.info(f"The schema {schema.name} version {schema.version} hash {schema.stored_version_hash} was restored from the destination {self.destination.__name__}:{self.dataset_name}") + logger.info(f"The schema {schema.name} version {schema.version} hash {schema.stored_version_hash} was restored from the destination {self.destination.name}:{self.dataset_name}") restored_schemas.append(schema) return restored_schemas @@ -1188,12 +1231,6 @@ def managed_state(self, *, extract_state: bool = False) -> Iterator[TPipelineSta backup_state = self._get_state() # restore original pipeline props self._state_to_props(backup_state) - # synchronize schema storage with initial list of schemas, note that we'll not be able to synchronize the schema content - if self._schema_storage: - # TODO: we should restore schemas backup here - for existing_schema_name in self._schema_storage.list_schemas(): - if existing_schema_name not in self.schema_names: - self._schema_storage.remove_schema(existing_schema_name) # raise original exception raise else: @@ -1230,7 +1267,7 @@ def _state_to_props(self, state: TPipelineState) -> None: if prop in state["_local"] and not prop.startswith("_"): setattr(self, prop, state["_local"][prop]) # type: ignore if "destination" in state: - self._set_destinations(DestinationReference.from_name(self.destination), DestinationReference.from_name(self.staging) if "staging" in state else None ) + self._set_destinations(self.destination, self.staging if "staging" in state else None ) def _props_to_state(self, state: TPipelineState) -> None: """Write pipeline props to `state`""" @@ -1241,10 +1278,14 @@ def _props_to_state(self, state: TPipelineState) -> None: if not prop.startswith("_"): state["_local"][prop] = getattr(self, prop) # type: ignore if self.destination: - state["destination"] = self.destination.__name__ + state["destination"] = self.destination.name if self.staging: - state["staging"] = self.staging.__name__ - state["schema_names"] = self._schema_storage.list_schemas() + state["staging"] = self.staging.name + state["schema_names"] = self._list_schemas_sorted() + + def _list_schemas_sorted(self) -> List[str]: + """Lists schema names sorted to have deterministic state""" + return sorted(self._schema_storage.list_schemas()) def _save_state(self, state: TPipelineState) -> None: self._pipeline_storage.save(Pipeline.STATE_FILE, json_encode_state(state)) @@ -1252,9 +1293,9 @@ def _save_state(self, state: TPipelineState) -> None: def _extract_state(self, state: TPipelineState) -> TPipelineState: # this will extract the state into current load package and 
update the schema with the _dlt_pipeline_state table # note: the schema will be persisted because the schema saving decorator is over the state manager decorator for extract - state_source = DltSource(self.pipeline_name, self.default_schema, [state_resource(state)]) + state_source = DltSource(self.default_schema.name, self.pipeline_name, self.default_schema, [state_resource(state)]) storage = ExtractorStorage(self._normalize_storage_config) - extract_id = extract_with_schema(storage, state_source, self.default_schema, _NULL_COLLECTOR, 1, 1) + extract_id = extract_with_schema(storage, state_source, _NULL_COLLECTOR, 1, 1) storage.commit_extract_files(extract_id) return state diff --git a/dlt/pipeline/state_sync.py b/dlt/pipeline/state_sync.py index 581ed4c2bd..a9603b8f66 100644 --- a/dlt/pipeline/state_sync.py +++ b/dlt/pipeline/state_sync.py @@ -12,7 +12,7 @@ from dlt.common.schema.typing import STATE_TABLE_NAME, TTableSchemaColumns from dlt.common.destination.reference import JobClientBase, WithStateSync -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException from dlt.common.utils import compressed_b64decode, compressed_b64encode diff --git a/dlt/pipeline/trace.py b/dlt/pipeline/trace.py index 2ba71396f6..46ab524aa1 100644 --- a/dlt/pipeline/trace.py +++ b/dlt/pipeline/trace.py @@ -14,7 +14,7 @@ from dlt.common.typing import DictStrAny, StrAny from dlt.common.utils import uniq_id -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.exceptions import PipelineStepFailed diff --git a/dlt/pipeline/track.py b/dlt/pipeline/track.py index ec42bc788f..07e9a2d137 100644 --- a/dlt/pipeline/track.py +++ b/dlt/pipeline/track.py @@ -9,7 +9,7 @@ from dlt.common.runtime.segment import track as dlthub_telemetry_track from dlt.common.runtime.slack import send_slack_message from dlt.common.pipeline import LoadInfo, ExtractInfo, SupportsPipeline -from dlt.common.destination import DestinationReference +from dlt.common.destination import Destination from dlt.pipeline.typing import TPipelineStep from dlt.pipeline.trace import PipelineTrace, PipelineStepTrace @@ -21,7 +21,7 @@ def _add_sentry_tags(span: Span, pipeline: SupportsPipeline) -> None: span.set_tag("pipeline_name", pipeline.pipeline_name) if pipeline.destination: - span.set_tag("destination", pipeline.destination.__name__) + span.set_tag("destination", pipeline.destination.name) if pipeline.dataset_name: span.set_tag("dataset_name", pipeline.dataset_name) except ImportError: @@ -87,7 +87,7 @@ def on_end_trace_step(trace: PipelineTrace, step: PipelineStepTrace, pipeline: S props = { "elapsed": (step.finished_at - trace.started_at).total_seconds(), "success": step.step_exception is None, - "destination_name": DestinationReference.to_name(pipeline.destination) if pipeline.destination else None, + "destination_name": pipeline.destination.name if pipeline.destination else None, "pipeline_name_hash": digest128(pipeline.pipeline_name), "dataset_name_hash": digest128(pipeline.dataset_name) if pipeline.dataset_name else None, "default_schema_name_hash": digest128(pipeline.default_schema_name) if pipeline.default_schema_name else None, @@ -107,4 +107,4 @@ def on_end_trace(trace: PipelineTrace, pipeline: SupportsPipeline) -> None: if pipeline.runtime_config.sentry_dsn: # print(f"---END SENTRY TX: {trace.transaction_id} SCOPE: {Hub.current.scope}") 
with contextlib.suppress(Exception): - Hub.current.scope.span.__exit__(None, None, None) \ No newline at end of file + Hub.current.scope.span.__exit__(None, None, None) diff --git a/dlt/reflection/script_inspector.py b/dlt/reflection/script_inspector.py index 204135dcd7..9899e2b157 100644 --- a/dlt/reflection/script_inspector.py +++ b/dlt/reflection/script_inspector.py @@ -12,7 +12,8 @@ from dlt.common.typing import DictStrAny from dlt.pipeline import Pipeline -from dlt.extract.source import DltSource, ManagedPipeIterator +from dlt.extract import DltSource +from dlt.extract.pipe import ManagedPipeIterator def patch__init__(self: Any, *args: Any, **kwargs: Any) -> None: diff --git a/dlt/sources/__init__.py b/dlt/sources/__init__.py index 6e418a3cb2..465467db67 100644 --- a/dlt/sources/__init__.py +++ b/dlt/sources/__init__.py @@ -1,7 +1,6 @@ """Module with built in sources and source building blocks""" -from dlt.extract.incremental import Incremental as incremental -from dlt.extract.source import DltSource, DltResource from dlt.common.typing import TDataItem, TDataItems +from dlt.extract import DltSource, DltResource, Incremental as incremental from . import credentials from . import config from . import filesystem diff --git a/docs/examples/archive/sources/rasa/rasa.py b/docs/examples/archive/sources/rasa/rasa.py index aa31b3c482..b498f9c3de 100644 --- a/docs/examples/archive/sources/rasa/rasa.py +++ b/docs/examples/archive/sources/rasa/rasa.py @@ -3,7 +3,7 @@ import dlt from dlt.common.typing import StrAny, TDataItem, TDataItems from dlt.common.time import timestamp_within -from dlt.extract.source import DltResource +from dlt.extract.resource import DltResource @dlt.source diff --git a/docs/examples/incremental_loading/zendesk.py b/docs/examples/incremental_loading/zendesk.py index 6370f29811..3f433e3fef 100644 --- a/docs/examples/incremental_loading/zendesk.py +++ b/docs/examples/incremental_loading/zendesk.py @@ -1,10 +1,9 @@ -from typing import Iterator, Optional, Dict, Any, Tuple +from typing import Optional, Dict, Any, Tuple import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime -from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime -from dlt.extract.source import DltResource +from dlt.common.typing import TAnyDateTime from dlt.sources.helpers.requests import client diff --git a/docs/technical/secrets_and_config.md b/docs/technical/secrets_and_config.md deleted file mode 100644 index 423767293d..0000000000 --- a/docs/technical/secrets_and_config.md +++ /dev/null @@ -1,436 +0,0 @@ -# Secrets and Configs -marks features that are: - -⛔ not implemented, hard to add - -☮️ not implemented, easy to add - -## General Usage and an Example -The way config values and secrets are handled should promote correct behavior - -1. secret values should never be present in the pipeline code -2. pipeline may be reconfigured for production after it is deployed. deployed and local code should be identical -3. 
still it must be easy and intuitive - -For the source extractor function below (reads selected tab from google sheets) we can pass config values in following ways: - -```python - -import dlt - - -@dlt.source -def google_sheets(spreadsheet_id, tab_names=dlt.config.value, credentials=dlt.secrets.value, only_strings=False): - sheets = build('sheets', 'v4', credentials=Services.from_json(credentials)) - tabs = [] - for tab_name in tab_names: - data = sheets.get(spreadsheet_id, tab_name).execute().values() - tabs.append(dlt.resource(data, name=tab_name)) - return tabs - -# WRONG: provide all values directly - wrong but possible. secret values should never be present in the code! -google_sheets("23029402349032049", ["tab1", "tab2"], credentials={"private_key": ""}).run(destination="bigquery") - -# OPTION A: provide config values directly and secrets via automatic injection mechanism (see later) -# `credentials` value will be injected by the `source` decorator -# `spreadsheet_id` and `tab_names` take values from the arguments below -# `only_strings` will be injected by the source decorator or will get the default value False -google_sheets("23029402349032049", ["tab1", "tab2"]).run(destination="bigquery") - - -# OPTION B: use `dlt.secrets` and `dlt.config` to explicitly take those values from providers from the explicit keys -google_sheets(dlt.config["sheet_id"], dlt.config["my_section.tabs"], dlt.secrets["my_section.gcp_credentials"]).run(destination="bigquery") -``` - -> one of the principles is that configuration, credentials and secret values are may be passed explicitly as arguments to the functions. this makes the injection behavior optional. - -## Injection mechanism -Config and secret values are injected to the function arguments if the function is decorated with `@dlt.source` or `@dlt resource` (also `@with_config` which you can applu to any function - used havily in the dlt core) - -The signature of the function `google_sheets` is **explicitly accepting all the necessary configuration and secrets in its arguments**. During runtime, `dlt` tries to supply (`inject`) the required values via various config providers. The injection rules are: -1. if you call the decorated function, the arguments that are passed explicitly are **never injected** -this makes injection mechanism optional - -2. required arguments (ie. `spreadsheet_id`, `tab_names`) are not injected -3. arguments with default values are injected if present in config providers -4. arguments with the special default value `dlt.secrets.value` and `dlt.config.value` **must be injected** (or expicitly passed). If they are not found by the config providers the code raises exception. The code in the functions always receives those arguments. - -additionally `dlt.secrets.value` tells `dlt` that supplied value is a secret and it will be injected only from secure config providers - -## Passing config values and credentials explicitly - -```python -# OPTION B: use `dlt.secrets` and `dlt.config` to explicitly take those values from providers from the explicit keys -google_sheets(dlt.config["sheet_id"], dlt.config["tabs"], dlt.secrets["my_section.gcp_credentials"]).run(destination="bigquery") -``` - -[See example](/docs/examples/credentials/explicit.py) - - -## Typing the source and resource signatures - -You should type your function signatures! The effort is very low and it gives `dlt` much more information on what source/resource expects. -1. You'll never receive invalid type signatures -2. 
We can generate nice sample config and secret files for your source -3. You can request dictionaries or special values (ie. connection strings, service json) to be passed -4. ☮️ you can specify a set of possible types via `Union` ie. OAUTH or Api Key authorization - -```python -@dlt.source -def google_sheets(spreadsheet_id: str, tab_names: List[str] = dlt.config.value, credentials: GcpServiceAccountCredentials = dlt.secrets.value, only_strings: bool = False): - ... -``` -Now: -1. you are sure that you get a list of strings as `tab_names` -2. you will get actual google credentials (see `CredentialsConfiguration` later) and your users can pass them in many different forms. - -In case of `GcpServiceAccountCredentials` -* you may just pass the `service_json` as string or dictionary (in code and via config providers) -* you may pass a connection string (used in sql alchemy) (in code and via config providers) -* or default credentials will be used - - -## Providers -If function signature has arguments that may be injected, `dlt` looks for the argument values in providers. **The argument name is a key in the lookup**. In case of `google_sheets()` it will look for: `tab_names`, `credentials` and `strings_only`. - -Each provider has its own key naming convention and dlt is able to translate between them. - -Providers form a hierarchy. At the top are environment variables, then `secrets.toml` and `config.toml` files. Providers like google, aws, azure vaults can be inserted after the environment provider. - -For example if `spreadsheet_id` is in environment, dlt does not look into other providers. - -The values passed in the code explitly are the **highest** in provider hierarchy. -The default values of the arguments have the **lowest** priority in the provider hierarchy. - -> **Summary of the hierarchy** -> explicit args > env variables > ...vaults, airflow etc > secrets.toml > config.toml > default arg values - -Secrets are handled only by the providers supporting them. Some of the providers support only secrets (to reduce the number of requests done by `dlt` when searching sections) -1. `secrets.toml` and environment may hold both config and secret values -2. `config.toml` may hold only config values, no secrets -3. various vaults providers hold only secrets, `dlt` skips them when looking for values that are not secrets. - -⛔ Context aware providers will activate in right environments ie. on Airflow or AWS/GCP VMachines - -### Provider key formats. toml vs. environment variable - -Providers may use diffent formats for the keys. `dlt` will translate the standard format where sections and key names are separated by "." into the provider specific formats. - -1. for `toml` names are case sensitive and sections are separated with "." -2. for environment variables all names are capitalized and sections are separated with double underscore "__" - -Example: -When `dlt` evaluates the request `dlt.secrets["my_section.gcp_credentials"]` it must find the `private_key` for google credentials. It will look -1. first in env variable `MY_SECTION__GCP_CREDENTIALS__PRIVATE_KEY` and if not found -2. in `secrets.toml` with key `my_section.gcp_credentials.private_key` - - -### Environment provider -Looks for the values in the environment variables - -### Toml provider -Tomls provider uses two `toml` files: `secrets.toml` to store secrets and `config.toml` to store configuration values. The default `.gitignore` file prevents secrets from being added to source control and pushed. The `config.toml` may be freely added. 
- -**Toml provider always loads those files from `.dlt` folder** which is looked **relative to the current working directory**. Example: -if your working dir is `my_dlt_project` and you have: -``` -my_dlt_project: - | - pipelines/ - |---- .dlt/secrets.toml - |---- google_sheets.py -``` -in it and you run `python pipelines/google_sheets.py` then `dlt` will look for `secrets.toml` in `my_dlt_project/.dlt/secrets.toml` and ignore the existing `my_dlt_project/pipelines/.dlt/secrets.toml` - -if you change your working dir to `pipelines` and run `python google_sheets.py` it will look for `my_dlt_project/pipelines/.dlt/secrets.toml` a (probably) expected. - -*that was common problem on our workshop - but believe me all other layouts are even worse I've tried* - - -## Secret and config values layout. -`dlt` uses an layout of hierarchical sections to organize the config and secret values. This makes configurations and secrets easy to manage and disambiguates values with the same keys by placing them in the different sections - -> if you know how `toml` files are organized -> this is the same concept! - -> a lot of config values are dictionaries themselves (ie. most of the credentials) and you want the values corresponding to one component to be close together. - -> you can have a separate credentials for your destinations and each of source your pipeline uses, if you have many pipelines in single project, you can have a separate sections corresponding to them. - -Here is the simplest default layout for our `google_sheets` example. - -### OPTION A (default layout) - -**secrets.toml** -```toml -[credentials] -client_email = -private_key = -project_id = -``` -**config.toml** -```toml -tab_names=["tab1", "tab2"] -``` - -As you can see the details of gcp credentials are placed under `credentials` which is argument name to source function - -### OPTION B (explicit layout) - -Here user has full control over the layout - -**secrets.toml** -```toml -[my_section] - - [my_section.gcp_credentials] - client_email = - private_key = -``` -**config.toml** -```toml -[my_section] -tabs=["tab1", "tab2"] - - [my_section.gcp_credentials] - project_id = # I prefer to keep my project id in config file and private key in secrets -``` - -### Default layout and default key lookup during injection - -`dlt` arranges the sections into **default layout** that is used by injection mechanism. This layout makes it easy to configure simple cases but also provides a room for more explicit sections and complex cases ie. having several soures with different credentials or even hosting several pipelines in the same project sharing the same config and credentials. - -``` -pipeline_name - | - |-sources - |- - |- - |- {all source and resource options and secrets} - |- - |- {all source and resource options and secrets} - |- - |... - - |-extract - |- extract options for resources ie. parallelism settings, maybe retries - |-destination - |- - |- {destination options} - |-credentials - |-{credentials options} - |-schema - |- - |-schema settings: not implemented but I'll let people set nesting level, name convention, normalizer etc. here - |-load - |-normalize -``` - -Lookup rules: - -**Rule 1** All the sections above are optional. You are free to arrange your credentials and config without any additional sections -Example: OPTION A (default layout) - -**Rule 2** The lookup starts with the most specific possible path and if value is not found there, it removes the right-most section and tries again. 
-Example: In case of option A we have just one credentials. But what if `bigquery` credentials are different from `google sheets`? Then we need to allow some sections to separate them. - -```toml -# google sheet credentials -[credentials] -client_email = -private_key = -project_id = - -# bigquery credentials -[destination.credentials] -client_email = -private_key = -project_id = -``` -Now when `dlt` looks for destination credentials, it will encounter the `destination` section and stop there. -When looking for `sources` credentials it will get directly into `credentials` key (corresponding to function argument) - -> we could also rename the argument in the source function! but then we are **forcing** the user to have two copies of credentials. - -Example: let's be even more explicit and use full section path possible -```toml -# google sheet credentials -[sources.google_sheets.credentials] -client_email = -private_key = -project_id = - -# bigquery credentials -[destination.bigquery.credentials] -client_email = -private_key = -project_id = -``` -Where we add destination and source name to be very explicit. - -**Rule 3** You can use your pipeline name to have separate configurations for each pipeline in your project - -Pipeline created/obtained with `dlt.pipeline()` creates a global and optional namespace with the value of `pipeline_name`. All config values will be looked with pipeline name first and then again without it. - -Example: the pipeline is named `ML_sheets` -```toml -[ML_sheets.credentials] -client_email = -private_key = -project_id = -``` - -or maximum path: -```toml -[ML_sheets.sources.google_sheets.credentials] -client_email = -private_key = -project_id = -``` - -### The `sources` section -Config and secrets for decorated sources and resources are kept in `sources..` section. **All sections are optionsl**. For example if source module is named -`pipedrive` and the function decorated with `@dlt.source` is `deals(api_key: str=...)` then `dlt` will look for api key in: -1. `sources.pipedrive.deals.api_key` -2. `sources.pipedrive.api_key` -3. `sources.api_key` -4. `api_key` - -Step 2 in search path allows all the sources/resources in a module to share the same set of credentials. - -Also look at the following [test](/tests/extract/test_decorators.py) : `test_source_sections` - -## Understanding the exceptions -Now we can finally understand the `ConfigFieldMissingException`. Let's run `chess.py` example without providing the password: - -``` -$ CREDENTIALS="postgres://loader@localhost:5432/dlt_data" python chess.py -... -dlt.common.configuration.exceptions.ConfigFieldMissingException: Following fields are missing: ['password'] in configuration with spec PostgresCredentials - for field "password" config providers and keys were tried in following order: - In Environment Variables key CHESS_GAMES__DESTINATION__POSTGRES__CREDENTIALS__PASSWORD was not found. - In Environment Variables key CHESS_GAMES__DESTINATION__CREDENTIALS__PASSWORD was not found. - In Environment Variables key CHESS_GAMES__CREDENTIALS__PASSWORD was not found. - In secrets.toml key chess_games.destination.postgres.credentials.password was not found. - In secrets.toml key chess_games.destination.credentials.password was not found. - In secrets.toml key chess_games.credentials.password was not found. - In Environment Variables key DESTINATION__POSTGRES__CREDENTIALS__PASSWORD was not found. - In Environment Variables key DESTINATION__CREDENTIALS__PASSWORD was not found. 
- In Environment Variables key CREDENTIALS__PASSWORD was not found. - In secrets.toml key destination.postgres.credentials.password was not found. - In secrets.toml key destination.credentials.password was not found. - In secrets.toml key credentials.password was not found. -Please refer to https://dlthub.com/docs/general-usage/credentials for more information -``` - -It tells you exactly which paths `dlt` looked at, via which config providers and in which order. In the example above -1. First it looked in a big section `chess_games` which is name of the pipeline -2. In each case it starts with full paths and goes to minimum path `credentials.password` -3. First it looks into `environ` then in `secrets.toml`. It displays the exact keys tried. -4. Note that `config.toml` was skipped! It may not contain any secrets. - - -## Working with credentials (and other complex configuration values) - -`GcpServiceAccountCredentials` is an example of a **spec**: a Python `dataclass` that describes the configuration fields, their types and default values. It also allows to parse various native representations of the configuration. Credentials marked with `WithDefaults` mixin are also to instantiate itself from the machine/user default environment ie. googles `default()` or AWS `.aws/credentials`. - -As an example, let's use `ConnectionStringCredentials` which represents a database connection string. - -```python -@dlt.source -def query(sql: str, dsn: ConnectionStringCredentials = dlt.secrets.value): - ... -``` - -The source above executes the `sql` against database defined in `dsn`. `ConnectionStringCredentials` makes sure you get the correct values with correct types and understands the relevant native form of the credentials. - - -Example 1: use the dictionary form -```toml -[dsn] -database="dlt_data" -password="loader" -username="loader" -host="localhost" -``` - -Example:2: use the native form -```toml -dsn="postgres://loader:loader@localhost:5432/dlt_data" -``` - -Example 3: use mixed form: the password is missing in explicit dsn and will be taken from the `secrets.toml` -```toml -dsn.password="loader -``` -```python -query("SELECT * FROM customers", "postgres://loader@localhost:5432/dlt_data") -# or -query("SELECT * FROM customers", {"database": "dlt_data", "username": "loader"...}) -``` - -☮️ We will implement more credentials and let people reuse them when writing pipelines: -- to represent oauth credentials -- api key + api secret -- AWS credentials - - -### Working with alternatives of credentials (Union types) -If your source/resource allows for many authentication methods you can support those seamlessly for your user. The user just passes the right credentials and `dlt` will inject the right type into your decorated function. - -Example: - -> read the whole [test](/tests/common/configuration/test_spec_union.py), it shows how to create unions of credentials that derive from the common class so you can handle it seamlessly in your code. 
- -```python -@dlt.source -def zen_source(credentials: Union[ZenApiKeyCredentials, ZenEmailCredentials, str] = dlt.secrets.value, some_option: bool = False): - # depending on what the user provides in config, ZenApiKeyCredentials or ZenEmailCredentials will be injected in `credentials` argument - # both classes implement `auth` so you can always call it - credentials.auth() - return dlt.resource([credentials], name="credentials") - -# pass native value -os.environ["CREDENTIALS"] = "email:mx:pwd" -assert list(zen_source())[0].email == "mx" - -# pass explicit native value -assert list(zen_source("secret:🔑:secret"))[0].api_secret == "secret" - -# pass explicit dict -assert list(zen_source(credentials={"email": "emx", "password": "pass"}))[0].email == "emx" - -``` -> This applies not only to credentials but to all specs (see next chapter) - -## Writing own specs - -**specs** let you tak full control over the function arguments: -- which values should be injected, the types, default values. -- you can specify optional and final fields -- form hierarchical configurations (specs in specs). -- provide own handlers for `on_error` or `on_resolved` -- provide own native value parsers -- provide own default credentials logic -- adds all Python dataclass goodies to it -- adds all Python `dict` goodies to it (`specs` instances can be created from dicts and serialized from dicts) - -This is used a lot in the `dlt` core and may become useful for complicated sources. - -In fact for each decorated function a spec is synthesized. In case of `google_sheets` following class is created. -```python -@configspec -class GoogleSheetsConfiguration: - tab_names: List[str] = None # manadatory - credentials: GcpServiceAccountCredentials = None # mandatory secret - only_strings: Optional[bool] = False -``` - -> all specs derive from [BaseConfiguration](/dlt/common/configuration/specs//base_configuration.py) - -> all credentials derive from [CredentialsConfiguration](/dlt/common/configuration/specs//base_configuration.py) - -> Read the docstrings in the code above - -## Interesting / Advanced stuff. - -The approach above makes configs and secrets explicit and autogenerates required lookups. It lets me for example **generate deployments** and **code templates for pipeline scripts** automatically because I know what are the config parameters and I have total control over users code and final values via the decorator. diff --git a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md index 8c626266a4..fe7dafc243 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/weaviate.md +++ b/docs/website/docs/dlt-ecosystem/destinations/weaviate.md @@ -252,7 +252,7 @@ it will be normalized to: so your best course of action is to clean up the data yourself before loading and use default naming convention. 
Nevertheless you can configure the alternative in `config.toml`: ```toml [schema] -naming="dlt.destinations.weaviate.ci_naming" +naming="dlt.destinations.weaviate.impl.ci_naming" ``` ## Additional destination options diff --git a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py index 2d674407bc..4c3d3f0b3a 100644 --- a/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py +++ b/docs/website/docs/examples/incremental_loading/code/zendesk-snippets.py @@ -10,13 +10,12 @@ def incremental_snippet() -> None: # @@@DLT_SNIPPET_START example # @@@DLT_SNIPPET_START markdown_source - from typing import Iterator, Optional, Dict, Any, Tuple + from typing import Optional, Dict, Any, Tuple import dlt from dlt.common import pendulum from dlt.common.time import ensure_pendulum_datetime - from dlt.common.typing import TDataItem, TDataItems, TAnyDateTime - from dlt.extract.source import DltResource + from dlt.common.typing import TAnyDateTime from dlt.sources.helpers.requests import client diff --git a/docs/website/docs/general-usage/credentials/config_providers.md b/docs/website/docs/general-usage/credentials/config_providers.md index b3da2979a9..1edd6a6e9a 100644 --- a/docs/website/docs/general-usage/credentials/config_providers.md +++ b/docs/website/docs/general-usage/credentials/config_providers.md @@ -1,6 +1,6 @@ --- title: Configuration Providers -description: Configuration dlt Providers +description: Where dlt looks for config/secrets and in which order. keywords: [credentials, secrets.toml, secrets, config, configuration, environment variables, provider] --- diff --git a/docs/website/docs/general-usage/credentials/config_specs.md b/docs/website/docs/general-usage/credentials/config_specs.md index 328d18d2a0..07e56b3e14 100644 --- a/docs/website/docs/general-usage/credentials/config_specs.md +++ b/docs/website/docs/general-usage/credentials/config_specs.md @@ -1,6 +1,6 @@ --- title: Configuration Specs -description: Overview configuration specs and how to create custom specs +description: How to specify complex custom configurations keywords: [credentials, secrets.toml, secrets, config, configuration, environment variables, specs] --- diff --git a/docs/website/docs/general-usage/credentials/configuration.md b/docs/website/docs/general-usage/credentials/configuration.md index a92fb6fd0c..4cb3e17468 100644 --- a/docs/website/docs/general-usage/credentials/configuration.md +++ b/docs/website/docs/general-usage/credentials/configuration.md @@ -1,6 +1,6 @@ --- title: Secrets and Configs -description: Overview secrets and configs +description: What are secrets and configs and how sources and destinations read them. keywords: [credentials, secrets.toml, secrets, config, configuration, environment variables] --- @@ -11,7 +11,7 @@ Secrets and configs are two types of sensitive and non-sensitive information use 1. **Configs**: - Configs refer to non-sensitive configuration data. These are settings, parameters, or options that define the behavior of a data pipeline. - - They can include things like file paths, database connection strings, API endpoints, or any other settings that affect the pipeline's behavior. + - They can include things like file paths, database hosts and timeouts, API endpoints, or any other settings that affect the pipeline's behavior. 2. 
**Secrets**: - Secrets are sensitive information that should be kept confidential, such as passwords, API keys, private keys, and other confidential data. - It's crucial to never hard-code secrets directly into the code, as it can pose a security risk. Instead, they should be stored securely and accessed via a secure mechanism. @@ -210,6 +210,15 @@ You can pass destination credentials and ignore the default lookup: pipeline = dlt.pipeline(destination="postgres", credentials=dlt.secrets["postgres_dsn"]) ``` +:::note +**dlt.config** and **dlt.secrets** can also be used as setters. For example: +```python +dlt.config["sheet_id"] = "23029402349032049" +dlt.secrets["destination.postgres.credentials"] = BaseHook.get_connection('postgres_dsn').extra +``` +This mocks the **toml** provider with the desired values. +::: + ## Injection mechanism Config and secret values are injected to the function arguments if the function is decorated with diff --git a/docs/website/docs/general-usage/data-contracts.md b/docs/website/docs/general-usage/data-contracts.md new file mode 100644 index 0000000000..543edf2502 --- /dev/null +++ b/docs/website/docs/general-usage/data-contracts.md @@ -0,0 +1,81 @@ +--- +title: Data Contracts +description: Data contracts and controlling schema evolution +keywords: [data contracts, schema, dlt schema, pydantic] +--- + +## Data contracts and controlling schema evolution + +`dlt` will evolve the schema of the destination to accommodate the structure and data types of the extracted data. There are several settings +that you can use to control this automatic schema evolution, from the default settings where all changes to the schema are accepted to +a frozen schema that does not change at all. + +Consider this example: + +```py +@dlt.resource(schema_contract={"tables": "evolve", "columns": "freeze"}) +def items(): + ... +``` + +This resource will allow new subtables to be created, but will throw an exception if data is extracted for an existing table which +contains a new column. + +### Possible settings + +The `schema_contract` exists on the `source` decorator as a directive for all resources of that source and on the +`resource` decorator as a directive for the individual resource. Additionally, it exists on the `pipeline.run()` method, which will override all existing settings. +The `schema_contract` is a dictionary with keys that control the following: + +* `tables` the creation of new tables and subtables +* `columns` the creation of new columns on an existing table +* `data_type` the creation of new variant columns, which happens if a different data type is discovered in the extracted data than exists in the schema + +Each property can be set to one of three values: +* `freeze`: This will raise an exception if data is encountered that does not fit the existing schema, so no data will be loaded to the destination +* `discard_row`: This will discard any extracted row if it does not adhere to the existing schema, and this row will not be loaded to the destination. All other rows will be. +* `discard_value`: This will discard data in an extracted row that does not adhere to the existing schema and the row will be loaded without this data. +
+If a table is new and has not been created on the destination yet, `dlt` will allow the creation of all columns and variants on the first run, as the sketch below illustrates.
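+
+For illustration, here is a minimal sketch of a frozen `columns` contract tripping on a second run (the pipeline name and the `duckdb` destination are just assumptions for this sketch):
+
+```py
+import dlt
+
+@dlt.resource(schema_contract={"columns": "freeze"})
+def items():
+    # first run: the table does not exist yet, so all columns are accepted
+    yield {"id": 1, "name": "widget"}
+
+pipeline = dlt.pipeline(pipeline_name="contracts_demo", destination="duckdb")
+pipeline.run(items())
+
+@dlt.resource(name="items", schema_contract={"columns": "freeze"})
+def items_with_new_column():
+    # "color" does not exist in the schema created by the first run
+    yield {"id": 2, "name": "gadget", "color": "red"}
+
+# the frozen columns contract rejects the new column: this run raises and no data is loaded
+pipeline.run(items_with_new_column())
+```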
+ +### Code Examples + +The below code will silently ignore new subtables, allow new columns to be added to existing tables and raise an error if a variant of a column is discovered. + +```py +@dlt.resource(schema_contract={"tables": "discard_row", "columns": "evolve", "data_type": "freeze"}) +def items(): + ... +``` + +The below code will raise on any encountered schema change. Note: you can always set a single string, which will be interpreted as though all keys are set to that value. + +```py +pipeline.run(my_source(), schema_contract="freeze") +``` + +The below code defines some settings on the source, which can be overwritten on the resource, which in turn can be overwritten by the global override on the `run` method. +Here, variant columns are frozen for all resources and raise an error if encountered. On `items` new columns are allowed, but `other_items` inherits the `freeze` setting from +the source, so new columns are frozen there. New tables are allowed. + +```py +@dlt.resource(schema_contract={"columns": "evolve"}) +def items(): + ... + +@dlt.resource() +def other_items(): + ... + +@dlt.source(schema_contract={"columns": "freeze", "data_type": "freeze"}) +def source(): + return [items(), other_items()] + + +# this will use the settings defined by the decorators +pipeline.run(source()) + +# this will freeze the whole schema, regardless of the decorator settings +pipeline.run(source(), schema_contract="freeze") + +``` \ No newline at end of file diff --git a/docs/website/docs/general-usage/resource.md b/docs/website/docs/general-usage/resource.md index e203b3d93a..a7f68fadd1 100644 --- a/docs/website/docs/general-usage/resource.md +++ b/docs/website/docs/general-usage/resource.md @@ -110,21 +110,27 @@ Things to note: - Fields with an `Optional` type are marked as `nullable` - Fields with a `Union` type are converted to the first (not `None`) type listed in the union. E.g. `status: Union[int, str]` results in a `bigint` column. -- `list`, `dict` and nested pydantic model fields will use the `complex` type which means they'll be stored as a JSON object in the database instead of creating child tables. You can override this by manually calling the pydantic helper with `skip_complex_types=True`, see below: +- `list`, `dict` and nested pydantic model fields will use the `complex` type which means they'll be stored as a JSON object in the database instead of creating child tables. + +You can override this by configuring the Pydantic model: ```python -from dlt.common.lib.pydantic import pydantic_to_table_schema_columns +from typing import ClassVar +from dlt.common.libs.pydantic import DltConfig -... +class UserWithNesting(User): + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} -@dlt.resource(name="user", columns=pydantic_to_table_schema_columns(User, skip_complex_types=True)) +@dlt.resource(name="user", columns=UserWithNesting) def get_users(): ... ``` -This omits any `dict`/`list`/`BaseModel` type fields from the schema, so dlt will fall back on the default +`"skip_complex_types"` omits any `dict`/`list`/`BaseModel` type fields from the schema, so dlt will fall back on the default behaviour of creating child tables for these fields. +We do not support `RootModel` that validates simple types. You can add such a validator yourself; see the [data filtering section](#filter-transform-and-pivot-data) and the sketch below. +
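+For illustration, a minimal sketch of such a manual validation step, assuming pydantic v2 (`RootModel`) and the `add_map` transform from the data filtering section (the resource and field names are hypothetical):
+
+```python
+import dlt
+from pydantic import RootModel
+
+# a RootModel wrapping a simple type cannot be passed via `columns`,
+# but it can be applied to each item as a transform step
+Age = RootModel[int]
+
+def validate_age(user):
+    user["age"] = Age.model_validate(user["age"]).root
+    return user
+
+@dlt.resource(name="user")
+def raw_users():
+    yield {"id": 1, "age": 30}
+
+# attach the validator; invalid items will raise a pydantic ValidationError
+validated_users = raw_users().add_map(validate_age)
+```
+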
### Dispatch data to many tables You can load data to many tables from a single resource. The most common case is a stream of events diff --git a/docs/website/docs/getting-started-snippets.py b/docs/website/docs/getting-started-snippets.py index c4bd789834..be21a7f757 100644 --- a/docs/website/docs/getting-started-snippets.py +++ b/docs/website/docs/getting-started-snippets.py @@ -290,7 +290,7 @@ def pdf_to_weaviate_snippet() -> None: import os import dlt - from dlt.destinations.weaviate import weaviate_adapter + from dlt.destinations.impl.weaviate import weaviate_adapter from PyPDF2 import PdfReader diff --git a/docs/website/docs/reference/command-line-interface.md b/docs/website/docs/reference/command-line-interface.md index a2516a41de..d774d5faa6 100644 --- a/docs/website/docs/reference/command-line-interface.md +++ b/docs/website/docs/reference/command-line-interface.md @@ -236,3 +236,19 @@ dlt pipeline --list-pipelines This command lists all the pipelines executed on the local machine with their working data in the default pipelines folder. + +### Drop pending and partially loaded packages +```sh +dlt pipeline <pipeline name> drop-pending-packages +``` +Removes all extracted and normalized packages in the pipeline's working directory. +`dlt` keeps extracted and normalized load packages in the pipeline working directory. When the `run` method is called, it will attempt to normalize and load +pending packages first. The command above removes such packages. Note that **pipeline state** is not reverted to the state at which the deleted packages +were created. Using `dlt pipeline ... sync` is recommended if your destination supports state sync. + + +## Show stack traces +If a command fails and you want to see the full stack trace, add `--debug` just after the `dlt` executable. +```sh +dlt --debug pipeline github info +``` diff --git a/docs/website/docs/reference/tracing.md b/docs/website/docs/reference/tracing.md new file mode 100644 index 0000000000..0ad0a59912 --- /dev/null +++ b/docs/website/docs/reference/tracing.md @@ -0,0 +1,6 @@ +1. Identifiers + +2. Data Lineage + +3.
Schema Lineage + diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 0dc7416caa..9ae94a8514 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -106,6 +106,7 @@ const sidebars = { 'general-usage/incremental-loading', 'general-usage/full-loading', 'general-usage/schema', + 'general-usage/data-contracts', { type: 'category', label: 'Configuration', diff --git a/poetry.lock b/poetry.lock index e925740fb7..018c1357fe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -136,6 +136,17 @@ python-versions = ">=3.7, <4" about-time = "4.2.1" grapheme = "0.6.0" +[[package]] +name = "annotated-types" +version = "0.6.0" +description = "Reusable constraint types to use with typing.Annotated" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.9\""} + [[package]] name = "ansicon" version = "1.89.0" @@ -164,7 +175,7 @@ trio = ["trio (>=0.22)"] [[package]] name = "apache-airflow" -version = "2.7.0" +version = "2.7.2" description = "Programmatically author, schedule and monitor data pipelines" category = "dev" optional = false @@ -191,7 +202,7 @@ cryptography = ">=0.9.3" deprecated = ">=1.2.13" dill = ">=0.2.2" flask = ">=2.2,<2.3" -flask-appbuilder = "4.3.3" +flask-appbuilder = "4.3.6" flask-caching = ">=1.5.0" flask-login = ">=0.6.2" flask-session = ">=0.4.0" @@ -200,7 +211,7 @@ google-re2 = ">=1.0" graphviz = ">=0.12" gunicorn = ">=20.1.0" httpx = "*" -importlib-metadata = {version = ">=1.7,<5.0.0", markers = "python_version < \"3.9\""} +importlib-metadata = {version = ">=1.7", markers = "python_version < \"3.9\""} importlib-resources = {version = ">=5.2", markers = "python_version < \"3.9\""} itsdangerous = ">=2.0" jinja2 = ">=3.0.0" @@ -213,14 +224,14 @@ markdown-it-py = ">=2.1.0" markupsafe = ">=1.1.1" marshmallow-oneofschema = ">=2.0.1" mdit-py-plugins = ">=0.3.0" -opentelemetry-api = "1.15.0" +opentelemetry-api = ">=1.15.0" opentelemetry-exporter-otlp = "*" packaging = ">=14.0" pathspec = ">=0.9.0" pendulum = ">=2.0" pluggy = ">=1.0" psutil = ">=4.2.0" -pydantic = ">=1.10.0,<2.0.0" +pydantic = ">=1.10.0" pygments = ">=2.0.1" pyjwt = ">=2.0.0" python-daemon = ">=3.0.0" @@ -231,7 +242,7 @@ rfc3339-validator = ">=0.1.4" rich = ">=12.4.4" rich-argparse = ">=1.0.0" setproctitle = ">=1.1.8" -sqlalchemy = ">=1.4,<2.0" +sqlalchemy = ">=1.4.28,<2.0" sqlalchemy-jsonfield = ">=1.0" tabulate = ">=0.7.5" tenacity = ">=6.2.0,<8.2.0 || >8.2.0" @@ -244,8 +255,8 @@ werkzeug = ">=2.0" aiobotocore = ["aiobotocore (>=2.1.1)"] airbyte = ["apache-airflow-providers-airbyte"] alibaba = ["apache-airflow-providers-alibaba"] -all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", 
"apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.24.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", 
"databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>7,<7.15.0)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "sasl (>=0.3.1)", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol 
(>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "zenpy (>=2.0.24)"] -all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.5.0)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.2)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive] (>=0.6.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sasl (>=0.3.1)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] +all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", 
"apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "elasticsearch (>8,<9)", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 
(>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "plyvel", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-toolbelt", "scrapbook[all]", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "zenpy (>=2.0.24)"] +all-dbs = ["aiohttp (>=3.6.3,<4)", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-hdfs", 
"apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-arangodb", "apache-airflow-providers-cloudant", "apache-airflow-providers-common-sql (>=1.3.1)", "apache-airflow-providers-common-sql (>=1.5.0)", "apache-airflow-providers-databricks", "apache-airflow-providers-exasol", "apache-airflow-providers-influxdb", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "cassandra-driver (>=3.13.0)", "cloudant (>=2.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "dnspython (>=1.13.0)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "pandas (>=0.17.1)", "pinotdb (>0.4.7)", "presto-python-client (>=0.8.2)", "psycopg2-binary (>=2.8.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pyhive[hive-pure-sasl] (>=0.7.0)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "python-arango (>=7.3.2)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "sqlalchemy-drill (>=1.1.0)", "thrift (>=0.9.2)", "trino (>=0.318.0)", "vertica-python (>=0.5.1)"] amazon = ["apache-airflow-providers-amazon"] apache-atlas = ["atlasclient (>=0.1.2)"] apache-beam = ["apache-airflow-providers-apache-beam"] @@ -273,7 +284,7 @@ atlassian-jira = ["apache-airflow-providers-atlassian-jira"] aws = ["apache-airflow-providers-amazon"] azure = ["apache-airflow-providers-microsoft-azure"] cassandra = ["apache-airflow-providers-apache-cassandra"] -celery = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-celery", "celery (>=5.2.3,<6)", "flower (>=1.0.0)"] +celery = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-celery", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "flower (>=1.0.0)"] cgroups = ["cgroupspy (>=0.2.2)"] cloudant = ["apache-airflow-providers-cloudant"] cncf-kubernetes = ["apache-airflow (>=2.4.0)", "apache-airflow-providers-cncf-kubernetes", "asgiref (>=3.5.2)", "cryptography (>=2.0.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)"] @@ -284,13 +295,13 @@ databricks = ["apache-airflow-providers-databricks"] datadog = ["apache-airflow-providers-datadog"] dbt-cloud = ["apache-airflow-providers-dbt-cloud"] deprecated-api = ["requests (>=2.26.0)"] -devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3)", "aws-xray-sdk", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "bowler", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "ipdb", "jira", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "paramiko", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pygithub", "pypsrp", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-jose", "pywinrm", "qds-sdk (>=1.9.6)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", 
"sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] -devel-all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", 
"apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.24.0)", "bowler", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>7,<7.15.0)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence 
(>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "jira", "json-merge-patch (>=0.2)", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-jose", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "qds-sdk (>=1.9.6)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 
(>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-qubole", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", 
"azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.24.0)", "bowler", "cassandra-driver (>=3.13.0)", "celery (>=5.2.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>7,<7.15.0)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.3)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "jira", "json-merge-patch (>=0.2)", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.24.0,<1.28.12)", "mypy-boto3-rds (>=1.24.0)", "mypy-boto3-redshift-data (>=1.24.0)", "mypy-boto3-s3 (>=1.24.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openlineage-integration-common (>=0.28.0)", 
"openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-jose", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "qds-sdk (>=1.10.4)", "qds-sdk (>=1.9.6)", "redis (>=3.2.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] -devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3)", "aws-xray-sdk", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "bowler", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "jira", "jsondiff", "jsonpath-ng (>=1.5.3)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[cloudformation,glue] (>=4.0)", "mypy (==1.2.0)", "mysqlclient (>=1.3.6)", "pandas (>=0.17.1)", "paramiko", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.2)", "pyarrow (>=9.0.0)", "pygithub", 
"pyhive[hive] (>=0.6.0)", "pykerberos (>=1.1.13)", "pypsrp", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-jose", "pywinrm", "qds-sdk (>=1.9.6)", "requests-kerberos (>=0.10.0)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "sasl (>=0.3.1)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-boto", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-common-sql", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "ipdb", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "openapi-spec-validator (>=0.2.8)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "pyarrow (>=9.0.0)", "pygithub", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] +devel-all = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", 
"apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", "apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt 
(>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", "gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openapi-spec-validator (>=0.2.8)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid 
(>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk (>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] +devel-ci = ["PyGithub (!=1.58)", "PyOpenSSL", "adal (>=1.2.7)", "aiobotocore (>=2.1.1)", "aiohttp", "aiohttp (>=3.6.3,<4)", "aioresponses", "alibabacloud-adb20211201 (>=1.0.0)", "alibabacloud-tea-openapi (>=0.3.7)", "amqp", "analytics-python (>=1.2.9)", "apache-airflow (>=2.4.0)", "apache-airflow (>=2.7.0)", "apache-airflow-providers-airbyte", "apache-airflow-providers-alibaba", "apache-airflow-providers-amazon", "apache-airflow-providers-apache-beam", "apache-airflow-providers-apache-cassandra", "apache-airflow-providers-apache-drill", "apache-airflow-providers-apache-druid", "apache-airflow-providers-apache-flink", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-apache-impala", "apache-airflow-providers-apache-kafka", "apache-airflow-providers-apache-kylin", "apache-airflow-providers-apache-livy", "apache-airflow-providers-apache-pig", "apache-airflow-providers-apache-pinot", "apache-airflow-providers-apache-spark", "apache-airflow-providers-apache-sqoop", "apache-airflow-providers-apprise", "apache-airflow-providers-arangodb", "apache-airflow-providers-asana", "apache-airflow-providers-atlassian-jira", "apache-airflow-providers-celery", "apache-airflow-providers-cloudant", "apache-airflow-providers-cncf-kubernetes", "apache-airflow-providers-common-sql", "apache-airflow-providers-daskexecutor", "apache-airflow-providers-databricks", "apache-airflow-providers-datadog", "apache-airflow-providers-dbt-cloud", 
"apache-airflow-providers-dingding", "apache-airflow-providers-discord", "apache-airflow-providers-docker", "apache-airflow-providers-elasticsearch", "apache-airflow-providers-exasol", "apache-airflow-providers-facebook", "apache-airflow-providers-ftp", "apache-airflow-providers-github", "apache-airflow-providers-google", "apache-airflow-providers-grpc", "apache-airflow-providers-hashicorp", "apache-airflow-providers-http", "apache-airflow-providers-imap", "apache-airflow-providers-influxdb", "apache-airflow-providers-jdbc", "apache-airflow-providers-jenkins", "apache-airflow-providers-microsoft-azure", "apache-airflow-providers-microsoft-mssql", "apache-airflow-providers-microsoft-psrp", "apache-airflow-providers-microsoft-winrm", "apache-airflow-providers-mongo", "apache-airflow-providers-mysql", "apache-airflow-providers-neo4j", "apache-airflow-providers-odbc", "apache-airflow-providers-openfaas", "apache-airflow-providers-openlineage", "apache-airflow-providers-opsgenie", "apache-airflow-providers-oracle", "apache-airflow-providers-pagerduty", "apache-airflow-providers-papermill", "apache-airflow-providers-plexus", "apache-airflow-providers-postgres", "apache-airflow-providers-presto", "apache-airflow-providers-redis", "apache-airflow-providers-salesforce", "apache-airflow-providers-samba", "apache-airflow-providers-segment", "apache-airflow-providers-sendgrid", "apache-airflow-providers-sftp", "apache-airflow-providers-singularity", "apache-airflow-providers-slack", "apache-airflow-providers-smtp", "apache-airflow-providers-snowflake", "apache-airflow-providers-sqlite", "apache-airflow-providers-ssh", "apache-airflow-providers-tableau", "apache-airflow-providers-tabular", "apache-airflow-providers-telegram", "apache-airflow-providers-trino", "apache-airflow-providers-vertica", "apache-airflow-providers-zendesk", "apache-beam (>=2.47.0)", "apprise", "arrow (>=0.16.0)", "asana (>=0.10,<4.0.0)", "asgiref", "asgiref (>=3.5.2)", "astroid (>=2.12.3,<3.0)", "atlasclient (>=0.1.2)", "atlassian-python-api (>=1.14.2)", "attrs (>=22.2)", "authlib (>=1.0.0)", "aws-xray-sdk", "azure-batch (>=8.0.0)", "azure-cosmos (>=4.0.0)", "azure-datalake-store (>=0.0.45)", "azure-identity (>=1.3.1)", "azure-keyvault-secrets (>=4.1.0)", "azure-kusto-data (>=0.0.43,<0.1)", "azure-mgmt-containerinstance (>=1.5.0,<2.0)", "azure-mgmt-datafactory (>=1.0.0,<2.0)", "azure-mgmt-datalake-store (>=0.5.0)", "azure-mgmt-resource (>=2.2.0)", "azure-servicebus (>=7.6.1)", "azure-storage-blob (>=12.14.0)", "azure-storage-common (>=2.1.0)", "azure-storage-file (>=2.1.0)", "azure-storage-file-datalake (>=12.9.1)", "azure-synapse-spark", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "blinker (>=1.1)", "boto3 (>=1.28.0)", "botocore (>=1.31.0)", "cassandra-driver (>=3.13.0)", "celery (>=5.3.0,!=5.3.2,!=5.3.3,<6)", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "cloudant (>=2.0)", "cloudpickle (>=1.4.1)", "confluent-kafka (>=1.8.2)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "dask (>=2.9.0,!=2022.10.1,!=2023.5.0)", "databricks-sql-connector (>=2.0.0,<3.0.0)", "datadog (>=0.14.0)", "distributed (>=2.11.1,!=2023.5.0)", "dnspython (>=1.13.0)", "docker (>=5.0.3)", "docutils (<0.17.0)", "elasticsearch (>8,<9)", "eralchemy2", "eventlet (>=0.33.3)", "facebook-business (>=6.0.2)", "filelock", "flask-appbuilder[oauth] (==4.3.6)", "flask-bcrypt (>=0.7.1)", "flower (>=1.0.0)", "gcloud-aio-auth (>=4.0.0,<5.0.0)", "gcloud-aio-bigquery (>=6.1.2)", 
"gcloud-aio-storage", "gevent (>=0.13)", "gitpython", "google-ads (>=21.2.0)", "google-api-core (>=2.11.0)", "google-api-python-client (>=1.6.0)", "google-auth (>=1.0.0)", "google-auth (>=1.0.0,<3.0.0)", "google-auth-httplib2 (>=0.0.1)", "google-cloud-aiplatform (>=1.22.1)", "google-cloud-automl (>=2.11.0)", "google-cloud-bigquery-datatransfer (>=3.11.0)", "google-cloud-bigtable (>=2.17.0)", "google-cloud-build (>=3.13.0)", "google-cloud-compute (>=1.10.0)", "google-cloud-container (>=2.17.4)", "google-cloud-datacatalog (>=3.11.1)", "google-cloud-dataflow-client (>=0.8.2)", "google-cloud-dataform (>=0.5.0)", "google-cloud-dataplex (>=1.4.2)", "google-cloud-dataproc (>=5.4.0)", "google-cloud-dataproc-metastore (>=1.12.0)", "google-cloud-dlp (>=3.12.0)", "google-cloud-kms (>=2.15.0)", "google-cloud-language (>=2.9.0)", "google-cloud-logging (>=3.5.0)", "google-cloud-memcache (>=1.7.0)", "google-cloud-monitoring (>=2.14.1)", "google-cloud-orchestration-airflow (>=1.7.0)", "google-cloud-os-login (>=2.9.1)", "google-cloud-pubsub (>=2.15.0)", "google-cloud-redis (>=2.12.0)", "google-cloud-secret-manager (>=2.16.0)", "google-cloud-spanner (>=3.11.1)", "google-cloud-speech (>=2.18.0)", "google-cloud-storage (>=2.7.0)", "google-cloud-storage-transfer (>=1.4.1)", "google-cloud-tasks (>=2.13.0)", "google-cloud-texttospeech (>=2.14.1)", "google-cloud-translate (>=3.11.0)", "google-cloud-videointelligence (>=2.11.0)", "google-cloud-vision (>=3.4.0)", "google-cloud-workflows (>=1.10.0)", "greenlet (>=0.4.9)", "grpcio (>=1.15.0)", "grpcio-gcp (>=0.2.2)", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "httpx", "hvac (>=0.10)", "impyla (>=0.18.0,<1.0)", "influxdb-client (>=1.19.0)", "ipdb", "jaydebeapi (>=1.1.1)", "json-merge-patch (>=0.2)", "jsonpath-ng (>=1.5.3)", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "kubernetes-asyncio (>=18.20.1,<25)", "kylinpy (>=2.6)", "ldap3 (>=2.5.1)", "looker-sdk (>=22.2.0)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "neo4j (>=4.2.1)", "openapi-spec-validator (>=0.2.8)", "openlineage-integration-common (>=0.28.0)", "openlineage-python (>=0.28.0)", "opentelemetry-exporter-prometheus", "opsgenie-sdk (>=2.1.5)", "oracledb (>=1.0.0)", "oss2 (>=2.14.0)", "pandas (>=0.17.1)", "pandas-gbq", "papermill[all] (>=1.2.1)", "paramiko (>=2.6.0)", "pdpyras (>=4.1.2)", "pinotdb (>0.4.7)", "pipdeptree", "plyvel", "pre-commit", "presto-python-client (>=0.8.2)", "proto-plus (>=1.19.6)", "psycopg2-binary (>=2.8.0)", "pyarrow (>=9.0.0)", "pydruid (>=0.4.1)", "pyexasol (>=0.5.1)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pymongo (>=3.6.0)", "pymssql (>=2.1.5)", "pyodbc", "pypsrp (>=0.8.0)", "pyspark", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "python-arango (>=7.3.2)", "python-dotenv (>=0.21.0)", "python-jenkins (>=1.0.0)", "python-ldap", "python-telegram-bot (>=20.0.0)", "pywinrm", "pywinrm (>=0.4)", "redis (>=4.5.2,!=4.5.5,<5.0.0)", "redshift-connector (>=2.0.888)", "requests (>=2.26.0)", "requests (>=2.27,<3)", "requests-kerberos (>=0.10.0)", "requests-mock", "requests-toolbelt", "rich-click (>=1.5)", "ruff (>=0.0.219)", "scrapbook[all]", "semver", "sendgrid (>=6.0.0)", "sentry-sdk (>=0.8.0)", "simple-salesforce (>=1.0.0)", "slack-sdk 
(>=3.0.0)", "smbprotocol (>=1.5.0)", "snowflake-connector-python (>=2.4.1)", "snowflake-sqlalchemy (>=1.1.0)", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "spython (>=0.0.56)", "sqlalchemy-bigquery (>=1.2.1)", "sqlalchemy-drill (>=1.1.0)", "sqlalchemy-redshift (>=0.8.6)", "sqlalchemy-spanner (>=1.6.2)", "sqlparse (>=0.4.2)", "sshtunnel (>=0.3.2)", "statsd (>=3.3.0)", "tableauserverclient", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "trino (>=0.318.0)", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "vertica-python (>=0.5.1)", "virtualenv", "watchtower (>=2.0.1,<2.1.0)", "wheel", "yamllint", "zenpy (>=2.0.24)"] +devel-hadoop = ["aiobotocore (>=2.1.1)", "aioresponses", "apache-airflow (>=2.4.0)", "apache-airflow-providers-apache-hdfs", "apache-airflow-providers-apache-hive", "apache-airflow-providers-common-sql", "apache-airflow-providers-presto", "apache-airflow-providers-trino", "astroid (>=2.12.3,<3.0)", "aws-xray-sdk", "backports.zoneinfo (>=0.2.1)", "bcrypt (>=2.0.0)", "beautifulsoup4 (>=4.7.1)", "black", "blinker", "cgroupspy (>=0.2.2)", "checksumdir", "click (>=8.0)", "click (>=8.0,!=8.1.4,!=8.1.5)", "coverage (>=7.2)", "cryptography (>=2.0.0)", "docutils (<0.17.0)", "eralchemy2", "filelock", "flask-bcrypt (>=0.7.1)", "gitpython", "hdfs[avro,dataframe,kerberos] (>=2.0.4)", "hmsclient (>=0.1.0)", "impyla (>=0.18.0,<1.0)", "ipdb", "jsonschema (>=3.0)", "kubernetes (>=21.7.0,<24)", "mongomock", "moto[glue] (>=4.0)", "mypy (==1.2.0)", "mypy-boto3-appflow (>=1.28.0)", "mypy-boto3-rds (>=1.28.0)", "mypy-boto3-redshift-data (>=1.28.0)", "mypy-boto3-s3 (>=1.28.0)", "mysqlclient (>=1.3.6)", "openapi-spec-validator (>=0.2.8)", "pandas (>=0.17.1)", "pipdeptree", "pre-commit", "presto-python-client (>=0.8.2)", "pyarrow (>=9.0.0)", "pygithub", "pyhive[hive-pure-sasl] (>=0.7.0)", "pykerberos (>=1.1.13)", "pytest", "pytest-asyncio", "pytest-capture-warnings", "pytest-cov", "pytest-httpx", "pytest-instafail", "pytest-mock", "pytest-rerunfailures", "pytest-timeouts", "pytest-xdist", "pywinrm", "requests-kerberos (>=0.10.0)", "requests-mock", "rich-click (>=1.5)", "ruff (>=0.0.219)", "semver", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)", "thrift (>=0.9.2)", "thrift-sasl (>=0.2.0)", "time-machine", "towncrier", "twine", "types-Deprecated", "types-Markdown", "types-PyMySQL", "types-PyYAML", "types-certifi", "types-croniter", "types-docutils", "types-paramiko", "types-protobuf", "types-python-dateutil", "types-python-slugify", "types-pytz", "types-redis", "types-requests", "types-setuptools", "types-tabulate", "types-termcolor", "types-toml", "wheel", "yamllint"] dingding = ["apache-airflow-providers-dingding"] discord = ["apache-airflow-providers-discord"] -doc = ["astroid (>=2.12.3)", "checksumdir", "click 
(>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] +doc = ["astroid (>=2.12.3,<3.0)", "checksumdir", "click (>=8.0,!=8.1.4,!=8.1.5)", "docutils (<0.17.0)", "eralchemy2", "sphinx (>=5.2.0)", "sphinx-airflow-theme", "sphinx-argparse (>=0.1.13)", "sphinx-autoapi (>=2.0.0)", "sphinx-copybutton", "sphinx-jinja (>=2.0)", "sphinx-rtd-theme (>=0.1.6)", "sphinxcontrib-httpdomain (>=1.7.0)", "sphinxcontrib-redoc (>=1.6.0)", "sphinxcontrib-spelling (>=7.3)"] doc-gen = ["eralchemy2"] docker = ["apache-airflow-providers-docker"] druid = ["apache-airflow-providers-apache-druid"] @@ -301,9 +312,9 @@ ftp = ["apache-airflow-providers-ftp"] gcp = ["apache-airflow-providers-google"] gcp-api = ["apache-airflow-providers-google"] github = ["apache-airflow-providers-github"] -github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.3)"] +github-enterprise = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] google = ["apache-airflow-providers-google"] -google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.3)"] +google-auth = ["authlib (>=1.0.0)", "flask-appbuilder[oauth] (==4.3.6)"] grpc = ["apache-airflow-providers-grpc"] hashicorp = ["apache-airflow-providers-hashicorp"] hdfs = ["apache-airflow-providers-apache-hdfs"] @@ -340,7 +351,6 @@ plexus = ["apache-airflow-providers-plexus"] postgres = ["apache-airflow-providers-postgres"] presto = ["apache-airflow-providers-presto"] qds = ["apache-airflow-providers-qubole"] -qubole = ["apache-airflow-providers-qubole"] rabbitmq = ["amqp"] redis = ["apache-airflow-providers-redis"] s3 = ["apache-airflow-providers-amazon"] @@ -1846,7 +1856,7 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-appbuilder" -version = "4.3.3" +version = "4.3.6" description = "Simple and rapid application development framework, built on top of Flask. includes detailed security, auto CRUD generation for your models, google charts and much more." category = "dev" optional = false @@ -1878,6 +1888,7 @@ WTForms = "<4" jmespath = ["jmespath (>=0.9.5)"] oauth = ["Authlib (>=0.14,<2.0.0)"] openid = ["Flask-OpenID (>=1.2.5,<2)"] +talisman = ["flask-talisman (>=1.0.0,<2.0)"] [[package]] name = "flask-babel" @@ -3415,17 +3426,6 @@ category = "main" optional = false python-versions = ">=3.7" -[[package]] -name = "oscrypto" -version = "1.3.0" -description = "TLS (SSL) sockets, key generation, encryption, decryption, signing, verification and KDFs using the OS crypto libraries. Does not require a compiler, and relies on the OS for patching. Works on Windows, OS X and Linux/BSD." 
-category = "main" -optional = true -python-versions = "*" - -[package.dependencies] -asn1crypto = ">=1.5.1" - [[package]] name = "packaging" version = "23.1" @@ -3436,7 +3436,7 @@ python-versions = ">=3.7" [[package]] name = "pandas" -version = "1.5.3" +version = "2.0.3" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -3448,11 +3448,32 @@ numpy = [ {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] -python-dateutil = ">=2.8.1" +python-dateutil = ">=2.8.2" pytz = ">=2020.1" - -[package.extras] -test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"] +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] [[package]] name = "parsedatetime" @@ -3658,11 +3679,11 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "pyarrow" -version = "10.0.1" +version = "14.0.1" description = "Python library for Apache Arrow" category = "main" optional = true -python-versions = ">=3.7" +python-versions = ">=3.8" [package.dependencies] numpy = ">=1.16.6" @@ -3722,28 +3743,32 @@ category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -[[package]] -name = "pycryptodomex" -version = "3.18.0" -description = "Cryptographic library for Python" -category = "main" -optional = true -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" - [[package]] name = "pydantic" -version = "1.10.12" -description = "Data validation and settings management using python type hints" +version = "2.5.0" 
+description = "Data validation using Python type hints" category = "main" optional = false python-versions = ">=3.7" [package.dependencies] -typing-extensions = ">=4.2.0" +annotated-types = ">=0.4.0" +pydantic-core = "2.14.1" +typing-extensions = ">=4.6.1" [package.extras] -dotenv = ["python-dotenv (>=0.10.4)"] -email = ["email-validator (>=1.0.3)"] +email = ["email-validator (>=2.0.0)"] + +[[package]] +name = "pydantic-core" +version = "2.14.1" +description = "" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydoc-markdown" @@ -4435,7 +4460,7 @@ python-versions = ">=3.7" [[package]] name = "snowflake-connector-python" -version = "3.1.1" +version = "3.5.0" description = "Snowflake Connector for Python" category = "main" optional = true @@ -4450,12 +4475,10 @@ cryptography = ">=3.1.0,<42.0.0" filelock = ">=3.5,<4" idna = ">=2.5,<4" keyring = {version = "<16.1.0 || >16.1.0,<25.0.0", optional = true, markers = "extra == \"secure-local-storage\""} -oscrypto = "<2.0.0" packaging = "*" pandas = {version = ">=1.0.0,<2.1.0", optional = true, markers = "extra == \"pandas\""} -platformdirs = ">=2.6.0,<3.9.0" -pyarrow = {version = ">=10.0.1,<10.1.0", optional = true, markers = "extra == \"pandas\""} -pycryptodomex = ">=3.2,<3.5.0 || >3.5.0,<4.0.0" +platformdirs = ">=2.6.0,<4.0.0" +pyarrow = {version = "*", optional = true, markers = "extra == \"pandas\""} pyjwt = "<3.0.0" pyOpenSSL = ">=16.2.0,<24.0.0" pytz = "*" @@ -4463,11 +4486,11 @@ requests = "<3.0.0" sortedcontainers = ">=2.4.0" tomlkit = "*" typing-extensions = ">=4.3,<5" -urllib3 = ">=1.21.1,<1.27" +urllib3 = ">=1.21.1,<2.0.0" [package.extras] -development = ["Cython", "coverage", "more-itertools", "numpy (<1.26.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] -pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow (>=10.0.1,<10.1.0)"] +development = ["Cython", "coverage", "more-itertools", "numpy (<1.27.0)", "pendulum (!=2.1.1)", "pexpect", "pytest (<7.5.0)", "pytest-cov", "pytest-rerunfailures", "pytest-timeout", "pytest-xdist", "pytzdata"] +pandas = ["pandas (>=1.0.0,<2.1.0)", "pyarrow"] secure-local-storage = ["keyring (!=16.1.0,<25.0.0)"] [[package]] @@ -5063,7 +5086,6 @@ motherduck = ["duckdb", "pyarrow"] mssql = ["pyodbc"] parquet = ["pyarrow"] postgres = ["psycopg2-binary", "psycopg2cffi"] -pydantic = ["pydantic"] qdrant = ["qdrant-client"] redshift = ["psycopg2-binary", "psycopg2cffi"] s3 = ["s3fs", "botocore"] @@ -5073,7 +5095,7 @@ weaviate = ["weaviate-client"] [metadata] lock-version = "1.1" python-versions = ">=3.8.1,<3.13" -content-hash = "f857c300f44dadb0cf25af0016c5baf5318097a2d6b0d7035f6aaa2e7fb592b2" +content-hash = "bbfaab078877deaa60ecf6bc95c0374e1967268ca24594a99b792b88c4ef270b" [metadata.files] about-time = [ @@ -5197,6 +5219,10 @@ alive-progress = [ {file = "alive-progress-3.1.4.tar.gz", hash = "sha256:74a95d8d0d42bc99d3a3725dbd06ebb852245f1b64e301a7c375b92b22663f7b"}, {file = "alive_progress-3.1.4-py3-none-any.whl", hash = "sha256:c80ad87ce9c1054b01135a87fae69ecebbfc2107497ae87cbe6aec7e534903db"}, ] +annotated-types = [ + {file = "annotated_types-0.6.0-py3-none-any.whl", hash = "sha256:0641064de18ba7a25dee8f96403ebc39113d0cb953a01429249d5c7564666a43"}, + {file = "annotated_types-0.6.0.tar.gz", hash = "sha256:563339e807e53ffd9c267e99fc6d9ea23eb8443c08f112651963e24e22f84a5d"}, +] ansicon = [ {file = 
"ansicon-1.89.0-py2.py3-none-any.whl", hash = "sha256:f1def52d17f65c2c9682cf8370c03f541f410c1752d6a14029f97318e4b9dfec"}, {file = "ansicon-1.89.0.tar.gz", hash = "sha256:e4d039def5768a47e4afec8e89e83ec3ae5a26bf00ad851f914d1240b444d2b1"}, @@ -5206,8 +5232,8 @@ anyio = [ {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, ] apache-airflow = [ - {file = "apache-airflow-2.7.0.tar.gz", hash = "sha256:06fba3df5943b6eda5e2f033e7e45b6ea557d89909ca36e61614ea61075f9722"}, - {file = "apache_airflow-2.7.0-py3-none-any.whl", hash = "sha256:8e3cf4b3cd8583a2e76bd04827af8d34747e0cf30a28cf0e70f4f4f39ce61f6d"}, + {file = "apache-airflow-2.7.2.tar.gz", hash = "sha256:c6fab3449066867d9a7728f40b6b9e27f1ea68bca39b064a27f5c5ddc3262224"}, + {file = "apache_airflow-2.7.2-py3-none-any.whl", hash = "sha256:1bc2c022bcae24b911e49fafd5fb619b49efba87ed7bc8561a2065810d8fe899"}, ] apache-airflow-providers-common-sql = [ {file = "apache-airflow-providers-common-sql-1.7.1.tar.gz", hash = "sha256:ba37f795d9656a87cf4661edc381b8ecfe930272c59324b59f8a158fd0971aeb"}, @@ -5784,8 +5810,8 @@ flask = [ {file = "Flask-2.2.5.tar.gz", hash = "sha256:edee9b0a7ff26621bd5a8c10ff484ae28737a2410d99b0bb9a6850c7fb977aa0"}, ] flask-appbuilder = [ - {file = "Flask-AppBuilder-4.3.3.tar.gz", hash = "sha256:b420379f74788e431a2763f8d3749cc37712df682dc00a45538d85d989340768"}, - {file = "Flask_AppBuilder-4.3.3-py3-none-any.whl", hash = "sha256:7eb1904d8f61297778ebf0d0b83f1d74b154534c9e84af3bb9198cfc0f51ff05"}, + {file = "Flask-AppBuilder-4.3.6.tar.gz", hash = "sha256:8ca9710fa7d2704747d195e11b487d45a571f40559d8399d9d5dfa42ea1f3c78"}, + {file = "Flask_AppBuilder-4.3.6-py3-none-any.whl", hash = "sha256:840480dfd43134bebf78f3c7dc909e324c2689d2d9f27aeb1880a8a25466bc8d"}, ] flask-babel = [ {file = "Flask-Babel-2.0.0.tar.gz", hash = "sha256:f9faf45cdb2e1a32ea2ec14403587d4295108f35017a7821a2b1acb8cfd9257d"}, @@ -7055,42 +7081,36 @@ orjson = [ {file = "orjson-3.9.5-cp39-none-win_amd64.whl", hash = "sha256:91dda66755795ac6100e303e206b636568d42ac83c156547634256a2e68de694"}, {file = "orjson-3.9.5.tar.gz", hash = "sha256:6daf5ee0b3cf530b9978cdbf71024f1c16ed4a67d05f6ec435c6e7fe7a52724c"}, ] -oscrypto = [ - {file = "oscrypto-1.3.0-py2.py3-none-any.whl", hash = "sha256:2b2f1d2d42ec152ca90ccb5682f3e051fb55986e1b170ebde472b133713e7085"}, - {file = "oscrypto-1.3.0.tar.gz", hash = "sha256:6f5fef59cb5b3708321db7cca56aed8ad7e662853351e7991fcf60ec606d47a4"}, -] packaging = [ {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] pandas = [ - {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3749077d86e3a2f0ed51367f30bf5b82e131cc0f14260c4d3e499186fccc4406"}, - {file = "pandas-1.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:972d8a45395f2a2d26733eb8d0f629b2f90bebe8e8eddbb8829b180c09639572"}, - {file = "pandas-1.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:50869a35cbb0f2e0cd5ec04b191e7b12ed688874bd05dd777c19b28cbea90996"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3ac844a0fe00bfaeb2c9b51ab1424e5c8744f89860b138434a363b1f620f354"}, - {file = "pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a0a56cef15fd1586726dace5616db75ebcfec9179a3a55e78f72c5639fa2a23"}, - {file = 
"pandas-1.5.3-cp310-cp310-win_amd64.whl", hash = "sha256:478ff646ca42b20376e4ed3fa2e8d7341e8a63105586efe54fa2508ee087f328"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6973549c01ca91ec96199e940495219c887ea815b2083722821f1d7abfa2b4dc"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c39a8da13cede5adcd3be1182883aea1c925476f4e84b2807a46e2775306305d"}, - {file = "pandas-1.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f76d097d12c82a535fda9dfe5e8dd4127952b45fea9b0276cb30cca5ea313fbc"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e474390e60ed609cec869b0da796ad94f420bb057d86784191eefc62b65819ae"}, - {file = "pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f2b952406a1588ad4cad5b3f55f520e82e902388a6d5a4a91baa8d38d23c7f6"}, - {file = "pandas-1.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:bc4c368f42b551bf72fac35c5128963a171b40dce866fb066540eeaf46faa003"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:14e45300521902689a81f3f41386dc86f19b8ba8dd5ac5a3c7010ef8d2932813"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9842b6f4b8479e41968eced654487258ed81df7d1c9b7b870ceea24ed9459b31"}, - {file = "pandas-1.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:26d9c71772c7afb9d5046e6e9cf42d83dd147b5cf5bcb9d97252077118543792"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fbcb19d6fceb9e946b3e23258757c7b225ba450990d9ed63ccceeb8cae609f7"}, - {file = "pandas-1.5.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:565fa34a5434d38e9d250af3c12ff931abaf88050551d9fbcdfafca50d62babf"}, - {file = "pandas-1.5.3-cp38-cp38-win32.whl", hash = "sha256:87bd9c03da1ac870a6d2c8902a0e1fd4267ca00f13bc494c9e5a9020920e1d51"}, - {file = "pandas-1.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:41179ce559943d83a9b4bbacb736b04c928b095b5f25dd2b7389eda08f46f373"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c74a62747864ed568f5a82a49a23a8d7fe171d0c69038b38cedf0976831296fa"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c4c00e0b0597c8e4f59e8d461f797e5d70b4d025880516a8261b2817c47759ee"}, - {file = "pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a50d9a4336a9621cab7b8eb3fb11adb82de58f9b91d84c2cd526576b881a0c5a"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd05f7783b3274aa206a1af06f0ceed3f9b412cf665b7247eacd83be41cf7bf0"}, - {file = "pandas-1.5.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f69c4029613de47816b1bb30ff5ac778686688751a5e9c99ad8c7031f6508e5"}, - {file = "pandas-1.5.3-cp39-cp39-win32.whl", hash = "sha256:7cec0bee9f294e5de5bbfc14d0573f65526071029d036b753ee6507d2a21480a"}, - {file = "pandas-1.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:dfd681c5dc216037e0b0a2c821f5ed99ba9f03ebcf119c7dac0e9a7b960b9ec9"}, - {file = "pandas-1.5.3.tar.gz", hash = "sha256:74a3fd7e5a7ec052f183273dc7b0acd3a863edf7520f5d3a1765c04ffdb3b0b1"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = 
"pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, ] parsedatetime = [ {file = "parsedatetime-2.4-py2-none-any.whl", hash = "sha256:9ee3529454bf35c40a77115f5a596771e59e1aee8c53306f346c461b8e913094"}, @@ -7268,31 +7288,42 @@ py = [ {file = 
"py-1.11.0.tar.gz", hash = "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719"}, ] pyarrow = [ - {file = "pyarrow-10.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:e00174764a8b4e9d8d5909b6d19ee0c217a6cf0232c5682e31fdfbd5a9f0ae52"}, - {file = "pyarrow-10.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:6f7a7dbe2f7f65ac1d0bd3163f756deb478a9e9afc2269557ed75b1b25ab3610"}, - {file = "pyarrow-10.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb627673cb98708ef00864e2e243f51ba7b4c1b9f07a1d821f98043eccd3f585"}, - {file = "pyarrow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba71e6fc348c92477586424566110d332f60d9a35cb85278f42e3473bc1373da"}, - {file = "pyarrow-10.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7b4ede715c004b6fc535de63ef79fa29740b4080639a5ff1ea9ca84e9282f349"}, - {file = "pyarrow-10.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:e3fe5049d2e9ca661d8e43fab6ad5a4c571af12d20a57dffc392a014caebef65"}, - {file = "pyarrow-10.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:254017ca43c45c5098b7f2a00e995e1f8346b0fb0be225f042838323bb55283c"}, - {file = "pyarrow-10.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70acca1ece4322705652f48db65145b5028f2c01c7e426c5d16a30ba5d739c24"}, - {file = "pyarrow-10.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abb57334f2c57979a49b7be2792c31c23430ca02d24becd0b511cbe7b6b08649"}, - {file = "pyarrow-10.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:1765a18205eb1e02ccdedb66049b0ec148c2a0cb52ed1fb3aac322dfc086a6ee"}, - {file = "pyarrow-10.0.1-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:61f4c37d82fe00d855d0ab522c685262bdeafd3fbcb5fe596fe15025fbc7341b"}, - {file = "pyarrow-10.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e141a65705ac98fa52a9113fe574fdaf87fe0316cde2dffe6b94841d3c61544c"}, - {file = "pyarrow-10.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf26f809926a9d74e02d76593026f0aaeac48a65b64f1bb17eed9964bfe7ae1a"}, - {file = "pyarrow-10.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:443eb9409b0cf78df10ced326490e1a300205a458fbeb0767b6b31ab3ebae6b2"}, - {file = "pyarrow-10.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f2d00aa481becf57098e85d99e34a25dba5a9ade2f44eb0b7d80c80f2984fc03"}, - {file = "pyarrow-10.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b1fc226d28c7783b52a84d03a66573d5a22e63f8a24b841d5fc68caeed6784d4"}, - {file = "pyarrow-10.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:efa59933b20183c1c13efc34bd91efc6b2997377c4c6ad9272da92d224e3beb1"}, - {file = "pyarrow-10.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:668e00e3b19f183394388a687d29c443eb000fb3fe25599c9b4762a0afd37775"}, - {file = "pyarrow-10.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d1bc6e4d5d6f69e0861d5d7f6cf4d061cf1069cb9d490040129877acf16d4c2a"}, - {file = "pyarrow-10.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:42ba7c5347ce665338f2bc64685d74855900200dac81a972d49fe127e8132f75"}, - {file = "pyarrow-10.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b069602eb1fc09f1adec0a7bdd7897f4d25575611dfa43543c8b8a75d99d6874"}, - {file = "pyarrow-10.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94fb4a0c12a2ac1ed8e7e2aa52aade833772cf2d3de9dde685401b22cec30002"}, - {file = 
"pyarrow-10.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db0c5986bf0808927f49640582d2032a07aa49828f14e51f362075f03747d198"}, - {file = "pyarrow-10.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:0ec7587d759153f452d5263dbc8b1af318c4609b607be2bd5127dcda6708cdb1"}, - {file = "pyarrow-10.0.1.tar.gz", hash = "sha256:1a14f57a5f472ce8234f2964cd5184cccaa8df7e04568c64edc33b23eb285dd5"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, + {file = "pyarrow-14.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a8ae88c0038d1bc362a682320112ee6774f006134cd5afc291591ee4bc06505"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f6f053cb66dc24091f5511e5920e45c83107f954a21032feadc7b9e3a8e7851"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:906b0dc25f2be12e95975722f1e60e162437023f490dbd80d0deb7375baf3171"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:78d4a77a46a7de9388b653af1c4ce539350726cd9af62e0831e4f2bd0c95a2f4"}, + {file = "pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:06ca79080ef89d6529bb8e5074d4b4f6086143b2520494fcb7cf8a99079cde93"}, + {file = "pyarrow-14.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:32542164d905002c42dff896efdac79b3bdd7291b1b74aa292fac8450d0e4dcd"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:c7331b4ed3401b7ee56f22c980608cf273f0380f77d0f73dd3c185f78f5a6220"}, + {file = "pyarrow-14.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:922e8b49b88da8633d6cac0e1b5a690311b6758d6f5d7c2be71acb0f1e14cd61"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58c889851ca33f992ea916b48b8540735055201b177cb0dcf0596a495a667b00"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:30d8494870d9916bb53b2a4384948491444741cb9a38253c590e21f836b01222"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:be28e1a07f20391bb0b15ea03dcac3aade29fc773c5eb4bee2838e9b2cdde0cb"}, + {file = "pyarrow-14.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:981670b4ce0110d8dcb3246410a4aabf5714db5d8ea63b15686bce1c914b1f83"}, + {file = "pyarrow-14.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:4756a2b373a28f6166c42711240643fb8bd6322467e9aacabd26b488fa41ec23"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:cf87e2cec65dd5cf1aa4aba918d523ef56ef95597b545bbaad01e6433851aa10"}, + {file = "pyarrow-14.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:470ae0194fbfdfbf4a6b65b4f9e0f6e1fa0ea5b90c1ee6b65b38aecee53508c8"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6263cffd0c3721c1e348062997babdf0151301f7353010c9c9a8ed47448f82ab"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8089d7e77d1455d529dbd7cff08898bbb2666ee48bc4085203af1d826a33cc"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:fada8396bc739d958d0b81d291cfd201126ed5e7913cb73de6bc606befc30226"}, + {file = "pyarrow-14.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:2a145dab9ed7849fc1101bf03bcdc69913547f10513fdf70fc3ab6c0a50c7eee"}, + {file = "pyarrow-14.0.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:05fe7994745b634c5fb16ce5717e39a1ac1fac3e2b0795232841660aa76647cd"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:a8eeef015ae69d104c4c3117a6011e7e3ecd1abec79dc87fd2fac6e442f666ee"}, + {file = "pyarrow-14.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c76807540989fe8fcd02285dd15e4f2a3da0b09d27781abec3adc265ddbeba1"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:450e4605e3c20e558485f9161a79280a61c55efe585d51513c014de9ae8d393f"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:323cbe60210173ffd7db78bfd50b80bdd792c4c9daca8843ef3cd70b186649db"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:0140c7e2b740e08c5a459439d87acd26b747fc408bde0a8806096ee0baaa0c15"}, + {file = "pyarrow-14.0.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:e592e482edd9f1ab32f18cd6a716c45b2c0f2403dc2af782f4e9674952e6dd27"}, + {file = "pyarrow-14.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:d264ad13605b61959f2ae7c1d25b1a5b8505b112715c961418c8396433f213ad"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:01e44de9749cddc486169cb632f3c99962318e9dacac7778315a110f4bf8a450"}, + {file = "pyarrow-14.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d0351fecf0e26e152542bc164c22ea2a8e8c682726fce160ce4d459ea802d69c"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33c1f6110c386464fd2e5e4ea3624466055bbe681ff185fd6c9daa98f30a3f9a"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11e045dfa09855b6d3e7705a37c42e2dc2c71d608fab34d3c23df2e02df9aec3"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:097828b55321897db0e1dbfc606e3ff8101ae5725673498cbfa7754ee0da80e4"}, + {file = "pyarrow-14.0.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1daab52050a1c48506c029e6fa0944a7b2436334d7e44221c16f6f1b2cc9c510"}, + {file = "pyarrow-14.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:3f6d5faf4f1b0d5a7f97be987cf9e9f8cd39902611e818fe134588ee99bf0283"}, + {file = "pyarrow-14.0.1.tar.gz", hash = "sha256:b8b3f4fe8d4ec15e1ef9b599b94683c5216adaed78d5cb4c606180546d1e2ee1"}, ] pyasn1 = [ {file = "pyasn1-0.5.0-py2.py3-none-any.whl", hash = "sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57"}, @@ -7314,77 +7345,112 @@ pycparser = [ {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, ] -pycryptodomex = [ - {file = "pycryptodomex-3.18.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:160a39a708c36fa0b168ab79386dede588e62aec06eb505add870739329aecc6"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:c2953afebf282a444c51bf4effe751706b4d0d63d7ca2cc51db21f902aa5b84e"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:ba95abd563b0d1b88401658665a260852a8e6c647026ee6a0a65589287681df8"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-manylinux2014_aarch64.whl", hash = "sha256:192306cf881fe3467dda0e174a4f47bb3a8bb24b90c9cdfbdc248eec5fc0578c"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-musllinux_1_1_aarch64.whl", hash = "sha256:f9ab5ef0718f6a8716695dea16d83b671b22c45e9c0c78fd807c32c0192e54b5"}, - {file = 
"pycryptodomex-3.18.0-cp27-cp27m-win32.whl", hash = "sha256:50308fcdbf8345e5ec224a5502b4215178bdb5e95456ead8ab1a69ffd94779cb"}, - {file = "pycryptodomex-3.18.0-cp27-cp27m-win_amd64.whl", hash = "sha256:4d9379c684efea80fdab02a3eb0169372bca7db13f9332cb67483b8dc8b67c37"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:5594a125dae30d60e94f37797fc67ce3c744522de7992c7c360d02fdb34918f8"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:8ff129a5a0eb5ff16e45ca4fa70a6051da7f3de303c33b259063c19be0c43d35"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-manylinux2014_aarch64.whl", hash = "sha256:3d9314ac785a5b75d5aaf924c5f21d6ca7e8df442e5cf4f0fefad4f6e284d422"}, - {file = "pycryptodomex-3.18.0-cp27-cp27mu-musllinux_1_1_aarch64.whl", hash = "sha256:f237278836dda412a325e9340ba2e6a84cb0f56b9244781e5b61f10b3905de88"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac614363a86cc53d8ba44b6c469831d1555947e69ab3276ae8d6edc219f570f7"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-macosx_10_9_x86_64.whl", hash = "sha256:302a8f37c224e7b5d72017d462a2be058e28f7be627bdd854066e16722d0fc0c"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux2014_aarch64.whl", hash = "sha256:6421d23d6a648e83ba2670a352bcd978542dad86829209f59d17a3f087f4afef"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d84e105787f5e5d36ec6a581ff37a1048d12e638688074b2a00bcf402f9aa1c2"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6875eb8666f68ddbd39097867325bd22771f595b4e2b0149739b5623c8bf899b"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:27072a494ce621cc7a9096bbf60ed66826bb94db24b49b7359509e7951033e74"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_i686.whl", hash = "sha256:1949e09ea49b09c36d11a951b16ff2a05a0ffe969dda1846e4686ee342fe8646"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6ed3606832987018615f68e8ed716a7065c09a0fe94afd7c9ca1b6777f0ac6eb"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-win32.whl", hash = "sha256:d56c9ec41258fd3734db9f5e4d2faeabe48644ba9ca23b18e1839b3bdf093222"}, - {file = "pycryptodomex-3.18.0-cp35-abi3-win_amd64.whl", hash = "sha256:e00a4bacb83a2627e8210cb353a2e31f04befc1155db2976e5e239dd66482278"}, - {file = "pycryptodomex-3.18.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:2dc4eab20f4f04a2d00220fdc9258717b82d31913552e766d5f00282c031b70a"}, - {file = "pycryptodomex-3.18.0-pp27-pypy_73-win32.whl", hash = "sha256:75672205148bdea34669173366df005dbd52be05115e919551ee97171083423d"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bec6c80994d4e7a38312072f89458903b65ec99bed2d65aa4de96d997a53ea7a"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d35a8ffdc8b05e4b353ba281217c8437f02c57d7233363824e9d794cf753c419"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76f0a46bee539dae4b3dfe37216f678769349576b0080fdbe431d19a02da42ff"}, - {file = "pycryptodomex-3.18.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:71687eed47df7e965f6e0bf3cadef98f368d5221f0fb89d2132effe1a3e6a194"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = 
"sha256:73d64b32d84cf48d9ec62106aa277dbe99ab5fbfd38c5100bc7bddd3beb569f7"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbdcce0a226d9205560a5936b05208c709b01d493ed8307792075dedfaaffa5f"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:58fc0aceb9c961b9897facec9da24c6a94c5db04597ec832060f53d4d6a07196"}, - {file = "pycryptodomex-3.18.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:215be2980a6b70704c10796dd7003eb4390e7be138ac6fb8344bf47e71a8d470"}, - {file = "pycryptodomex-3.18.0.tar.gz", hash = "sha256:3e3ecb5fe979e7c1bb0027e518340acf7ee60415d79295e5251d13c68dde576e"}, -] pydantic = [ - {file = "pydantic-1.10.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a1fcb59f2f355ec350073af41d927bf83a63b50e640f4dbaa01053a28b7a7718"}, - {file = "pydantic-1.10.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b7ccf02d7eb340b216ec33e53a3a629856afe1c6e0ef91d84a4e6f2fb2ca70fe"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8fb2aa3ab3728d950bcc885a2e9eff6c8fc40bc0b7bb434e555c215491bcf48b"}, - {file = "pydantic-1.10.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:771735dc43cf8383959dc9b90aa281f0b6092321ca98677c5fb6125a6f56d58d"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ca48477862372ac3770969b9d75f1bf66131d386dba79506c46d75e6b48c1e09"}, - {file = "pydantic-1.10.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a5e7add47a5b5a40c49b3036d464e3c7802f8ae0d1e66035ea16aa5b7a3923ed"}, - {file = "pydantic-1.10.12-cp310-cp310-win_amd64.whl", hash = "sha256:e4129b528c6baa99a429f97ce733fff478ec955513630e61b49804b6cf9b224a"}, - {file = "pydantic-1.10.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0d191db0f92dfcb1dec210ca244fdae5cbe918c6050b342d619c09d31eea0cc"}, - {file = "pydantic-1.10.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:795e34e6cc065f8f498c89b894a3c6da294a936ee71e644e4bd44de048af1405"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69328e15cfda2c392da4e713443c7dbffa1505bc9d566e71e55abe14c97ddc62"}, - {file = "pydantic-1.10.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2031de0967c279df0d8a1c72b4ffc411ecd06bac607a212892757db7462fc494"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:ba5b2e6fe6ca2b7e013398bc7d7b170e21cce322d266ffcd57cca313e54fb246"}, - {file = "pydantic-1.10.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:2a7bac939fa326db1ab741c9d7f44c565a1d1e80908b3797f7f81a4f86bc8d33"}, - {file = "pydantic-1.10.12-cp311-cp311-win_amd64.whl", hash = "sha256:87afda5539d5140cb8ba9e8b8c8865cb5b1463924d38490d73d3ccfd80896b3f"}, - {file = "pydantic-1.10.12-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:549a8e3d81df0a85226963611950b12d2d334f214436a19537b2efed61b7639a"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:598da88dfa127b666852bef6d0d796573a8cf5009ffd62104094a4fe39599565"}, - {file = "pydantic-1.10.12-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ba5c4a8552bff16c61882db58544116d021d0b31ee7c66958d14cf386a5b5350"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_i686.whl", 
hash = "sha256:c79e6a11a07da7374f46970410b41d5e266f7f38f6a17a9c4823db80dadf4303"}, - {file = "pydantic-1.10.12-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab26038b8375581dc832a63c948f261ae0aa21f1d34c1293469f135fa92972a5"}, - {file = "pydantic-1.10.12-cp37-cp37m-win_amd64.whl", hash = "sha256:e0a16d274b588767602b7646fa05af2782576a6cf1022f4ba74cbb4db66f6ca8"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a9dfa722316f4acf4460afdf5d41d5246a80e249c7ff475c43a3a1e9d75cf62"}, - {file = "pydantic-1.10.12-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:a73f489aebd0c2121ed974054cb2759af8a9f747de120acd2c3394cf84176ccb"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b30bcb8cbfccfcf02acb8f1a261143fab622831d9c0989707e0e659f77a18e0"}, - {file = "pydantic-1.10.12-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2fcfb5296d7877af406ba1547dfde9943b1256d8928732267e2653c26938cd9c"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:2f9a6fab5f82ada41d56b0602606a5506aab165ca54e52bc4545028382ef1c5d"}, - {file = "pydantic-1.10.12-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:dea7adcc33d5d105896401a1f37d56b47d443a2b2605ff8a969a0ed5543f7e33"}, - {file = "pydantic-1.10.12-cp38-cp38-win_amd64.whl", hash = "sha256:1eb2085c13bce1612da8537b2d90f549c8cbb05c67e8f22854e201bde5d98a47"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:ef6c96b2baa2100ec91a4b428f80d8f28a3c9e53568219b6c298c1125572ebc6"}, - {file = "pydantic-1.10.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6c076be61cd0177a8433c0adcb03475baf4ee91edf5a4e550161ad57fc90f523"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d5a58feb9a39f481eda4d5ca220aa8b9d4f21a41274760b9bc66bfd72595b86"}, - {file = "pydantic-1.10.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5f805d2d5d0a41633651a73fa4ecdd0b3d7a49de4ec3fadf062fe16501ddbf1"}, - {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:1289c180abd4bd4555bb927c42ee42abc3aee02b0fb2d1223fb7c6e5bef87dbe"}, - {file = "pydantic-1.10.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5d1197e462e0364906cbc19681605cb7c036f2475c899b6f296104ad42b9f5fb"}, - {file = "pydantic-1.10.12-cp39-cp39-win_amd64.whl", hash = "sha256:fdbdd1d630195689f325c9ef1a12900524dceb503b00a987663ff4f58669b93d"}, - {file = "pydantic-1.10.12-py3-none-any.whl", hash = "sha256:b749a43aa51e32839c9d71dc67eb1e4221bb04af1033a32e3923d46f9effa942"}, - {file = "pydantic-1.10.12.tar.gz", hash = "sha256:0fe8a415cea8f340e7a9af9c54fc71a649b43e8ca3cc732986116b3cb135d303"}, + {file = "pydantic-2.5.0-py3-none-any.whl", hash = "sha256:7ce6e766c456ad026fe5712f7bcf036efc34bd5d107b3e669ef7ea01b3a9050c"}, + {file = "pydantic-2.5.0.tar.gz", hash = "sha256:69bd6fb62d2d04b7055f59a396993486a2ee586c43a0b89231ce0000de07627c"}, +] +pydantic-core = [ + {file = "pydantic_core-2.14.1-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:812beca1dcb2b722cccc7e9c620bd972cbc323321194ec2725eab3222e6ac573"}, + {file = "pydantic_core-2.14.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2ccdc53cb88e51c7d47d74c59630d7be844428f6b8d463055ffad6f0392d8da"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd937733bf2fe7d6a8bf208c12741f1f730b7bf5636033877767a75093c29b8a"}, 
+ {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:581bb606a31749a00796f5257947a0968182d7fe91e1dada41f06aeb6bfbc91a"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aadf74a40a7ae49c3c1aa7d32334fe94f4f968e21dd948e301bb4ed431fb2412"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b89821a2c77cc1b8f2c1fc3aacd6a3ecc5df8f7e518dc3f18aef8c4dcf66003d"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49ee28d65f506b2858a60745cc974ed005298ebab12693646b97641dd7c99c35"}, + {file = "pydantic_core-2.14.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:97246f896b4df7fd84caa8a75a67abb95f94bc0b547665bf0889e3262b060399"}, + {file = "pydantic_core-2.14.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1185548665bc61bbab0dc78f10c8eafa0db0aa1e920fe9a451b77782b10a65cc"}, + {file = "pydantic_core-2.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2a7d08b39fac97540fba785fce3b21ee01a81f081a07a4d031efd791da6666f9"}, + {file = "pydantic_core-2.14.1-cp310-none-win32.whl", hash = "sha256:0a8c8daf4e3aa3aeb98e3638fc3d58a359738f3d12590b2474c6bb64031a0764"}, + {file = "pydantic_core-2.14.1-cp310-none-win_amd64.whl", hash = "sha256:4f0788699a92d604f348e9c1ac5e97e304e97127ba8325c7d0af88dcc7d35bd3"}, + {file = "pydantic_core-2.14.1-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:2be018a84995b6be1bbd40d6064395dbf71592a981169cf154c0885637f5f54a"}, + {file = "pydantic_core-2.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fc3227408808ba7df8e95eb1d8389f4ba2203bed8240b308de1d7ae66d828f24"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42d5d0e9bbb50481a049bd0203224b339d4db04006b78564df2b782e2fd16ebc"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bc6a4ea9f88a810cb65ccae14404da846e2a02dd5c0ad21dee712ff69d142638"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d312ad20e3c6d179cb97c42232b53111bcd8dcdd5c1136083db9d6bdd489bc73"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:679cc4e184f213c8227862e57340d12fd4d4d19dc0e3ddb0f653f86f01e90f94"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:101df420e954966868b8bc992aefed5fa71dd1f2755104da62ee247abab28e2f"}, + {file = "pydantic_core-2.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c964c0cc443d6c08a2347c0e5c1fc2d85a272dc66c1a6f3cde4fc4843882ada4"}, + {file = "pydantic_core-2.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8276bbab68a9dbe721da92d19cbc061f76655248fe24fb63969d0c3e0e5755e7"}, + {file = "pydantic_core-2.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:12163197fec7c95751a3c71b36dcc1909eed9959f011ffc79cc8170a6a74c826"}, + {file = "pydantic_core-2.14.1-cp311-none-win32.whl", hash = "sha256:b8ff0302518dcd001bd722bbe342919c29e5066c7eda86828fe08cdc112668b8"}, + {file = "pydantic_core-2.14.1-cp311-none-win_amd64.whl", hash = "sha256:59fa83873223f856d898452c6162a390af4297756f6ba38493a67533387d85d9"}, + {file = "pydantic_core-2.14.1-cp311-none-win_arm64.whl", hash = "sha256:798590d38c9381f07c48d13af1f1ef337cebf76ee452fcec5deb04aceced51c7"}, + {file = 
"pydantic_core-2.14.1-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:587d75aec9ae50d0d63788cec38bf13c5128b3fc1411aa4b9398ebac884ab179"}, + {file = "pydantic_core-2.14.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26242e3593d4929123615bd9365dd86ef79b7b0592d64a96cd11fd83c69c9f34"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5879ac4791508d8f0eb7dec71ff8521855180688dac0c55f8c99fc4d1a939845"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ad9ea86f5fc50f1b62c31184767fe0cacaa13b54fe57d38898c3776d30602411"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:102ac85a775e77821943ae38da9634ddd774b37a8d407181b4f7b05cdfb36b55"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2459cc06572730e079ec1e694e8f68c99d977b40d98748ae72ff11ef21a56b0b"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:217dcbfaf429a9b8f1d54eb380908b9c778e78f31378283b30ba463c21e89d5d"}, + {file = "pydantic_core-2.14.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9d59e0d7cdfe8ed1d4fcd28aad09625c715dc18976c7067e37d8a11b06f4be3e"}, + {file = "pydantic_core-2.14.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e2be646a5155d408e68b560c0553e8a83dc7b9f90ec6e5a2fc3ff216719385db"}, + {file = "pydantic_core-2.14.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ffba979801e3931a19cd30ed2049450820effe8f152aaa317e2fd93795d318d7"}, + {file = "pydantic_core-2.14.1-cp312-none-win32.whl", hash = "sha256:132b40e479cb5cebbbb681f77aaceabbc8355df16c9124cff1d4060ada83cde2"}, + {file = "pydantic_core-2.14.1-cp312-none-win_amd64.whl", hash = "sha256:744b807fe2733b6da3b53e8ad93e8b3ea3ee3dfc3abece4dd2824cc1f39aa343"}, + {file = "pydantic_core-2.14.1-cp312-none-win_arm64.whl", hash = "sha256:24ba48f9d0b8d64fc5e42e1600366c3d7db701201294989aebdaca23110c02ab"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:ba55d73a2df4771b211d0bcdea8b79454980a81ed34a1d77a19ddcc81f98c895"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:e905014815687d88cbb14bbc0496420526cf20d49f20606537d87646b70f1046"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:443dc5eede7fa76b2370213e0abe881eb17c96f7d694501853c11d5d56916602"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:abae6fd5504e5e438e4f6f739f8364fd9ff5a5cdca897e68363e2318af90bc28"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9486e27bb3f137f33e2315be2baa0b0b983dae9e2f5f5395240178ad8e644728"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69df82892ff00491d673b1929538efb8c8d68f534fdc6cb7fd3ac8a5852b9034"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:184ff7b30c3f60e1b775378c060099285fd4b5249271046c9005f8b247b39377"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3d5b2a4b3c10cad0615670cab99059441ff42e92cf793a0336f4bc611e895204"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:871c641a83719caaa856a11dcc61c5e5b35b0db888e1a0d338fe67ce744575e2"}, + {file = "pydantic_core-2.14.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:1e7208946ea9b27a8cef13822c339d4ae96e45952cc01fc4a91c7f1cb0ae2861"}, + {file = "pydantic_core-2.14.1-cp37-none-win32.whl", hash = "sha256:b4ff385a525017f5adf6066d7f9fb309f99ade725dcf17ed623dc7dce1f85d9f"}, + {file = "pydantic_core-2.14.1-cp37-none-win_amd64.whl", hash = "sha256:c7411cd06afeb263182e38c6ca5b4f5fe4f20d91466ad7db0cd6af453a02edec"}, + {file = "pydantic_core-2.14.1-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:2871daf5b2823bf77bf7d3d43825e5d904030c155affdf84b21a00a2e00821d2"}, + {file = "pydantic_core-2.14.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7977e261cac5f99873dc2c6f044315d09b19a71c4246560e1e67593889a90978"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5a111f9158555582deadd202a60bd7803b6c68f406391b7cf6905adf0af6811"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ac417312bf6b7a0223ba73fb12e26b2854c93bf5b1911f7afef6d24c379b22aa"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c36987f5eb2a7856b5f5feacc3be206b4d1852a6ce799f6799dd9ffb0cba56ae"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6e98227eb02623d57e1fd061788837834b68bb995a869565211b9abf3de4bf4"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:023b6d7ec4e97890b28eb2ee24413e69a6d48de4e8b75123957edd5432f4eeb3"}, + {file = "pydantic_core-2.14.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6015beb28deb5306049ecf2519a59627e9e050892927850a884df6d5672f8c7d"}, + {file = "pydantic_core-2.14.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:3f48d4afd973abbd65266ac24b24de1591116880efc7729caf6b6b94a9654c9e"}, + {file = "pydantic_core-2.14.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:28734bcfb8fc5b03293dec5eb5ea73b32ff767f6ef79a31f6e41dad2f5470270"}, + {file = "pydantic_core-2.14.1-cp38-none-win32.whl", hash = "sha256:3303113fdfaca927ef11e0c5f109e2ec196c404f9d7ba5f8ddb63cdf287ea159"}, + {file = "pydantic_core-2.14.1-cp38-none-win_amd64.whl", hash = "sha256:144f2c1d5579108b6ed1193fcc9926124bd4142b0f7020a7744980d1235c8a40"}, + {file = "pydantic_core-2.14.1-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:893bf4fb9bfb9c4639bc12f3de323325ada4c6d60e478d5cded65453e9364890"}, + {file = "pydantic_core-2.14.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:052d8731aaf844f91fe4cd3faf28983b109a5865b3a256ec550b80a5689ead87"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb1c6ecb53e4b907ee8486f453dd940b8cbb509946e2b671e3bf807d310a96fc"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:94cf6d0274eb899d39189144dcf52814c67f9b0fd196f211420d9aac793df2da"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36c3bf96f803e207a80dbcb633d82b98ff02a9faa76dd446e969424dec8e2b9f"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fb290491f1f0786a7da4585250f1feee200fc17ff64855bdd7c42fb54526fa29"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6590ed9d13eb51b28ea17ddcc6c8dbd6050b4eb589d497105f0e13339f223b72"}, + {file = "pydantic_core-2.14.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:69cd74e55a5326d920e7b46daa2d81c2bdb8bcf588eafb2330d981297b742ddc"}, + {file = "pydantic_core-2.14.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d965bdb50725a805b083f5f58d05669a85705f50a6a864e31b545c589290ee31"}, + {file = "pydantic_core-2.14.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ca942a2dc066ca5e04c27feaa8dfb9d353ddad14c6641660c565149186095343"}, + {file = "pydantic_core-2.14.1-cp39-none-win32.whl", hash = "sha256:72c2ef3787c3b577e5d6225d73a77167b942d12cef3c1fbd5e74e55b7f881c36"}, + {file = "pydantic_core-2.14.1-cp39-none-win_amd64.whl", hash = "sha256:55713d155da1e508083c4b08d0b1ad2c3054f68b8ef7eb3d3864822e456f0bb5"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:53efe03cc383a83660cfdda6a3cb40ee31372cedea0fde0b2a2e55e838873ab6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:f523e116879bc6714e61d447ce934676473b068069dce6563ea040381dc7a257"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85bb66d661be51b2cba9ca06759264b3469d2dbb53c3e6effb3f05fec6322be6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f53a3ccdc30234cb4342cec541e3e6ed87799c7ca552f0b5f44e3967a5fed526"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:1bfb63821ada76719ffcd703fc40dd57962e0d8c253e3c565252e6de6d3e0bc6"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:e2c689439f262c29cf3fcd5364da1e64d8600facecf9eabea8643b8755d2f0de"}, + {file = "pydantic_core-2.14.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:a15f6e5588f7afb7f6fc4b0f4ff064749e515d34f34c666ed6e37933873d8ad8"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-macosx_10_7_x86_64.whl", hash = "sha256:f1a30eef060e21af22c7d23349f1028de0611f522941c80efa51c05a63142c62"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:16f4a7e1ec6b3ea98a1e108a2739710cd659d68b33fbbeaba066202cab69c7b6"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fd80a2d383940eec3db6a5b59d1820f947317acc5c75482ff8d79bf700f8ad6a"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:a68a36d71c7f638dda6c9e6b67f6aabf3fa1471b198d246457bfdc7c777cdeb7"}, + {file = "pydantic_core-2.14.1-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:ebc79120e105e4bcd7865f369e3b9dbabb0d492d221e1a7f62a3e8e292550278"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:c8c466facec2ccdf025b0b1455b18f2c3d574d5f64d24df905d3d7b8f05d5f4e"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:b91b5ec423e88caa16777094c4b2b97f11453283e7a837e5e5e1b886abba1251"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130e49aa0cb316f743bc7792c36aefa39fc2221312f1d4b333b19edbdd71f2b1"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f483467c046f549572f8aca3b7128829e09ae3a9fe933ea421f7cb7c58120edb"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", 
hash = "sha256:dee4682bd7947afc682d342a8d65ad1834583132383f8e801601a8698cb8d17a"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:8d927d042c0ef04607ee7822828b208ab045867d20477ec6593d612156798547"}, + {file = "pydantic_core-2.14.1-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:5a1570875eb0d1479fb2270ed80c88c231aaaf68b0c3f114f35e7fb610435e4f"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:cb2fd3ab67558eb16aecfb4f2db4febb4d37dc74e6b8613dc2e7160fb58158a9"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:a7991f25b98038252363a03e6a9fe92e60fe390fda2631d238dc3b0e396632f8"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b45b7be9f99991405ecd6f6172fb6798908a8097106ae78d5cc5cc15121bad9"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:51506e7652a2ef1d1cf763c4b51b972ff4568d1dddc96ca83931a6941f5e6389"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:66dc0e63349ec39c1ea66622aa5c2c1f84382112afd3ab2fa0cca4fb01f7db39"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:8e17f0c3ba4cb07faa0038a59ce162de584ed48ba645c8d05a5de1e40d4c21e7"}, + {file = "pydantic_core-2.14.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d983222223f63e323a5f497f5b85e211557a5d8fb670dc88f343784502b466ba"}, + {file = "pydantic_core-2.14.1.tar.gz", hash = "sha256:0d82a6ee815388a362885186e431fac84c7a06623bc136f508e9f88261d8cadb"}, ] pydoc-markdown = [ {file = "pydoc_markdown-4.8.2-py3-none-any.whl", hash = "sha256:203f74119e6bb2f9deba43d452422de7c8ec31955b61e0620fa4dd8c2611715f"}, @@ -8093,27 +8159,27 @@ sniffio = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] snowflake-connector-python = [ - {file = "snowflake-connector-python-3.1.1.tar.gz", hash = "sha256:2700503a5f99d6e22e412d7cf4fd2211296cc0e50b2a38ad9c6f48ddb8beff67"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3aec4ab6f6d66a0dc2b5bbd8fc2c11fd76090c63fdc65577af9d4e28055c51f2"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5d2589f39b1c1c91eda6711181afb7f197f7dd43204f26db48df90849d9f528b"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c540b4fe173cc9a24df285ce49c70fe0dadc6316b8a2160324c549086a71a118"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25007ccf5d9c0b87e29af40470f6f1e76d03621642a7492d62282215b7e9d67d"}, - {file = "snowflake_connector_python-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:fff3caebd8b60cee09ad55674d12b8940b9d5f57a394c8467637167372710841"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7b7622be7bcad26786bf771341e3b4819df6e4d7858e5dd4c8700423ca7364e"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:260d259a79e6120bf58fcec9a52705fd02a430f296a77a1531720906b7a02f5e"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0163d5036f05a39977c6d7aba5e8bb1632be1117785a72e2602e3a34b89ded1c"}, - {file = 
"snowflake_connector_python-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d38546ebcba7bca37a16cfcbbc0f8e7c19946b4e45e0c5dc2a8963f3b739958"}, - {file = "snowflake_connector_python-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:484044c2d9aacd5c8a0a9d8d8b69b06352e3612f23c5e44d54771a96047d80b1"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e4a4aab55a4a3236625b738fad19524c9cef810fe041d567dc5dc1d9b1f9eb7"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:5d95eeaff7b085b0c8facab40391bede699ffc0865f2cdaa37b19a8429d47943"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a944a1862672552f8c00b98b576a8b16da46f9c5b918ba4b969bd7d1205c32a"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7abb142ba3ee5db6c61be0dc578fa10e59b7c1f33716b0c93ae6706b2a8bbee3"}, - {file = "snowflake_connector_python-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:bf6ca8f8678dea6cf5275f69dbd9e4ebb18c2211be35379b65175e36e5953b92"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ceb263b95720ab645c2e60e37d436db51321e0192d399631d052387728911689"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:8b7fe82d8d1cdc90caadbcce419d3bcbf1bdeffb9bba974a81a46f389d8ee243"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d46b798507f6c7447e21c76bd71969e22e55fa848196f20de73b3e2b65373b5"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1bdcce7069368b7b2ec8a855812c1b0e9e6bdf6b01660225ffff5ba163fa507d"}, - {file = "snowflake_connector_python-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:daedeff774cd68df05e68dbfa66e83a877e63a99461b8262eb5c8cd37e309aa7"}, + {file = "snowflake-connector-python-3.5.0.tar.gz", hash = "sha256:654e4a1f68a491544bd8f7c5ab02eb8531df67c5f4309d5253bd204044f8a1b3"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a365fa4f23be27a4a46d04f73a48ccb1ddad5b9558f100ba592a49571c90a33c"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:5b648b8f32aa540e9adf14e84ea5d77a6c3c6cbc3cbcf172622a0b8db0e99384"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:722dc0100c3247788aeb975a8a5941f2f757e8524d2626cf6fe78df02b6384fb"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7029b8776c5d2153ed2b0254dc23ae1e3bde141b6634fc6c77b919ed29d5bb42"}, + {file = "snowflake_connector_python-3.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:3472703fc4f308343d925c41dab976a42e10192fa0b8b9025e80b083ad7dcf1b"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:40f4a376b6da875d70383b60c66ad3723f0bed21d8bdbf7afb39525cb70c70ef"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-macosx_11_0_x86_64.whl", hash = "sha256:8a08d8df6f1b5b5d0bf9145e6339dbeaf294392529629d0bd7e4dd3e49d7892c"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac16a00bb3824069303e119cd049858c2caf92d174f9486ba273d19abf06a18d"}, + {file 
= "snowflake_connector_python-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a820148b64436621b5db79c2e7848d5d12ece13b0948281c19dd2f8a50e4dbe"}, + {file = "snowflake_connector_python-3.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:ffa8f95a767e5077e82cf290a43950f37cfc25e34935f038abc96494a1595a03"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:ef70cd89aee56fbbaeb68dc1f7612598b0c8a470d16ddb68ca7657bd70cbf8d7"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:09ff23c1aa4bf9e148e491512a81b097ce0b1c2a870f3d0bb0dc5febf764c45c"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e696f133c57494dce57a68a92d1e2cf20334361400fe3c4c73637627f7d9c0ec"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0136a9fb45013ea3d50045acb3cedb50b2d5d6ac1d0f9adc538e28cf86a1386"}, + {file = "snowflake_connector_python-3.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:133e2a8a5e7b59d84e83886bb516d290edbd0b92dd69304f8f7ac613faca2aeb"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c463d11b05b57c40eb83d84044d761535a855e498ffd52456e92eed333e43b17"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:cdd198dbc0aff373bb9e95f315cdc0b922ae61186ba9bd7da4950835827cd7f9"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d8769b95a46040261a46dc58757c59b26e6122466222d8b8e518ea6aa62e83d"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee97a8ac0aaf40a7b7420c8936a66d8d33376cd40498ac3d38efa7bb5712d14a"}, + {file = "snowflake_connector_python-3.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:e8cd747e2719ba44dd2ce0e9b1e6f8b03485b2b335a352f3b45138b56fad5888"}, ] sortedcontainers = [ {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, diff --git a/pyproject.toml b/pyproject.toml index 7307634cd6..6798df3696 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dlt" -version = "0.3.24" +version = "0.4.1a0" description = "dlt is an open-source python-first scalable data loading library that does not require any backend to run." authors = ["dltHub Inc. 
"] maintainers = [ "Marcin Rudolf ", "Adrian Brudaru ", "Ty Dunn "] @@ -57,7 +57,7 @@ psycopg2-binary = {version = ">=2.9.1", optional = true} psycopg2cffi = {version = ">=2.9.0", optional = true, markers="platform_python_implementation == 'PyPy'"} grpcio = {version = ">=1.50.0", optional = true} google-cloud-bigquery = {version = ">=2.26.0", optional = true} -pyarrow = {version = ">=8.0.0", optional = true} +pyarrow = {version = ">=12.0.0", optional = true} duckdb = {version = ">=0.6.1,<0.10.0", optional = true} dbt-core = {version = ">=1.2.0", optional = true} dbt-redshift = {version = ">=1.2.0", optional = true} @@ -68,12 +68,11 @@ dbt-athena-community = {version = ">=1.2.0", optional = true} s3fs = {version = ">=2022.4.0", optional = true} gcsfs = {version = ">=2022.4.0", optional = true} botocore = {version = ">=1.28", optional = true} -snowflake-connector-python = {version = ">=3.1.1", optional = true, extras = ["pandas"]} +snowflake-connector-python = {version = ">=3.5.0", optional = true, extras = ["pandas"]} cron-descriptor = {version = ">=1.2.32", optional = true} pipdeptree = {version = ">=2.9.0,<2.10", optional = true} pyathena = {version = ">=2.9.6", optional = true} weaviate-client = {version = ">=3.22", optional = true} -pydantic = {version = ">=1.10,<2.0", optional = true} adlfs = {version = ">=2022.4.0", optional = true} pyodbc = {version = "^4.0.39", optional = true} qdrant-client = {version = "^1.6.4", optional = true, extras = ["fastembed"]} @@ -97,7 +96,6 @@ motherduck = ["duckdb", "pyarrow"] cli = ["pipdeptree", "cron-descriptor"] athena = ["pyathena", "pyarrow", "s3fs", "botocore"] weaviate = ["weaviate-client"] -pydantic = ["pydantic"] mssql = ["pyodbc"] qdrant = ["qdrant-client"] @@ -107,13 +105,8 @@ dlt = "dlt.cli._dlt:_main" [tool.poetry.group.dev.dependencies] requests-mock = "^1.10.0" types-click = "^7.1.8" -pandas = "^1.5.3" sqlfluff = "^2.3.2" -google-auth-oauthlib = "^1.0.0" types-deprecated = "^1.2.9.2" -tqdm = "^4.65.0" -enlighten = "^1.11.2" -alive-progress = "^3.1.1" pytest-console-scripts = "^1.4.1" pytest = "^6.2.4" mypy = "^1.6.1" @@ -139,6 +132,17 @@ types-tqdm = "^4.66.0.2" types-psutil = "^5.9.5.16" types-psycopg2 = "^2.9.21.14" +[tool.poetry.group.pipeline] +optional=true + +[tool.poetry.group.pipeline.dependencies] +google-auth-oauthlib = "^1.0.0" +tqdm = "^4.65.0" +enlighten = "^1.11.2" +alive-progress = "^3.1.1" +pydantic = ">2" +pandas = ">2" + [tool.poetry.group.airflow] optional = true @@ -151,6 +155,9 @@ optional = true [tool.poetry.group.providers.dependencies] google-api-python-client = "^2.86.0" +[tool.poetry.group.sentry-sdk] +optional = true + [tool.poetry.group.sentry-sdk.dependencies] sentry-sdk = "^1.5.6" diff --git a/pytest.ini b/pytest.ini index fc7ce9119b..88c8353a69 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,4 +6,5 @@ xfail_strict= true log_cli= 1 log_cli_level= INFO python_files = test_*.py *_test.py *snippets.py *snippet.pytest -python_functions = *_test test_* *_snippet \ No newline at end of file +python_functions = *_test test_* *_snippet +filterwarnings= ignore::DeprecationWarning \ No newline at end of file diff --git a/tests/cases.py b/tests/cases.py index ca8a97082e..70c20d74af 100644 --- a/tests/cases.py +++ b/tests/cases.py @@ -333,7 +333,14 @@ def assert_all_data_types_row( assert db_mapping == expected_rows -def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = True, include_time: bool = True, num_rows: int = 3) -> Tuple[Any, List[Dict[str, Any]]]: +def arrow_table_all_data_types( + 
object_format: TArrowFormat, + include_json: bool = True, + include_time: bool = True, + include_not_normalized_name: bool = True, + include_name_clash: bool = False, + num_rows: int = 3 +) -> Tuple[Any, List[Dict[str, Any]]]: """Create an arrow object or pandas dataframe with all supported data types. Returns the table and its records in python format @@ -342,7 +349,6 @@ def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = from dlt.common.libs.pyarrow import pyarrow as pa data = { - "Pre Normalized Column": [random.choice(ascii_lowercase) for _ in range(num_rows)], "string": [random.choice(ascii_lowercase) for _ in range(num_rows)], "float": [round(random.uniform(0, 100), 4) for _ in range(num_rows)], "int": [random.randrange(0, 100) for _ in range(num_rows)], @@ -355,6 +361,12 @@ def arrow_table_all_data_types(object_format: TArrowFormat, include_json: bool = "null": pd.Series( [None for _ in range(num_rows)]) } + if include_name_clash: + data["pre Normalized Column"] = [random.choice(ascii_lowercase) for _ in range(num_rows)] + include_not_normalized_name = True + if include_not_normalized_name: + data["Pre Normalized Column"] = [random.choice(ascii_lowercase) for _ in range(num_rows)] + if include_json: data["json"] = [{"a": random.randrange(0, 100)} for _ in range(num_rows)] diff --git a/tests/cli/cases/deploy_pipeline/dummy_pipeline.py b/tests/cli/cases/deploy_pipeline/dummy_pipeline.py new file mode 100644 index 0000000000..48e13c35cd --- /dev/null +++ b/tests/cli/cases/deploy_pipeline/dummy_pipeline.py @@ -0,0 +1,20 @@ +import dlt + + +@dlt.resource +def example_resource(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id=0): + yield [api_url, api_key, str(last_id), "param4", "param5"] + + +@dlt.source +def example_source(api_url=dlt.config.value, api_key=dlt.secrets.value, last_id = 0): + # return all the resources to be loaded + return example_resource(api_url, api_key, last_id) + + +if __name__ == '__main__': + p = dlt.pipeline(pipeline_name="dummy_pipeline", destination="dummy") + load_info = p.run( + example_source(last_id=819273998) + ) + print(load_info) diff --git a/tests/cli/common/test_cli_invoke.py b/tests/cli/common/test_cli_invoke.py index 99f34eeaa7..e3a7676ad1 100644 --- a/tests/cli/common/test_cli_invoke.py +++ b/tests/cli/common/test_cli_invoke.py @@ -1,8 +1,13 @@ import os +import shutil +from subprocess import CalledProcessError +import pytest from pytest_console_scripts import ScriptRunner from unittest.mock import patch +import dlt from dlt.common.configuration.paths import get_dlt_data_dir +from dlt.common.runners.venv import Venv from dlt.common.utils import custom_environ, set_working_dir from dlt.common.pipeline import get_dlt_pipelines_dir @@ -35,7 +40,7 @@ def test_invoke_basic(script_runner: ScriptRunner) -> None: def test_invoke_list_pipelines(script_runner: ScriptRunner) -> None: result = script_runner.run(['dlt', 'pipeline', '--list-pipelines']) # directory does not exist (we point to TEST_STORAGE) - assert result.returncode == 1 + assert result.returncode == -2 # create empty os.makedirs(get_dlt_pipelines_dir()) @@ -43,11 +48,45 @@ def test_invoke_list_pipelines(script_runner: ScriptRunner) -> None: assert result.returncode == 0 assert "No pipelines found in" in result.stdout + +def test_invoke_pipeline(script_runner: ScriptRunner) -> None: # info on non existing pipeline result = script_runner.run(['dlt', 'pipeline', 'debug_pipeline', 'info']) - assert result.returncode == 1 + assert result.returncode == -1 
assert "the pipeline was not found in" in result.stderr + # copy dummy pipeline + p = dlt.pipeline(pipeline_name="dummy_pipeline") + p._wipe_working_folder() + + shutil.copytree("tests/cli/cases/deploy_pipeline", TEST_STORAGE_ROOT, dirs_exist_ok=True) + + with set_working_dir(TEST_STORAGE_ROOT): + with custom_environ({"COMPETED_PROB": "1.0", "DLT_DATA_DIR": get_dlt_data_dir()}): + venv = Venv.restore_current() + venv.run_script("dummy_pipeline.py") + # we check output test_pipeline_command else + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'info']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'trace']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'failed-jobs']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'load-package']) + assert result.returncode == 0 + result = script_runner.run(['dlt', 'pipeline', 'dummy_pipeline', 'load-package', "NON EXISTENT"]) + assert result.returncode == -2 + try: + # use debug flag to raise an exception + result = script_runner.run(['dlt', '--debug', 'pipeline', 'dummy_pipeline', 'load-package', "NON EXISTENT"]) + # exception terminates command + assert result.returncode == 1 + assert "LoadPackageNotFound" in result.stderr + finally: + # reset debug flag so other tests may pass + from dlt.cli import _dlt + _dlt.DEBUG_FLAG = False + def test_invoke_init_chess_and_template(script_runner: ScriptRunner) -> None: with set_working_dir(TEST_STORAGE_ROOT): diff --git a/tests/cli/test_pipeline_command.py b/tests/cli/test_pipeline_command.py index 1ffc0c66aa..19bb5fa277 100644 --- a/tests/cli/test_pipeline_command.py +++ b/tests/cli/test_pipeline_command.py @@ -1,6 +1,7 @@ import io import os import contextlib +import pytest from subprocess import CalledProcessError import dlt @@ -44,7 +45,7 @@ def test_pipeline_command_operations(repo_dir: str, project_files: FileStorage) pipeline_command.pipeline_command("info", "chess_pipeline", None, 0) _out = buf.getvalue() # do we have duckdb destination - assert "dlt.destinations.duckdb" in _out + assert "destination: duckdb" in _out print(_out) with io.StringIO() as buf, contextlib.redirect_stdout(buf): @@ -142,7 +143,6 @@ def test_pipeline_command_failed_jobs(repo_dir: str, project_files: FileStorage) try: pipeline = dlt.attach(pipeline_name="chess_pipeline") - print(pipeline.working_dir) pipeline.drop() except Exception as e: print(e) @@ -168,3 +168,42 @@ def test_pipeline_command_failed_jobs(repo_dir: str, project_files: FileStorage) _out = buf.getvalue() # actual failed job data assert "JOB file type: jsonl" in _out + + +def test_pipeline_command_drop_partial_loads(repo_dir: str, project_files: FileStorage) -> None: + init_command.init_command("chess", "dummy", False, repo_dir) + + try: + pipeline = dlt.attach(pipeline_name="chess_pipeline") + pipeline.drop() + except Exception as e: + print(e) + + # now run the pipeline + os.environ["EXCEPTION_PROB"] = "1.0" + os.environ["FAIL_IN_INIT"] = "False" + os.environ["TIMEOUT"] = "1.0" + venv = Venv.restore_current() + with pytest.raises(CalledProcessError) as cpe: + print(venv.run_script("chess_pipeline.py")) + assert "Dummy job status raised exception" in cpe.value.stdout + + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + pipeline_command.pipeline_command("info", "chess_pipeline", None, 1) + _out = buf.getvalue() + # one package is partially loaded + assert 'This package is partially 
loaded' in _out + print(_out) + + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + with echo.always_choose(False, True): + pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1) + _out = buf.getvalue() + assert 'Pending packages deleted' in _out + print(_out) + + with io.StringIO() as buf, contextlib.redirect_stdout(buf): + pipeline_command.pipeline_command("drop-pending-packages", "chess_pipeline", None, 1) + _out = buf.getvalue() + assert 'No pending packages found' in _out + print(_out) \ No newline at end of file diff --git a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml index fd612df987..f8645d78ae 100644 --- a/tests/common/cases/schemas/eth/ethereum_schema_v7.yml +++ b/tests/common/cases/schemas/eth/ethereum_schema_v7.yml @@ -1,5 +1,5 @@ -version: 14 -version_hash: ZbDv9+tdJK7P/4QIB0qqHzqNSsVynVx90GL4giV8/p0= +version: 15 +version_hash: yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE= engine_version: 7 name: ethereum tables: @@ -27,6 +27,7 @@ tables: name: schema_version_hash write_disposition: skip description: Created by DLT. Tracks completed loads + schema_contract: {} name: _dlt_loads resource: _dlt_loads _dlt_version: @@ -57,13 +58,13 @@ tables: name: schema write_disposition: skip description: Created by DLT. Tracks schema updates + schema_contract: {} name: _dlt_version resource: _dlt_version blocks: description: Ethereum blocks x-annotation: this will be preserved on save write_disposition: append - table_sealed: true filters: includes: [] excludes: [] @@ -159,6 +160,7 @@ tables: nullable: false data_type: text name: transactions_root + schema_contract: {} name: blocks resource: blocks blocks__transactions: @@ -421,7 +423,6 @@ tables: name: value name: blocks__uncles settings: - schema_sealed: true default_hints: foreign_key: - _dlt_parent_id @@ -441,6 +442,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp + schema_contract: {} normalizers: names: dlt.common.normalizers.names.snake_case json: @@ -454,6 +456,4 @@ normalizers: blocks: timestamp: block_timestamp hash: block_hash -ancestors: -- Q/LxiP7taycE+u9PQNb2wiit+G5GntiifOUK2CFM3sQ= diff --git a/tests/common/data_writers/test_buffered_writer.py b/tests/common/data_writers/test_buffered_writer.py index 85cfcb2d0c..c275f22b2b 100644 --- a/tests/common/data_writers/test_buffered_writer.py +++ b/tests/common/data_writers/test_buffered_writer.py @@ -1,28 +1,14 @@ -import os -from typing import Iterator, Set, Literal +from typing import Iterator import pytest -from dlt.common.data_writers.buffered import BufferedDataWriter, DataWriter from dlt.common.data_writers.exceptions import BufferedDataWriterClosed -from dlt.common.destination import TLoaderFileFormat, DestinationCapabilitiesContext from dlt.common.schema.utils import new_column from dlt.common.storages.file_storage import FileStorage from dlt.common.typing import DictStrAny -from tests.utils import TEST_STORAGE_ROOT, write_version, autouse_test_storage -import datetime # noqa: 251 - - -ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = {"insert_values", "jsonl", "parquet", "arrow", "puae-jsonl"} - - -def get_writer(_format: TLoaderFileFormat = "insert_values", buffer_max_items: int = 10, disable_compression: bool = False) -> BufferedDataWriter[DataWriter]: - caps = DestinationCapabilitiesContext.generic_capabilities() - caps.preferred_loader_file_format = _format - file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") - return 
BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, disable_compression=disable_compression, _caps=caps) +from tests.common.data_writers.utils import ALL_WRITERS, get_writer def test_write_no_item() -> None: @@ -175,47 +161,3 @@ def test_writer_optional_schema(disable_compression: bool) -> None: with get_writer(_format="jsonl", disable_compression=disable_compression) as writer: writer.write_data_item([{"col1": 1}], None) writer.write_data_item([{"col1": 1}], None) - - -@pytest.mark.parametrize("writer_format", ALL_WRITERS - {"arrow"}) -def test_writer_items_count(writer_format: TLoaderFileFormat) -> None: - c1 = {"col1": new_column("col1", "bigint")} - with get_writer(_format=writer_format) as writer: - assert writer._buffered_items_count == 0 - # single item - writer.write_data_item({"col1": 1}, columns=c1) - assert writer._buffered_items_count == 1 - # list - writer.write_data_item([{"col1": 1}, {"col1": 2}], columns=c1) - assert writer._buffered_items_count == 3 - writer._flush_items() - assert writer._buffered_items_count == 0 - assert writer._writer.items_count == 3 - - -def test_writer_items_count_arrow() -> None: - import pyarrow as pa - c1 = {"col1": new_column("col1", "bigint")} - with get_writer(_format="arrow") as writer: - assert writer._buffered_items_count == 0 - # single item - writer.write_data_item(pa.Table.from_pylist([{"col1": 1}]), columns=c1) - assert writer._buffered_items_count == 1 - # single item with many rows - writer.write_data_item(pa.Table.from_pylist([{"col1": 1}, {"col1": 2}]), columns=c1) - assert writer._buffered_items_count == 3 - # empty list - writer.write_data_item([], columns=c1) - assert writer._buffered_items_count == 3 - # list with one item - writer.write_data_item([pa.Table.from_pylist([{"col1": 1}])], columns=c1) - assert writer._buffered_items_count == 4 - # list with many items - writer.write_data_item( - [pa.Table.from_pylist([{"col1": 1}]), pa.Table.from_pylist([{"col1": 1}, {"col1": 2}])], - columns=c1 - ) - assert writer._buffered_items_count == 7 - writer._flush_items() - assert writer._buffered_items_count == 0 - assert writer._writer.items_count == 7 diff --git a/tests/common/data_writers/test_data_writers.py b/tests/common/data_writers/test_data_writers.py index 66b8f765c7..9d655bc4db 100644 --- a/tests/common/data_writers/test_data_writers.py +++ b/tests/common/data_writers/test_data_writers.py @@ -5,7 +5,7 @@ from dlt.common import pendulum, json from dlt.common.typing import AnyFun # from dlt.destinations.postgres import capabilities -from dlt.destinations.redshift import capabilities as redshift_caps +from dlt.destinations.impl.redshift import capabilities as redshift_caps from dlt.common.data_writers.escape import escape_redshift_identifier, escape_bigquery_identifier, escape_redshift_literal, escape_postgres_literal, escape_duckdb_literal from dlt.common.data_writers.writers import DataWriter, InsertValuesWriter, JsonlWriter, ParquetDataWriter diff --git a/tests/common/data_writers/utils.py b/tests/common/data_writers/utils.py new file mode 100644 index 0000000000..e1a071903f --- /dev/null +++ b/tests/common/data_writers/utils.py @@ -0,0 +1,17 @@ +import os +from typing import Set, Literal + + +from dlt.common.data_writers.buffered import BufferedDataWriter, DataWriter +from dlt.common.destination import TLoaderFileFormat, DestinationCapabilitiesContext + +from tests.utils import TEST_STORAGE_ROOT + +ALL_WRITERS: Set[Literal[TLoaderFileFormat]] = {"insert_values", "jsonl", "parquet", "arrow", 
"puae-jsonl"} + + +def get_writer(_format: TLoaderFileFormat = "insert_values", buffer_max_items: int = 10, disable_compression: bool = False) -> BufferedDataWriter[DataWriter]: + caps = DestinationCapabilitiesContext.generic_capabilities() + caps.preferred_loader_file_format = _format + file_template = os.path.join(TEST_STORAGE_ROOT, f"{_format}.%s") + return BufferedDataWriter(_format, file_template, buffer_max_items=buffer_max_items, disable_compression=disable_compression, _caps=caps) diff --git a/tests/common/normalizers/test_json_relational.py b/tests/common/normalizers/test_json_relational.py index 7169044117..91b5a93466 100644 --- a/tests/common/normalizers/test_json_relational.py +++ b/tests/common/normalizers/test_json_relational.py @@ -321,6 +321,59 @@ def test_list_position(norm: RelationalNormalizer) -> None: # print(rows) +def test_control_descending(norm: RelationalNormalizer) -> None: + row: StrAny = { + "f": [{ + "l": ["a", "b", "c"], + "v": 120, + "lo": [[{"e": "a"}, {"e": "b"}, {"e":"c"}]] + }], + "g": "val" + } + + # break at first row + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + # won't yield anything else + with pytest.raises(StopIteration): + rows_gen.send(False) + + # prevent yielding descendants of "f" but yield all else + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + (table, _), _ = rows_gen.send(True) + assert table == "table__f" + # won't yield anything else + with pytest.raises(StopIteration): + rows_gen.send(False) + + # descend into "l" + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + rows_gen.send(True) + (table, _), one_row = rows_gen.send(True) + assert table == "table__f__l" + assert one_row["value"] == "a" + # get next element in the list - even with sending False - we do not descend + (table, _), one_row = rows_gen.send(False) + assert table == "table__f__l" + assert one_row["value"] == "b" + + # prevent descending into list of lists + rows_gen = norm.normalize_data_item(row, "load_id", "table") + rows_gen.send(None) + rows_gen.send(True) + # yield "l" + next(rows_gen) + next(rows_gen) + next(rows_gen) + (table, _), one_row = rows_gen.send(True) + assert table == "table__f__lo" + # do not descend into lists + with pytest.raises(StopIteration): + rows_gen.send(False) + + def test_list_in_list() -> None: chats = { "_dlt_id": "123456", diff --git a/tests/common/schema/test_merges.py b/tests/common/schema/test_merges.py index 64e90c7c21..2eb903f041 100644 --- a/tests/common/schema/test_merges.py +++ b/tests/common/schema/test_merges.py @@ -202,14 +202,17 @@ def test_diff_tables() -> None: # ignore identical table props existing = deepcopy(table) changed["write_disposition"] = "append" + changed["schema_contract"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", "description": "new description", "write_disposition": "append", + "schema_contract": "freeze", "columns": {} } existing["write_disposition"] = "append" + existing["schema_contract"] = "freeze" partial = utils.diff_tables(deepcopy(existing), changed) assert partial == { "name": "new name", diff --git a/tests/common/schema/test_schema.py b/tests/common/schema/test_schema.py index c84c25574f..104b634491 100644 --- a/tests/common/schema/test_schema.py +++ b/tests/common/schema/test_schema.py @@ -132,7 +132,7 @@ def test_simple_regex_validator() -> None: def test_load_corrupted_schema() -> None: - eth_v4: TStoredSchema = 
load_yml_case("schemas/eth/ethereum_schema_v4") + eth_v4: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") del eth_v4["tables"]["blocks"] with pytest.raises(ParentTableNotFoundException): utils.validate_stored_schema(eth_v4) @@ -203,13 +203,21 @@ def test_replace_schema_content() -> None: eth_v5: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v5") eth_v5["imported_version_hash"] = "IMP_HASH" schema_eth = Schema.from_dict(eth_v5) # type: ignore[arg-type] - schema_eth.bump_version() schema.replace_schema_content(schema_eth) assert schema_eth.stored_version_hash == schema.stored_version_hash assert schema_eth.version == schema.version assert schema_eth.version_hash == schema.version_hash assert schema_eth._imported_version_hash == schema._imported_version_hash + # replace content of modified schema + eth_v5 = load_yml_case("schemas/eth/ethereum_schema_v5") + schema_eth = Schema.from_dict(eth_v5, bump_version=False) # type: ignore[arg-type] + assert schema_eth.version_hash != schema_eth.stored_version_hash + # replace content does not bump version + schema = Schema("simple") + schema.replace_schema_content(schema_eth) + assert schema.version_hash != schema.stored_version_hash + @pytest.mark.parametrize("columns,hint,value", [ (["_dlt_id", "_dlt_root_id", "_dlt_load_id", "_dlt_parent_id", "_dlt_list_idx"], "nullable", False), diff --git a/tests/common/schema/test_schema_contract.py b/tests/common/schema/test_schema_contract.py new file mode 100644 index 0000000000..2f6b4743f3 --- /dev/null +++ b/tests/common/schema/test_schema_contract.py @@ -0,0 +1,324 @@ +from typing import cast + +import pytest +import copy + +from dlt.common.schema import Schema, DEFAULT_SCHEMA_CONTRACT_MODE, TSchemaContractDict +from dlt.common.schema.exceptions import DataValidationError +from dlt.common.schema.typing import TTableSchema + +def get_schema() -> Schema: + s = Schema("event") + + columns = { + "column_1": { + "name": "column_1", + "data_type": "text" + }, + "column_2": { + "name": "column_2", + "data_type": "bigint", + "is_variant": True + } + } + + incomplete_columns = { + "incomplete_column_1": { + "name": "incomplete_column_1", + }, + "incomplete_column_2": { + "name": "incomplete_column_2", + } + } + + + # add some tables + s.update_table(cast(TTableSchema, { + "name": "tables", + "columns": columns + })) + + s.update_table(cast(TTableSchema, { + "name": "child_table", + "parent": "tables", + "columns": columns + })) + + s.update_table(cast(TTableSchema, { + "name": "incomplete_table", + "columns": incomplete_columns + })) + + s.update_table(cast(TTableSchema, { + "name": "mixed_table", + "columns": {**incomplete_columns, **columns} + })) + + s.update_table(cast(TTableSchema, { + "name": "evolve_once_table", + "x-normalizer": {"evolve-columns-once": True}, + "columns": {**incomplete_columns, **columns} + })) + + return s + + +def test_resolve_contract_settings() -> None: + + # defaults + schema = get_schema() + assert schema.resolve_contract_settings_for_table("tables") == DEFAULT_SCHEMA_CONTRACT_MODE + assert schema.resolve_contract_settings_for_table("child_table") == DEFAULT_SCHEMA_CONTRACT_MODE + + # table specific full setting + schema = get_schema() + schema.tables["tables"]["schema_contract"] = "freeze" + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "freeze", + "data_type": 
"freeze" + } + + # table specific single setting + schema = get_schema() + schema.tables["tables"]["schema_contract"] = { + "tables": "freeze", + "columns": "discard_value", + } + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + + # schema specific full setting + schema = get_schema() + schema._settings["schema_contract"] = "freeze" + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } + + # schema specific single setting + schema = get_schema() + schema._settings["schema_contract"] = { + "tables": "freeze", + "columns": "discard_value", + } + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "freeze", + "columns": "discard_value", + "data_type": "evolve" + } + + # mixed settings: table setting always prevails + schema = get_schema() + schema._settings["schema_contract"] = "freeze" + schema.tables["tables"]["schema_contract"] = { + "tables": "evolve", + "columns": "discard_value", + } + assert schema.resolve_contract_settings_for_table("tables") == { + "tables": "evolve", + "columns": "discard_value", + "data_type": "evolve" + } + assert schema.resolve_contract_settings_for_table("child_table") == { + "tables": "evolve", + "columns": "discard_value", + "data_type": "evolve" + } + + +# ensure other settings do not interfere with the main setting we are testing +base_settings = [{ + "tables": "evolve", + "columns": "evolve", + "data_type": "evolve" + }, { + "tables": "discard_row", + "columns": "discard_row", + "data_type": "discard_row" + }, { + "tables": "discard_value", + "columns": "discard_value", + "data_type": "discard_value" + }, { + "tables": "freeze", + "columns": "freeze", + "data_type": "freeze" + } +] + + +@pytest.mark.parametrize("base_settings", base_settings) +def test_check_adding_table(base_settings) -> None: + + schema = get_schema() + new_table = copy.deepcopy(schema.tables["tables"]) + new_table["name"] = "new_table" + + # + # check adding new table + # + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "evolve"}}), new_table) + assert (partial, filters) == (new_table, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_row"}}), new_table) + assert (partial, filters) == (None, [("tables", "new_table", "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "discard_value"}}), new_table) + assert (partial, filters) == (None, [("tables", "new_table", "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": "freeze"}}), new_table, raise_on_freeze=False) + assert (partial, filters) == (None, [("tables", "new_table", "freeze")]) + + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"tables": 
"freeze"}}), new_table, data_item={"item": 1}) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == "new_table" + assert val_ex.value.column_name is None + assert val_ex.value.contract_entity == "tables" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is None # there's no validating schema on new table + assert val_ex.value.data_item == {"item": 1} + + +@pytest.mark.parametrize("base_settings", base_settings) +def test_check_adding_new_columns(base_settings) -> None: + schema = get_schema() + + + def assert_new_column(table_update: TTableSchema, column_name: str) -> None: + popped_table_update = copy.deepcopy(table_update) + popped_table_update["columns"].pop(column_name) + + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "evolve"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_row"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "discard_value"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + assert (partial, filters) == (popped_table_update, [("columns", column_name, "freeze")]) + + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**base_settings, **{"columns": "freeze"}}), copy.deepcopy(table_update), {column_name: 1}) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == table_update["name"] + assert val_ex.value.column_name == column_name + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema == schema.get_table(table_update["name"]) + assert val_ex.value.data_item == {column_name: 1} + + # + # check adding new column + # + table_update: TTableSchema = { + "name": "tables", + "columns": { + "new_column": { + "name": "new_column", + "data_type": "text" + } + } + } + assert_new_column(table_update, "new_column") + + # + # check adding new column if target column is not complete + # + table_update = { + "name": "mixed_table", + "columns": { + "incomplete_column_1": { + "name": "incomplete_column_1", + } + } + } + assert_new_column(table_update, "incomplete_column_1") + + # + # check x-normalize evolve_once behaving as evolve override + # + table_update = { + "name": "evolve_once_table", + "columns": { + "new_column": { + "name": "new_column", + "data_type": "text" + }, + "incomplete_column_1": { + "name": "incomplete_column_1", + } + } + } + partial, filters = schema.apply_schema_contract(base_settings, copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + + +def test_check_adding_new_variant() -> None: + schema = get_schema() + + # + # check adding new variant column + # + table_update: TTableSchema = { + "name": "tables", + "columns": { + "column_2_variant": { + "name": "column_2_variant", + "data_type": "bigint", + "variant": True + } + } + } + popped_table_update = 
copy.deepcopy(table_update) + popped_table_update["columns"].pop("column_2_variant") + + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_row"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_row")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "discard_value"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "discard_value")]) + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update), raise_on_freeze=False) + assert (partial, filters) == (popped_table_update, [("columns", "column_2_variant", "freeze")]) + + with pytest.raises(DataValidationError) as val_ex: + schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) + assert val_ex.value.schema_name == schema.name + assert val_ex.value.table_name == table_update["name"] + assert val_ex.value.column_name == "column_2_variant" + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema == schema.get_table(table_update["name"]) + assert val_ex.value.data_item is None # we do not pass it to apply_schema_contract + + # variants are not new columns - new data types + partial, filters = schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "evolve", "columns": "freeze"}}), copy.deepcopy(table_update)) + assert (partial, filters) == (table_update, []) + + # evolve once does not apply to variant evolution + table_update["name"] = "evolve_once_table" + with pytest.raises(DataValidationError): + schema.apply_schema_contract(cast(TSchemaContractDict, {**DEFAULT_SCHEMA_CONTRACT_MODE, **{"data_type": "freeze"}}), copy.deepcopy(table_update)) diff --git a/tests/common/schema/test_versioning.py b/tests/common/schema/test_versioning.py index 7d0074e934..a971c8c93f 100644 --- a/tests/common/schema/test_versioning.py +++ b/tests/common/schema/test_versioning.py @@ -84,10 +84,10 @@ def test_infer_column_bumps_version() -> None: def test_preserve_version_on_load() -> None: - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") - version = eth_v6["version"] - version_hash = eth_v6["version_hash"] - schema = Schema.from_dict(eth_v6) # type: ignore[arg-type] + eth_v7: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") + version = eth_v7["version"] + version_hash = eth_v7["version_hash"] + schema = Schema.from_dict(eth_v7) # type: ignore[arg-type] # version should not be bumped assert version_hash == schema._stored_version_hash assert version_hash == schema.version_hash @@ -96,8 +96,8 @@ def test_preserve_version_on_load() -> None: @pytest.mark.parametrize("remove_defaults", [True, False]) def test_version_preserve_on_reload(remove_defaults: bool) -> None: - eth_v6: TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") - schema = Schema.from_dict(eth_v6) # type: ignore[arg-type] + eth_v7: 
TStoredSchema = load_yml_case("schemas/eth/ethereum_schema_v7") + schema = Schema.from_dict(eth_v7) # type: ignore[arg-type] to_save_dict = schema.to_dict(remove_defaults=remove_defaults) assert schema.stored_version == to_save_dict["version"] diff --git a/tests/common/storages/test_loader_storage.py b/tests/common/storages/test_loader_storage.py index 4f2d0193fe..1acfeb873b 100644 --- a/tests/common/storages/test_loader_storage.py +++ b/tests/common/storages/test_loader_storage.py @@ -26,12 +26,12 @@ def test_complete_successful_package(storage: LoadStorage) -> None: # should delete package in full storage.config.delete_completed_jobs = True load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - assert storage.storage.has_folder(storage.get_package_path(load_id)) + assert storage.storage.has_folder(storage.get_normalized_package_path(load_id)) storage.complete_job(load_id, file_name) assert_package_info(storage, load_id, "normalized", "completed_jobs") storage.complete_load_package(load_id, False) # deleted from loading - assert not storage.storage.has_folder(storage.get_package_path(load_id)) + assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # has package assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) assert storage.storage.has_file(os.path.join(storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME)) @@ -47,7 +47,7 @@ def test_complete_successful_package(storage: LoadStorage) -> None: storage.complete_job(load_id, file_name) storage.complete_load_package(load_id, False) # deleted from loading - assert not storage.storage.has_folder(storage.get_package_path(load_id)) + assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # has load preserved assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) assert storage.storage.has_file(os.path.join(storage.get_completed_package_path(load_id), LoadStorage.PACKAGE_COMPLETED_FILE_NAME)) @@ -59,22 +59,45 @@ def test_complete_successful_package(storage: LoadStorage) -> None: def test_wipe_normalized_packages(storage: LoadStorage) -> None: load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - storage.wipe_normalized_packages() - assert not storage.storage.has_folder(storage.NORMALIZED_FOLDER) +def test_is_partially_loaded(storage: LoadStorage) -> None: + load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}], start_job=False) + info = storage.get_load_package_info(load_id) + # all jobs are new + assert LoadStorage.is_package_partially_loaded(info) is False + # start job + storage.start_job(load_id, file_name) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is True + # complete job + storage.complete_job(load_id, file_name) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is True + # must complete package + storage.complete_load_package(load_id, False) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is False + + # abort package + load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) + storage.complete_load_package(load_id, True) + info = storage.get_load_package_info(load_id) + assert LoadStorage.is_package_partially_loaded(info) is True + + def test_complete_package_failed_jobs(storage: LoadStorage) -> None: # 
loads with failed jobs are always persisted storage.config.delete_completed_jobs = True load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - assert storage.storage.has_folder(storage.get_package_path(load_id)) + assert storage.storage.has_folder(storage.get_normalized_package_path(load_id)) storage.fail_job(load_id, file_name, "EXCEPTION") assert_package_info(storage, load_id, "normalized", "failed_jobs") storage.complete_load_package(load_id, False) # deleted from loading - assert not storage.storage.has_folder(storage.get_package_path(load_id)) + assert not storage.storage.has_folder(storage.get_normalized_package_path(load_id)) # present in completed loads folder assert storage.storage.has_folder(storage.get_completed_package_path(load_id)) # has completed loads @@ -105,7 +128,7 @@ def test_abort_package(storage: LoadStorage) -> None: # loads with failed jobs are always persisted storage.config.delete_completed_jobs = True load_id, file_name = start_loading_file(storage, [{"content": "a"}, {"content": "b"}]) - assert storage.storage.has_folder(storage.get_package_path(load_id)) + assert storage.storage.has_folder(storage.get_normalized_package_path(load_id)) storage.fail_job(load_id, file_name, "EXCEPTION") assert_package_info(storage, load_id, "normalized", "failed_jobs") storage.complete_load_package(load_id, True) @@ -195,7 +218,7 @@ def test_process_schema_update(storage: LoadStorage) -> None: storage.commit_schema_update(load_id, applied_update) assert storage.begin_schema_update(load_id) is None # processed file exists - applied_update_path = os.path.join(storage.get_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) + applied_update_path = os.path.join(storage.get_normalized_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME) assert storage.storage.has_file(applied_update_path) is True assert json.loads(storage.storage.load(applied_update_path)) == applied_update # verify info package @@ -237,7 +260,7 @@ def test_unknown_migration_path() -> None: LoadStorage(False, "jsonl", LoadStorage.ALL_SUPPORTED_FILE_FORMATS) -def start_loading_file(s: LoadStorage, content: Sequence[StrAny]) -> Tuple[str, str]: +def start_loading_file(s: LoadStorage, content: Sequence[StrAny], start_job: bool = True) -> Tuple[str, str]: load_id = uniq_id() s.create_temp_load_package(load_id) # write test file @@ -247,8 +270,9 @@ def start_loading_file(s: LoadStorage, content: Sequence[StrAny]) -> Tuple[str, s.save_temp_schema_updates(load_id, {}) s.commit_temp_load_package(load_id) assert_package_info(s, load_id, "normalized", "new_jobs") - s.start_job(load_id, file_name) - assert_package_info(s, load_id, "normalized", "started_jobs") + if start_job: + s.start_job(load_id, file_name) + assert_package_info(s, load_id, "normalized", "started_jobs") return load_id, file_name diff --git a/tests/common/storages/utils.py b/tests/common/storages/utils.py index 6900c6fdcf..a4296279bf 100644 --- a/tests/common/storages/utils.py +++ b/tests/common/storages/utils.py @@ -1,7 +1,5 @@ from typing import List from fsspec import AbstractFileSystem -import pandas -from pyarrow import parquet from dlt.common import pendulum from dlt.common.storages import FilesystemConfiguration @@ -29,13 +27,16 @@ def assert_sample_files(all_file_items: List[FileItem], filesystem: AbstractFile assert content == f.read() # read via various readers if item["mime_type"] == "text/csv": - with file_dict.open() as f: - df = pandas.read_csv(f, header="infer") - assert 
len(df.to_dict(orient="records")) > 0 + # parse csv + with file_dict.open(mode="rt") as f: + from csv import DictReader + elements = list(DictReader(f)) + assert len(elements) > 0 if item["mime_type"] == "application/parquet": + # verify it is a real parquet with file_dict.open() as f: - table = parquet.ParquetFile(f).read() - assert len(table.to_pylist()) + parquet: bytes = f.read() + assert parquet.startswith(b"PAR1") if item["mime_type"].startswith("text"): with file_dict.open(mode="rt") as f_txt: lines = f_txt.readlines() diff --git a/tests/common/test_destination.py b/tests/common/test_destination.py index 7afa10ed68..5483a95f45 100644 --- a/tests/common/test_destination.py +++ b/tests/common/test_destination.py @@ -1,6 +1,7 @@ import pytest -from dlt.common.destination.reference import DestinationClientDwhConfiguration, DestinationReference +from dlt.common.destination.reference import DestinationClientDwhConfiguration, Destination +from dlt.common.destination import DestinationCapabilitiesContext from dlt.common.exceptions import InvalidDestinationReference, UnknownDestinationModule from dlt.common.schema import Schema from dlt.common.schema.exceptions import InvalidDatasetName @@ -11,24 +12,24 @@ def test_import_unknown_destination() -> None: # standard destination with pytest.raises(UnknownDestinationModule): - DestinationReference.from_name("meltdb") + Destination.from_reference("meltdb") # custom module with pytest.raises(UnknownDestinationModule): - DestinationReference.from_name("melt.db") + Destination.from_reference("melt.db") def test_invalid_destination_reference() -> None: with pytest.raises(InvalidDestinationReference): - DestinationReference.from_name("tests.load.cases.fake_destination") + Destination.from_reference("tests.load.cases.fake_destination.not_a_destination") def test_import_all_destinations() -> None: # this must pass without the client dependencies being imported - for module in ACTIVE_DESTINATIONS: - dest = DestinationReference.from_name(module) - assert dest.__name__ == "dlt.destinations." 
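For readers following the `DestinationReference.from_name` to `Destination.from_reference` rename exercised just above, here is a minimal usage sketch; it assumes the duckdb extra is installed so that "duckdb" resolves as an active destination, and it is an illustration, not part of the diff:

from dlt.common.destination import DestinationCapabilitiesContext
from dlt.common.destination.reference import Destination

# resolve a destination factory by its short name ("duckdb" is an assumed example)
dest = Destination.from_reference("duckdb")
assert dest.name == "duckdb"
# capabilities are returned as a DestinationCapabilitiesContext, as the test asserts
assert isinstance(dest.capabilities(), DestinationCapabilitiesContext)
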
+ module + for dest_name in ACTIVE_DESTINATIONS: + dest = Destination.from_reference(dest_name) + assert dest.name == dest_name dest.spec() - dest.capabilities() + assert isinstance(dest.capabilities(), DestinationCapabilitiesContext) def test_normalize_dataset_name() -> None: diff --git a/tests/common/test_json.py b/tests/common/test_json.py index 983484d326..f6e9b06425 100644 --- a/tests/common/test_json.py +++ b/tests/common/test_json.py @@ -6,7 +6,7 @@ from dlt.common import json, Decimal, pendulum from dlt.common.arithmetics import numeric_default_context -from dlt.common.json import _DECIMAL, _WEI, custom_pua_decode, _orjson, _simplejson, SupportsJson, _DATETIME +from dlt.common.json import _DECIMAL, _WEI, custom_pua_decode, may_have_pua, _orjson, _simplejson, SupportsJson, _DATETIME from tests.utils import autouse_test_storage, TEST_STORAGE_ROOT from tests.cases import JSON_TYPED_DICT, JSON_TYPED_DICT_DECODED, JSON_TYPED_DICT_NESTED, JSON_TYPED_DICT_NESTED_DECODED @@ -250,6 +250,18 @@ def test_json_typed_encode(json_impl: SupportsJson) -> None: assert d_d == JSON_TYPED_DICT_DECODED +@pytest.mark.parametrize("json_impl", _JSON_IMPL) +def test_pua_detection(json_impl: SupportsJson) -> None: + with io.BytesIO() as b: + json_impl.typed_dump(JSON_TYPED_DICT, b) + content_b = b.getvalue() + assert may_have_pua(content_b) + with open(json_case_path("rasa_event_bot_metadata"), "rb") as f: + content_b = f.read() + assert not may_have_pua(content_b) + + + def test_load_and_compare_all_impls() -> None: with open(json_case_path("rasa_event_bot_metadata"), "rb") as f: content_b = f.read() diff --git a/tests/common/test_pydantic.py b/tests/common/test_pydantic.py deleted file mode 100644 index 770fcce6e5..0000000000 --- a/tests/common/test_pydantic.py +++ /dev/null @@ -1,134 +0,0 @@ -import pytest -from typing import Union, Optional, List, Dict, Any -from enum import Enum - -from datetime import datetime, date, time # noqa: I251 -from dlt.common import Decimal -from dlt.common import json - -from pydantic import BaseModel, Json, AnyHttpUrl -from dlt.common.libs.pydantic import pydantic_to_table_schema_columns - - -class StrEnum(str, Enum): - a = "a_value" - b = "b_value" - c = "c_value" - - -class IntEnum(int, Enum): - a = 0 - b = 1 - c = 2 - - -class MixedEnum(Enum): - a_int = 0 - b_str = "b_value" - c_int = 2 - - -class NestedModel(BaseModel): - nested_field: str - - -class Model(BaseModel): - bigint_field: int - text_field: str - timestamp_field: datetime - date_field: date - decimal_field: Decimal - double_field: float - time_field: time - - nested_field: NestedModel - list_field: List[str] - - union_field: Union[int, str] - - optional_field: Optional[float] - - blank_dict_field: dict # type: ignore[type-arg] - parametrized_dict_field: Dict[str, int] - - str_enum_field: StrEnum - int_enum_field: IntEnum - # Both of these shouold coerce to str - mixed_enum_int_field: MixedEnum - mixed_enum_str_field: MixedEnum - - json_field: Json[List[str]] - - url_field: AnyHttpUrl - - any_field: Any - json_any_field: Json[Any] - - - -@pytest.mark.parametrize('instance', [True, False]) -def test_pydantic_model_to_columns(instance: bool) -> None: - if instance: - model = Model( - bigint_field=1, text_field="text", timestamp_field=datetime.now(), - date_field=date.today(), decimal_field=Decimal(1.1), double_field=1.1, - time_field=time(1, 2, 3, 12345), - nested_field=NestedModel(nested_field="nested"), - list_field=["a", "b", "c"], - union_field=1, - optional_field=None, - blank_dict_field={}, - 
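The deleted `tests/common/test_pydantic.py` above exercised `pydantic_to_table_schema_columns`; as a hedged sketch of what that helper does (assuming it remains importable from `dlt.common.libs.pydantic` after the test module is removed, and using a made-up `Issue` model):

from datetime import datetime
from pydantic import BaseModel
from dlt.common.libs.pydantic import pydantic_to_table_schema_columns

class Issue(BaseModel):
    id: int
    title: str
    created_at: datetime

columns = pydantic_to_table_schema_columns(Issue)
# per the removed assertions: int -> bigint, str -> text, datetime -> timestamp
assert columns["id"]["data_type"] == "bigint"
assert columns["title"]["data_type"] == "text"
assert columns["created_at"]["data_type"] == "timestamp"
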
parametrized_dict_field={"a": 1, "b": 2, "c": 3}, - str_enum_field=StrEnum.a, - int_enum_field=IntEnum.a, - mixed_enum_int_field=MixedEnum.a_int, - mixed_enum_str_field=MixedEnum.b_str, - json_field=json.dumps(["a", "b", "c"]), # type: ignore[arg-type] - url_field="https://example.com", # type: ignore[arg-type] - any_field="any_string", - json_any_field=json.dumps("any_string"), - ) - else: - model = Model # type: ignore[assignment] - - result = pydantic_to_table_schema_columns(model) - - assert result["bigint_field"]["data_type"] == "bigint" - assert result["text_field"]["data_type"] == "text" - assert result["timestamp_field"]["data_type"] == "timestamp" - assert result["date_field"]["data_type"] == "date" - assert result["decimal_field"]["data_type"] == "decimal" - assert result["double_field"]["data_type"] == "double" - assert result["time_field"]["data_type"] == "time" - assert result["nested_field"]["data_type"] == "complex" - assert result['list_field']['data_type'] == 'complex' - assert result['union_field']['data_type'] == 'bigint' - assert result['optional_field']['data_type'] == 'double' - assert result['optional_field']['nullable'] is True - assert result['blank_dict_field']['data_type'] == 'complex' - assert result['parametrized_dict_field']['data_type'] == 'complex' - assert result['str_enum_field']['data_type'] == 'text' - assert result['int_enum_field']['data_type'] == 'bigint' - assert result['mixed_enum_int_field']['data_type'] == 'text' - assert result['mixed_enum_str_field']['data_type'] == 'text' - assert result['json_field']['data_type'] == 'complex' - assert result['url_field']['data_type'] == 'text' - - # Any type fields are excluded from schema - assert 'any_field' not in result - assert 'json_any_field' not in result - - -def test_pydantic_model_skip_complex_types() -> None: - result = pydantic_to_table_schema_columns(Model, skip_complex_types=True) - - assert result["bigint_field"]["data_type"] == "bigint" - - assert "nested_field" not in result - assert "list_field" not in result - assert "blank_dict_field" not in result - assert "parametrized_dict_field" not in result - assert "json_field" not in result - assert result["bigint_field"]["data_type"] == "bigint" - assert result["text_field"]["data_type"] == "text" - assert result["timestamp_field"]["data_type"] == "timestamp" diff --git a/tests/common/test_typing.py b/tests/common/test_typing.py index 399ab284ea..41d3d8d274 100644 --- a/tests/common/test_typing.py +++ b/tests/common/test_typing.py @@ -3,7 +3,7 @@ from dlt.common.configuration.specs.base_configuration import BaseConfiguration, get_config_if_union_hint from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults -from dlt.common.typing import StrAny, extract_inner_type, extract_optional_type, is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, is_typeddict +from dlt.common.typing import StrAny, extract_inner_type, extract_union_types, is_dict_generic_type, is_list_generic_type, is_literal_type, is_newtype_type, is_optional_type, is_typeddict, is_union_type @@ -15,6 +15,8 @@ class TTestTyDi(TypedDict): TOptionalLi = Optional[TTestLi] TOptionalTyDi = Optional[TTestTyDi] +TOptionalUnionLiTyDi = Optional[Union[TTestTyDi, TTestLi]] + def test_is_typeddict() -> None: assert is_typeddict(TTestTyDi) is True @@ -28,6 +30,7 @@ def test_is_list_generic_type() -> None: assert is_list_generic_type(List[str]) is True assert is_list_generic_type(Sequence[str]) is True assert 
is_list_generic_type(MutableSequence[str]) is True + assert is_list_generic_type(TOptionalUnionLiTyDi) is False # type: ignore[arg-type] def test_is_dict_generic_type() -> None: @@ -46,8 +49,19 @@ def test_optional() -> None: assert is_optional_type(TOptionalLi) is True # type: ignore[arg-type] assert is_optional_type(TOptionalTyDi) is True # type: ignore[arg-type] assert is_optional_type(TTestTyDi) is False - assert extract_optional_type(TOptionalLi) is TTestLi # type: ignore[arg-type] - assert extract_optional_type(TOptionalTyDi) is TTestTyDi # type: ignore[arg-type] + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] # type: ignore[arg-type] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] # type: ignore[arg-type] + + +def test_union_types() -> None: + assert is_optional_type(TOptionalLi) is True # type: ignore[arg-type] + assert is_optional_type(TOptionalTyDi) is True # type: ignore[arg-type] + assert is_optional_type(TTestTyDi) is False + assert extract_union_types(TOptionalLi) == [TTestLi, type(None)] # type: ignore[arg-type] + assert extract_union_types(TOptionalTyDi) == [TTestTyDi, type(None)] # type: ignore[arg-type] + assert is_optional_type(TOptionalUnionLiTyDi) is True # type: ignore[arg-type] + assert extract_union_types(TOptionalUnionLiTyDi) == [TTestTyDi, TTestLi, type(None)] # type: ignore[arg-type] + assert is_union_type(MutableSequence[str]) is False def test_is_newtype() -> None: diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index bbda683717..d35adc8c7b 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -1,7 +1,7 @@ from copy import deepcopy import pytest import yaml -from typing import Dict, List, Literal, Mapping, Sequence, TypedDict, Optional +from typing import Dict, List, Literal, Mapping, Sequence, TypedDict, Optional, Union from dlt.common import json from dlt.common.exceptions import DictValidationException @@ -10,8 +10,12 @@ from dlt.common.typing import DictStrStr, StrStr from dlt.common.validation import validate_dict, validate_dict_ignoring_xkeys + + TLiteral = Literal["uno", "dos", "tres"] +class TDict(TypedDict): + field: TLiteral class TTestRecord(TypedDict): f_bool: bool @@ -31,6 +35,7 @@ class TTestRecord(TypedDict): f_literal: TLiteral f_literal_optional: Optional[TLiteral] f_seq_literal: Sequence[Optional[TLiteral]] + f_optional_union: Optional[Union[TLiteral, TDict]] TEST_COL: TColumnSchema = { @@ -74,7 +79,8 @@ class TTestRecord(TypedDict): "f_column": deepcopy(TEST_COL), "f_literal": "uno", "f_literal_optional": "dos", - "f_seq_literal": ["uno", "dos", "tres"] + "f_seq_literal": ["uno", "dos", "tres"], + "f_optional_union": {"field": "uno"} } @pytest.fixture @@ -227,3 +233,23 @@ def test_filter(test_doc: TTestRecord) -> None: test_doc["x-extra"] = "x-annotation" # type: ignore[typeddict-unknown-key] # remove x-extra with a filter validate_dict(TTestRecord, test_doc, ".", filter_f=lambda k: k != "x-extra") + + +def test_nested_union(test_doc: TTestRecord) -> None: + test_doc["f_optional_union"] = {"field": "uno"} + validate_dict(TTestRecord, TEST_DOC, ".") + + test_doc["f_optional_union"] = {"field": "not valid"} # type: ignore[typeddict-item] + with pytest.raises(DictValidationException) as e: + validate_dict(TTestRecord, test_doc, ".") + assert e.value.field == "f_optional_union" + assert e.value.value == {'field': 'not valid'} + + test_doc["f_optional_union"] = "dos" + validate_dict(TTestRecord, test_doc, ".") + + test_doc["f_optional_union"] = 
"blah" # type: ignore[typeddict-item] + with pytest.raises(DictValidationException) as e: + validate_dict(TTestRecord, test_doc, ".") + assert e.value.field == "f_optional_union" + assert e.value.value == "blah" \ No newline at end of file diff --git a/tests/common/utils.py b/tests/common/utils.py index 8e0d5351e6..d612dcbdcf 100644 --- a/tests/common/utils.py +++ b/tests/common/utils.py @@ -16,7 +16,7 @@ COMMON_TEST_CASES_PATH = "./tests/common/cases/" # for import schema tests, change when upgrading the schema version -IMPORTED_VERSION_HASH_ETH_V7 = "ZbDv9+tdJK7P/4QIB0qqHzqNSsVynVx90GL4giV8/p0=" +IMPORTED_VERSION_HASH_ETH_V7 = "yjMtV4Zv0IJlfR5DPMwuXxGg8BRhy7E79L26XAHWEGE=" # test sentry DSN TEST_SENTRY_DSN = "https://797678dd0af64b96937435326c7d30c1@o1061158.ingest.sentry.io/4504306172821504" # preserve secrets path to be able to restore it diff --git a/tests/conftest.py b/tests/conftest.py index 56760508da..8a14fa1550 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -59,7 +59,7 @@ def _create_pipeline_instance_id(self) -> str: Pipeline._create_pipeline_instance_id = _create_pipeline_instance_id # type: ignore[method-assign] # push sentry to ci - os.environ["RUNTIME__SENTRY_DSN"] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" + # os.environ["RUNTIME__SENTRY_DSN"] = "https://6f6f7b6f8e0f458a89be4187603b55fe@o1061158.ingest.sentry.io/4504819859914752" # disable sqlfluff logging for log in ["sqlfluff.parser", "sqlfluff.linter", "sqlfluff.templater", "sqlfluff.lexer"]: diff --git a/tests/extract/cases/eth_source/ethereum.schema.yaml b/tests/extract/cases/eth_source/ethereum.schema.yaml index b5d54f9c49..5a8db47163 100644 --- a/tests/extract/cases/eth_source/ethereum.schema.yaml +++ b/tests/extract/cases/eth_source/ethereum.schema.yaml @@ -1,328 +1,428 @@ -version: 11 -version_hash: GPHX4B+0xnRuGZM/w3UYVbldRyg8jSJp1G60K4RDcZg= -engine_version: 5 +version: 14 +version_hash: VuzNqiLOk7XuPxYLndMFMPHTDVItKU5ayiy70nQLdus= +engine_version: 7 name: ethereum tables: _dlt_loads: columns: load_id: - data_type: text nullable: false - schema_name: data_type: text + name: load_id + schema_name: nullable: true + data_type: text + name: schema_name status: - data_type: bigint nullable: false + data_type: bigint + name: status inserted_at: - data_type: timestamp nullable: false + data_type: timestamp + name: inserted_at + schema_version_hash: + nullable: true + data_type: text + name: schema_version_hash write_disposition: skip description: Created by DLT. Tracks completed loads + schema_contract: {} + name: _dlt_loads + resource: _dlt_loads _dlt_version: columns: version: - data_type: bigint nullable: false - engine_version: data_type: bigint + name: version + engine_version: nullable: false + data_type: bigint + name: engine_version inserted_at: - data_type: timestamp nullable: false + data_type: timestamp + name: inserted_at schema_name: - data_type: text nullable: false - version_hash: data_type: text + name: schema_name + version_hash: nullable: false - schema: data_type: text + name: version_hash + schema: nullable: false + data_type: text + name: schema write_disposition: skip description: Created by DLT. 
Tracks schema updates + schema_contract: {} + name: _dlt_version + resource: _dlt_version blocks: description: Ethereum blocks x-annotation: this will be preserved on save write_disposition: append - table_sealed: true filters: includes: [] excludes: [] columns: _dlt_load_id: + nullable: false description: load id coming from the extractor data_type: text - nullable: false + name: _dlt_load_id _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id number: + nullable: false primary_key: true data_type: bigint - nullable: false + name: number parent_hash: - data_type: text nullable: true + data_type: text + name: parent_hash hash: + nullable: false cluster: true unique: true data_type: text - nullable: false + name: hash base_fee_per_gas: - data_type: wei nullable: false - difficulty: data_type: wei + name: base_fee_per_gas + difficulty: nullable: false + data_type: wei + name: difficulty extra_data: - data_type: text nullable: true + data_type: text + name: extra_data gas_limit: - data_type: bigint nullable: false - gas_used: data_type: bigint + name: gas_limit + gas_used: nullable: false + data_type: bigint + name: gas_used logs_bloom: - data_type: binary nullable: true + data_type: binary + name: logs_bloom miner: - data_type: text nullable: true - mix_hash: data_type: text + name: miner + mix_hash: nullable: true - nonce: data_type: text + name: mix_hash + nonce: nullable: true - receipts_root: data_type: text + name: nonce + receipts_root: nullable: true - sha3_uncles: data_type: text + name: receipts_root + sha3_uncles: nullable: true + data_type: text + name: sha3_uncles size: - data_type: bigint nullable: true + data_type: bigint + name: size state_root: - data_type: text nullable: false + data_type: text + name: state_root timestamp: + nullable: false unique: true sort: true data_type: timestamp - nullable: false + name: timestamp total_difficulty: - data_type: wei nullable: true + data_type: wei + name: total_difficulty transactions_root: - data_type: text nullable: false + data_type: text + name: transactions_root + schema_contract: {} + name: blocks + resource: blocks blocks__transactions: parent: blocks columns: _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id block_number: + nullable: false primary_key: true foreign_key: true data_type: bigint - nullable: false + name: block_number transaction_index: + nullable: false primary_key: true data_type: bigint - nullable: false + name: transaction_index hash: + nullable: false unique: true data_type: text - nullable: false + name: hash block_hash: + nullable: false cluster: true data_type: text - nullable: false + name: block_hash block_timestamp: + nullable: false sort: true data_type: timestamp - nullable: false + name: block_timestamp chain_id: - data_type: text nullable: true - from: data_type: text + name: chain_id + from: nullable: true + data_type: text + name: from gas: - data_type: bigint nullable: true - gas_price: data_type: bigint + name: gas + gas_price: nullable: true + data_type: bigint + name: gas_price input: - data_type: text nullable: true + data_type: text + name: input max_fee_per_gas: - data_type: wei nullable: true - max_priority_fee_per_gas: data_type: wei + name: max_fee_per_gas + max_priority_fee_per_gas: nullable: true + data_type: wei + name: max_priority_fee_per_gas nonce: - data_type: bigint nullable: true + data_type: bigint + name: nonce r: - data_type: text nullable: true - s: data_type: text + name: r + s: nullable: true + 
data_type: text + name: s status: - data_type: bigint nullable: true + data_type: bigint + name: status to: - data_type: text nullable: true - type: data_type: text + name: to + type: nullable: true + data_type: text + name: type v: - data_type: bigint nullable: true + data_type: bigint + name: v value: - data_type: wei nullable: false + data_type: wei + name: value eth_value: - data_type: decimal nullable: true + data_type: decimal + name: eth_value + name: blocks__transactions blocks__transactions__logs: parent: blocks__transactions columns: _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id address: - data_type: text nullable: false + data_type: text + name: address block_timestamp: + nullable: false sort: true data_type: timestamp - nullable: false + name: block_timestamp block_hash: + nullable: false cluster: true data_type: text - nullable: false + name: block_hash block_number: + nullable: false primary_key: true foreign_key: true data_type: bigint - nullable: false + name: block_number transaction_index: + nullable: false primary_key: true foreign_key: true data_type: bigint - nullable: false + name: transaction_index log_index: + nullable: false primary_key: true data_type: bigint - nullable: false + name: log_index data: - data_type: text nullable: true + data_type: text + name: data removed: - data_type: bool nullable: true + data_type: bool + name: removed transaction_hash: - data_type: text nullable: false + data_type: text + name: transaction_hash + name: blocks__transactions__logs blocks__transactions__logs__topics: parent: blocks__transactions__logs columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + data_type: text + name: value + name: blocks__transactions__logs__topics blocks__transactions__access_list: parent: blocks__transactions columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id address: - data_type: text nullable: true + data_type: text + name: address + name: blocks__transactions__access_list blocks__transactions__access_list__storage_keys: parent: blocks__transactions__access_list columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + data_type: text + name: value + name: blocks__transactions__access_list__storage_keys blocks__uncles: parent: blocks columns: _dlt_parent_id: + nullable: false foreign_key: true data_type: text - nullable: false + name: _dlt_parent_id _dlt_list_idx: - data_type: bigint nullable: 
false + data_type: bigint + name: _dlt_list_idx _dlt_id: + nullable: false unique: true data_type: text - nullable: false + name: _dlt_id _dlt_root_id: + nullable: false root_key: true data_type: text - nullable: false + name: _dlt_root_id value: - data_type: text nullable: true + data_type: text + name: value + name: blocks__uncles settings: - schema_sealed: true default_hints: foreign_key: - _dlt_parent_id @@ -342,6 +442,7 @@ settings: preferred_types: timestamp: timestamp block_timestamp: timestamp + schema_contract: {} normalizers: names: dlt.common.normalizers.names.snake_case json: diff --git a/tests/extract/test_decorators.py b/tests/extract/test_decorators.py index b8a6b80cfa..28f3d34dcf 100644 --- a/tests/extract/test_decorators.py +++ b/tests/extract/test_decorators.py @@ -17,12 +17,15 @@ from dlt.common.schema import Schema from dlt.common.schema.utils import new_table, new_column from dlt.common.schema.typing import TTableSchemaColumns +from dlt.common.schema.exceptions import InvalidSchemaName +from dlt.common.typing import TDataItem from dlt.cli.source_detection import detect_source_configs -from dlt.common.typing import TDataItem -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, DynamicNameNotStandaloneResource, ExplicitSourceNameInvalid, InconsistentTableTemplate, InvalidResourceDataTypeFunctionNotAGenerator, InvalidResourceDataTypeIsNone, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, PipeGenInvalid, PipeNotBoundToData, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable -from dlt.extract.source import DltResource, DltSource -from dlt.common.schema.exceptions import InvalidSchemaName +from dlt.extract import DltResource, DltSource +from dlt.extract.exceptions import (DynamicNameNotStandaloneResource, InvalidResourceDataTypeFunctionNotAGenerator, + InvalidResourceDataTypeIsNone, InvalidResourceDataTypeMultiplePipes, ParametrizedResourceUnbound, + PipeGenInvalid, PipeNotBoundToData, ResourceFunctionExpected, ResourceInnerCallableConfigWrapDisallowed, + SourceDataIsNone, SourceIsAClassTypeError, SourceNotAFunction, SourceSchemaNotAvailable) from dlt.extract.typing import TableNameMeta from tests.common.utils import IMPORTED_VERSION_HASH_ETH_V7 diff --git a/tests/extract/test_extract.py b/tests/extract/test_extract.py index 1557a64315..8259483088 100644 --- a/tests/extract/test_extract.py +++ b/tests/extract/test_extract.py @@ -1,8 +1,9 @@ import dlt from dlt.common import json from dlt.common.storages import NormalizeStorageConfiguration + +from dlt.extract import DltResource, DltSource from dlt.extract.extract import ExtractorStorage, extract -from dlt.extract.source import DltResource, DltSource from tests.utils import clean_test_storage from tests.extract.utils import expect_extracted_file @@ -18,13 +19,11 @@ def expect_tables(resource: DltResource) -> dlt.Schema: storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - # odd and even tables - assert len(schema_update) == 2 - assert "odd_table" in schema_update - assert "even_table" in schema_update - for partials in schema_update.values(): - assert len(partials) == 1 + extract(extract_id, source, storage) + # odd and even tables must be in the source schema + assert len(source.schema.data_tables(include_incomplete=True)) == 2 + assert "odd_table" in 
source.schema._schema_tables + assert "even_table" in source.schema._schema_tables # you must commit the files assert len(storage.list_files_to_normalize_sorted()) == 0 storage.commit_extract_files(extract_id) @@ -42,11 +41,9 @@ def expect_tables(resource: DltResource) -> dlt.Schema: source = source.with_resources(resource.name) source.selected_resources[resource.name].bind(10).select_tables("odd_table") extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - assert len(schema_update) == 1 - assert "odd_table" in schema_update - for partials in schema_update.values(): - assert len(partials) == 1 + extract(extract_id, source, storage) + assert len(source.schema.data_tables(include_incomplete=True)) == 1 + assert "odd_table" in source.schema._schema_tables storage.commit_extract_files(extract_id) assert len(storage.list_files_to_normalize_sorted()) == 1 expect_extracted_file(storage, "selectables", "odd_table", json.dumps([1,3,5,7,9])) @@ -86,10 +83,10 @@ def input_gen(): source = DltSource("module", dlt.Schema("selectables"), [input_r, input_r.with_name("gen_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) + extract(extract_id, source, storage) # both tables got generated - assert "input_gen" in schema_update - assert "gen_clone" in schema_update + assert "input_gen" in source.schema._schema_tables + assert "gen_clone" in source.schema._schema_tables def test_extract_renamed_clone_and_parent(): @@ -105,8 +102,8 @@ def tx_step(item): source = DltSource("module", dlt.Schema("selectables"), [input_r, (input_r | input_tx).with_name("tx_clone")]) storage = ExtractorStorage(NormalizeStorageConfiguration()) extract_id = storage.create_extract_id() - schema_update = extract(extract_id, source, storage) - assert "input_gen" in schema_update - assert "tx_clone" in schema_update + extract(extract_id, source, storage) + assert "input_gen" in source.schema._schema_tables + assert "tx_clone" in source.schema._schema_tables # mind that pipe name of the evaluated parent will have different name than the resource assert source.tx_clone._pipe.parent.name == "input_gen_tx_clone" diff --git a/tests/extract/test_incremental.py b/tests/extract/test_incremental.py index fb72eee6f4..595c67f7c6 100644 --- a/tests/extract/test_incremental.py +++ b/tests/extract/test_incremental.py @@ -17,16 +17,17 @@ from dlt.common.utils import uniq_id, digest128, chunks from dlt.common.json import json -from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.sources.helpers.transform import take_first -from dlt.extract.incremental import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing +from dlt.extract.incremental.exceptions import IncrementalCursorPathMissing, IncrementalPrimaryKeyMissing from dlt.pipeline.exceptions import PipelineStepFailed -from tests.extract.utils import AssertItems, data_to_item_format, TItemFormat, ALL_ITEM_FORMATS, data_item_to_list +from tests.extract.utils import AssertItems, data_item_to_list +from tests.utils import data_to_item_format, TDataItemFormat, ALL_DATA_ITEM_FORMATS -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_single_items_last_value_state_is_updated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_single_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: data = [ {'created_at': 425}, {'created_at': 
426}, @@ -42,8 +43,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 426 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_single_items_last_value_state_is_updated_transformer(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_single_items_last_value_state_is_updated_transformer(item_type: TDataItemFormat) -> None: data = [ {'created_at': 425}, {'created_at': 426}, @@ -61,8 +62,8 @@ def some_data(item, created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 426 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_batch_items_last_value_state_is_updated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_batch_items_last_value_state_is_updated(item_type: TDataItemFormat) -> None: data1 = [{'created_at': i} for i in range(5)] data2 = [{'created_at': i} for i in range(5, 10)] @@ -81,8 +82,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 9 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_last_value_access_in_resource(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_last_value_access_in_resource(item_type: TDataItemFormat) -> None: values = [] data = [{'created_at': i} for i in range(6)] @@ -100,8 +101,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert values == [None, 5] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_keys_are_deduplicated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_keys_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ {'created_at': 1, 'id': 'a'}, {'created_at': 2, 'id': 'b'}, @@ -127,9 +128,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items2 p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - - p.run(some_data()) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: @@ -138,8 +138,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert rows == [(1, 'a'), (2, 'b'), (3, 'c'), (3, 'd'), (3, 'e'), (3, 'f'), (4, 'g')] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_rows_by_hash_are_deduplicated(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_rows_by_hash_are_deduplicated(item_type: TDataItemFormat) -> None: data1 = [ {'created_at': 1, 'id': 'a'}, {'created_at': 2, 'id': 'b'}, @@ -166,8 +166,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items2 p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - p.run(some_data()) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, id FROM some_data order by created_at, id") as cur: @@ -189,7 +189,7 @@ def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): @pytest.mark.parametrize("item_type", ["arrow", "pandas"]) -def test_nested_cursor_path_arrow_fails(item_type: TItemFormat) -> 
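The incremental tests in this file all follow the same shape; a condensed sketch of the pattern they exercise, built only from calls shown in the diff (pipeline name and sample rows are invented for illustration):

import duckdb
import dlt

@dlt.resource(primary_key="id")
def some_data(created_at=dlt.sources.incremental("created_at")):
    yield from [
        {"created_at": 1, "id": "a"},
        {"created_at": 2, "id": "b"},
        {"created_at": 3, "id": "c"},
    ]

p = dlt.pipeline(pipeline_name="incremental_demo", destination="duckdb", credentials=duckdb.connect(":memory:"))
p.run(some_data()).raise_on_failed_jobs()
# rerunning with the same data loads nothing new: rows below the stored last_value
# are filtered out and rows equal to it are deduplicated against the saved hashes
p.run(some_data()).raise_on_failed_jobs()
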
None: +def test_nested_cursor_path_arrow_fails(item_type: TDataItemFormat) -> None: data = [ {'data': {'items': [{'created_at': 2}]}} ] @@ -208,8 +208,8 @@ def some_data(created_at=dlt.sources.incremental('data.items[0].created_at')): assert ex.exception.json_path == "data.items[0].created_at" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_explicit_initial_value(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_explicit_initial_value(item_type: TDataItemFormat) -> None: @dlt.resource def some_data(created_at=dlt.sources.incremental('created_at')): data = [{"created_at": created_at.last_value}] @@ -222,8 +222,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert s['last_value'] == 4242 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_explicit_incremental_instance(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_explicit_incremental_instance(item_type: TDataItemFormat) -> None: data = [{'inserted_at': 242, 'some_uq': 444}] source_items = data_to_item_format(item_type, data) @@ -238,7 +238,7 @@ def some_data(incremental=dlt.sources.incremental('created_at', initial_value=0) @dlt.resource -def some_data_from_config(call_no: int, item_type: TItemFormat, created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value): +def some_data_from_config(call_no: int, item_type: TDataItemFormat, created_at: Optional[dlt.sources.incremental[str]] = dlt.secrets.value): assert created_at.cursor_path == 'created_at' # start value will update to the last_value on next call if call_no == 1: @@ -252,8 +252,8 @@ def some_data_from_config(call_no: int, item_type: TItemFormat, created_at: Opti yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_incremental_from_config(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_incremental_from_config(item_type: TDataItemFormat) -> None: os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__CURSOR_PATH'] = 'created_at' os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_FROM_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2022-02-03T00:00:00Z' @@ -263,8 +263,8 @@ def test_optional_incremental_from_config(item_type: TItemFormat) -> None: p.extract(some_data_from_config(2, item_type)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_incremental_not_passed(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_optional_incremental_not_passed(item_type: TDataItemFormat) -> None: """Resource still runs when no incremental is passed""" data = [1,2,3] source_items = data_to_item_format(item_type, data) @@ -283,15 +283,15 @@ class OptionalIncrementalConfig(BaseConfiguration): @dlt.resource(spec=OptionalIncrementalConfig) -def optional_incremental_arg_resource(item_type: TItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None) -> Any: +def optional_incremental_arg_resource(item_type: TDataItemFormat, incremental: Optional[dlt.sources.incremental[Any]] = None) -> Any: data = [1,2,3] source_items = data_to_item_format(item_type, data) assert incremental is None yield source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_optional_arg_from_spec_not_passed(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def 
test_optional_arg_from_spec_not_passed(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id()) p.extract(optional_incremental_arg_resource(item_type)) @@ -303,7 +303,7 @@ class SomeDataOverrideConfiguration(BaseConfiguration): # provide what to inject via spec. the spec contain the default @dlt.resource(spec=SomeDataOverrideConfiguration) -def some_data_override_config(item_type: TItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value): +def some_data_override_config(item_type: TDataItemFormat, created_at: dlt.sources.incremental[str] = dlt.config.value): assert created_at.cursor_path == 'created_at' assert created_at.initial_value == '2000-02-03T00:00:00Z' data = [{'created_at': '2023-03-03T00:00:00Z'}] @@ -311,8 +311,8 @@ def some_data_override_config(item_type: TItemFormat, created_at: dlt.sources.in yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_override_initial_value_from_config(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_override_initial_value_from_config(item_type: TDataItemFormat) -> None: # use the shortest possible config version # os.environ['SOURCES__TEST_INCREMENTAL__SOME_DATA_OVERRIDE_CONFIG__CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' os.environ['CREATED_AT__INITIAL_VALUE'] = '2000-02-03T00:00:00Z' @@ -321,8 +321,8 @@ def test_override_initial_value_from_config(item_type: TItemFormat) -> None: p.extract(some_data_override_config(item_type)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_override_primary_key_in_pipeline(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_override_primary_key_in_pipeline(item_type: TDataItemFormat) -> None: """Primary key hint passed to pipeline is propagated through apply_hints """ data = [ @@ -342,8 +342,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): p.extract(some_data, primary_key=['id', 'other_id']) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_composite_primary_key(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_composite_primary_key(item_type: TDataItemFormat) -> None: data = [ {'created_at': 1, 'isrc': 'AAA', 'market': 'DE'}, {'created_at': 2, 'isrc': 'BBB', 'market': 'DE'}, @@ -360,7 +360,7 @@ def some_data(created_at=dlt.sources.incremental('created_at')): yield from source_items p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:')) - p.run(some_data()) + p.run(some_data()).raise_on_failed_jobs() with p.sql_client() as c: with c.execute_query("SELECT created_at, isrc, market FROM some_data order by created_at, isrc, market") as cur: @@ -370,8 +370,8 @@ def some_data(created_at=dlt.sources.incremental('created_at')): assert set(rows) == expected -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_last_value_func_min(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_last_value_func_min(item_type: TDataItemFormat) -> None: data = [ {'created_at': 10}, {'created_at': 11}, @@ -410,8 +410,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', last_value_func=l assert s['last_value'] == 11 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_cursor_datetime_type(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def 
test_cursor_datetime_type(item_type: TDataItemFormat) -> None: initial_value = pendulum.now() data = [ {'created_at': initial_value + timedelta(minutes=1)}, @@ -434,8 +434,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', initial_value)): assert s['last_value'] == initial_value + timedelta(minutes=4) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_descending_order_unique_hashes(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_descending_order_unique_hashes(item_type: TDataItemFormat) -> None: """Resource returns items in descending order but using `max` last value function. Only hash matching last_value are stored. """ @@ -459,8 +459,8 @@ def some_data(created_at=dlt.sources.incremental('created_at', 20)): assert list(some_data()) == [] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_unique_keys_json_identifiers(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_unique_keys_json_identifiers(item_type: TDataItemFormat) -> None: """Uses primary key name that is matching the name of the JSON element in the original namespace but gets converted into destination namespace""" @dlt.resource(primary_key="DelTa") @@ -492,8 +492,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): assert rows2[-1][0] == 9 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_missing_primary_key(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_missing_primary_key(item_type: TDataItemFormat) -> None: @dlt.resource(primary_key="DELTA") def some_data(last_timestamp=dlt.sources.incremental("ts")): @@ -506,8 +506,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts")): assert py_ex.value.primary_key_column == "DELTA" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_missing_cursor_field(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_missing_cursor_field(item_type: TDataItemFormat) -> None: os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately @dlt.resource def some_data(last_timestamp=dlt.sources.incremental("item.timestamp")): @@ -566,12 +566,12 @@ def some_data(last_timestamp: dlt.sources.incremental[float] = dlt.sources.incre assert list(some_data(last_timestamp=None)) == [1] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_filter_processed_items(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_filter_processed_items(item_type: TDataItemFormat) -> None: """Checks if already processed items are filtered out""" @dlt.resource - def standalone_some_data(item_type: TItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): + def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] source_items = data_to_item_format(item_type, data) yield from source_items @@ -628,8 +628,8 @@ def some_data(step, last_timestamp=dlt.sources.incremental("ts")): p.run(r, destination="duckdb") -@pytest.mark.parametrize("item_type", set(ALL_ITEM_FORMATS) - {'json'}) -def test_start_value_set_to_last_value_arrow(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", set(ALL_DATA_ITEM_FORMATS) - {'json'}) +def 
test_start_value_set_to_last_value_arrow(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb') now = pendulum.now() @@ -655,13 +655,13 @@ def some_data(first: bool, last_timestamp=dlt.sources.incremental("ts")): p.run(some_data(False)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_replace_resets_state(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_replace_resets_state(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id(), destination="duckdb") now = pendulum.now() @dlt.resource - def standalone_some_data(item_type: TItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): + def standalone_some_data(item_type: TDataItemFormat, now=None, last_timestamp=dlt.sources.incremental("timestamp")): data = [{"delta": i, "timestamp": (now or pendulum.now()).add(days=i).timestamp()} for i in range(-10, 10)] source_items = data_to_item_format(item_type, data) yield from source_items @@ -688,11 +688,13 @@ def child(item): info = p.run(child, write_disposition="replace") # print(info.load_packages[0]) assert len(info.loads_ids) == 1 - # pipeline applied hints to the child resource - assert child.write_disposition == "replace" + # pipeline applied hints to the child resource but it was placed into source first + # so the original is still "append" + assert child.write_disposition == "append" # create a source where we place only child - s = DltSource("section", Schema("comp"), [child]) + child.write_disposition = "replace" + s = DltSource("comp", "section", Schema("comp"), [child]) # but extracted resources will include its parent where it derives write disposition from child extracted = s.resources.extracted assert extracted[child.name].write_disposition == "replace" @@ -726,8 +728,8 @@ def child(item): assert extracted[child._pipe.parent.name].write_disposition == "append" -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_incremental_as_transform(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_incremental_as_transform(item_type: TDataItemFormat) -> None: now = pendulum.now().timestamp() @@ -749,8 +751,8 @@ def some_data(): assert len(info.loads_ids) == 1 -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_incremental_explicit_disable_unique_check(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_incremental_explicit_disable_unique_check(item_type: TDataItemFormat) -> None: @dlt.resource(primary_key="delta") def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): data = [{"delta": i, "ts": pendulum.now().timestamp()} for i in range(-10, 10)] @@ -764,8 +766,8 @@ def some_data(last_timestamp=dlt.sources.incremental("ts", primary_key=())): assert s.state["incremental"]["ts"]["unique_hashes"] == [] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_apply_hints_incremental(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_apply_hints_incremental(item_type: TDataItemFormat) -> None: p = dlt.pipeline(pipeline_name=uniq_id()) data = [{"created_at": 1}, {"created_at": 2}, {"created_at": 3}] @@ -879,7 +881,7 @@ def some_data(updated_at: dlt.sources.incremental[pendulum.DateTime] = dlt.sourc @dlt.resource def endless_sequence( - item_type: TItemFormat, + item_type: TDataItemFormat, updated_at: 
dlt.sources.incremental[int] = dlt.sources.incremental('updated_at', initial_value=1) ) -> Any: max_values = 20 @@ -889,8 +891,8 @@ def endless_sequence( yield from source_items -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_chunked_ranges(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_chunked_ranges(item_type: TDataItemFormat) -> None: """Load chunked ranges with end value along with incremental""" pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') @@ -933,8 +935,8 @@ def test_chunked_ranges(item_type: TItemFormat) -> None: assert items == expected_range -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_end_value_with_batches(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_end_value_with_batches(item_type: TDataItemFormat) -> None: """Ensure incremental with end_value works correctly when resource yields lists instead of single items""" @dlt.resource def batched_sequence( @@ -969,8 +971,8 @@ def batched_sequence( assert items == list(range(1, 14)) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_load_with_end_value_does_not_write_state(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_load_with_end_value_does_not_write_state(item_type: TDataItemFormat) -> None: """When loading chunk with initial/end value range. The resource state is untouched. """ pipeline = dlt.pipeline(pipeline_name='incremental_' + uniq_id(), destination='duckdb') @@ -980,8 +982,8 @@ def test_load_with_end_value_does_not_write_state(item_type: TItemFormat) -> Non assert pipeline.state.get('sources') is None -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_end_value_initial_value_errors(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_end_value_initial_value_errors(item_type: TDataItemFormat) -> None: @dlt.resource def some_data( updated_at: dlt.sources.incremental[int] = dlt.sources.incremental('updated_at') @@ -1016,8 +1018,8 @@ def custom_last_value(items): assert "The result of 'custom_last_value([end_value, initial_value])' must equal 'end_value'" in str(ex.value) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_out_of_range_flags(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_out_of_range_flags(item_type: TDataItemFormat) -> None: """Test incremental.start_out_of_range / end_out_of_range flags are set when items are filtered out""" @dlt.resource def descending( @@ -1085,8 +1087,8 @@ def ascending_single_item( pipeline.extract(ascending_single_item()) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_get_incremental_value_type(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_get_incremental_value_type(item_type: TDataItemFormat) -> None: assert dlt.sources.incremental("id").get_incremental_value_type() is Any assert dlt.sources.incremental("id", initial_value=0).get_incremental_value_type() is int assert dlt.sources.incremental("id", initial_value=None).get_incremental_value_type() is Any @@ -1146,8 +1148,8 @@ def test_type_5(updated_at = dlt.sources.incremental("updated_at", allow_externa assert r.incremental._incremental.get_incremental_value_type() is Any -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def 
test_join_env_scheduler(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_join_env_scheduler(item_type: TDataItemFormat) -> None: @dlt.resource def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): data = [{"updated_at": d} for d in [1, 2, 3]] @@ -1165,8 +1167,8 @@ def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.increment assert data_item_to_list(item_type, result) == [{'updated_at': 2}] -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_join_env_scheduler_pipeline(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_join_env_scheduler_pipeline(item_type: TDataItemFormat) -> None: @dlt.resource def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at", allow_external_schedulers=True)): data = [{"updated_at": d} for d in [1, 2, 3]] @@ -1194,8 +1196,8 @@ def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.increment pipeline.extract(r) -@pytest.mark.parametrize("item_type", ALL_ITEM_FORMATS) -def test_allow_external_schedulers(item_type: TItemFormat) -> None: +@pytest.mark.parametrize("item_type", ALL_DATA_ITEM_FORMATS) +def test_allow_external_schedulers(item_type: TDataItemFormat) -> None: @dlt.resource() def test_type_2(updated_at: dlt.sources.incremental[int] = dlt.sources.incremental("updated_at")): data = [{"updated_at": d} for d in [1, 2, 3]] diff --git a/tests/extract/test_sources.py b/tests/extract/test_sources.py index 965392f271..e9f7b4a8f1 100644 --- a/tests/extract/test_sources.py +++ b/tests/extract/test_sources.py @@ -9,10 +9,14 @@ from dlt.common.pipeline import StateInjectableContext, source_state from dlt.common.schema import Schema from dlt.common.typing import TDataItems -from dlt.extract.exceptions import DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, ParametrizedResourceUnbound, ResourcesNotFoundError + +from dlt.extract import DltResource, DltSource, Incremental +from dlt.extract.source import DltResourceDict +from dlt.extract.exceptions import (DataItemRequiredForDynamicTableHints, InconsistentTableTemplate, InvalidParentResourceDataType, + InvalidParentResourceIsAFunction, InvalidResourceDataTypeMultiplePipes, + InvalidTransformerDataTypeGeneratorFunctionRequired, InvalidTransformerGeneratorFunction, + ParametrizedResourceUnbound, ResourcesNotFoundError) from dlt.extract.pipe import Pipe -from dlt.extract.typing import FilterItem, MapItem -from dlt.extract.source import DltResource, DltResourceDict, DltSource def test_call_data_resource() -> None: @@ -1147,7 +1151,7 @@ def empty_gen(): empty_r = empty() # check defaults assert empty_r.name == empty.name == empty_r.table_name == empty.table_name == "empty_gen" - assert empty_r._table_schema_template is None + # assert empty_r._table_schema_template is None assert empty_r.compute_table_schema() == empty_table_schema assert empty_r.write_disposition == "append" @@ -1160,7 +1164,7 @@ def empty_gen(): empty_r.write_disposition = "append" assert empty_r.compute_table_schema()["write_disposition"] == "append" - empty_r.apply_hints(table_name="table", parent_table_name="parent", primary_key=["a", "b"], merge_key=["c", "a"]) + 
empty_r.apply_hints(table_name="table", parent_table_name="parent", primary_key=["a", "b"], merge_key=["c", "a"], schema_contract="freeze") table = empty_r.compute_table_schema() assert table["columns"]["a"] == {'merge_key': True, 'name': 'a', 'nullable': False, 'primary_key': True} assert table["columns"]["b"] == {'name': 'b', 'nullable': False, 'primary_key': True} @@ -1168,10 +1172,11 @@ def empty_gen(): assert table["name"] == "table" assert table["parent"] == "parent" assert empty_r.table_name == "table" + assert table["schema_contract"] == "freeze" # reset - empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}) - assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append'} + empty_r.apply_hints(table_name="", parent_table_name="", primary_key=[], merge_key="", columns={}, incremental=Incremental.EMPTY, schema_contract={}) + assert empty_r._table_schema_template == {'columns': {}, 'incremental': None, 'validator': None, 'write_disposition': 'append', 'original_columns': {}} table = empty_r.compute_table_schema() assert table["name"] == "empty_gen" assert "parent" not in table diff --git a/tests/extract/test_validation.py b/tests/extract/test_validation.py index 64e06bcecc..db39530567 100644 --- a/tests/extract/test_validation.py +++ b/tests/extract/test_validation.py @@ -1,15 +1,19 @@ """Tests for resource validation with pydantic schema """ import typing as t - import pytest + import dlt -from dlt.extract.typing import ValidateItem +from dlt.common import json +from dlt.common.schema.exceptions import DataValidationError from dlt.common.typing import TDataItems -from dlt.extract.validation import PydanticValidator -from dlt.extract.exceptions import ValidationError, ResourceExtractionError +from dlt.common.libs.pydantic import BaseModel -from pydantic import BaseModel +from dlt.extract import DltResource +from dlt.extract.typing import ValidateItem +from dlt.extract.validation import PydanticValidator +from dlt.extract.exceptions import ResourceExtractionError +from dlt.pipeline.exceptions import PipelineStepFailed class SimpleModel(BaseModel): @@ -30,7 +34,8 @@ def some_data() -> t.Iterator[TDataItems]: # Items are passed through model data = list(some_data()) - assert data == [SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")] + # compare content-wise. 
model names change due to extra settings on columns + assert json.dumpb(data) == json.dumpb([SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -50,7 +55,7 @@ def some_data() -> t.Iterator[TDataItems]: # Items are passed through model data = list(resource) - assert data == [SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")] + assert json.dumpb(data) == json.dumpb([SimpleModel(a=1, b="2"), SimpleModel(a=2, b="3")]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -68,7 +73,7 @@ def some_data() -> t.Iterator[TDataItems]: resource.validator = None data = list(resource) - assert data == [{"a": 1, "b": "2"}, {"a": 2, "b": "3"}] + assert json.dumpb(data) == json.dumpb([{"a": 1, "b": "2"}, {"a": 2, "b": "3"}]) @pytest.mark.parametrize("yield_list", [True, False]) @@ -94,14 +99,15 @@ class AnotherModel(BaseModel): data = list(resource) # Items are validated with the new model - assert data == [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) # Ensure only one validator is applied in steps steps = resource._pipe.steps assert len(steps) == 2 assert isinstance(steps[-1], ValidateItem) - assert steps[-1].model is AnotherModel # type: ignore[attr-defined] + # model name will change according to extra items handling + assert steps[-1].model.__name__.startswith(AnotherModel.__name__) # type: ignore[attr-defined] @pytest.mark.parametrize("yield_list", [True, False]) @@ -117,24 +123,24 @@ def some_data() -> t.Iterator[TDataItems]: resource = some_data() - assert isinstance(resource.validator, PydanticValidator) and resource.validator.model is SimpleModel + assert isinstance(resource.validator, PydanticValidator) and resource.validator.model.__name__.startswith(SimpleModel.__name__) class AnotherModel(BaseModel): a: int b: str c: float = 0.5 - resource.validator = PydanticValidator(AnotherModel) + resource.validator = PydanticValidator(AnotherModel, column_mode="freeze", data_mode="freeze") - assert resource.validator and resource.validator.model is AnotherModel + assert resource.validator and resource.validator.model.__name__.startswith(AnotherModel.__name__) data = list(resource) # Items are validated with the new model - assert data == [AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)] + assert json.dumpb(data) == json.dumpb([AnotherModel(a=1, b="2", c=0.5), AnotherModel(a=2, b="3", c=0.5)]) @pytest.mark.parametrize("yield_list", [True, False]) -def test_failed_validation(yield_list: bool) -> None: +def test_default_validation(yield_list: bool) -> None: @dlt.resource(columns=SimpleModel) def some_data() -> t.Iterator[TDataItems]: # yield item that fails schema validation @@ -144,9 +150,94 @@ def some_data() -> t.Iterator[TDataItems]: else: yield from items + # some_data must have default Pydantic schema contract + assert some_data().schema_contract == {"tables": "evolve", "columns": "discard_value", "data_type": "freeze"} + # extraction fails with ValidationError with pytest.raises(ResourceExtractionError) as exinfo: list(some_data()) - assert isinstance(exinfo.value.__cause__, ValidationError) - assert str(PydanticValidator(SimpleModel)) in str(exinfo.value) + val_ex = exinfo.value.__cause__ + assert isinstance(val_ex, DataValidationError) + assert val_ex.schema_name is None + assert val_ex.table_name == "some_data" + assert val_ex.column_name == "('items', 1, 'a')" if yield_list else "('a',)" + assert 
val_ex.data_item == {"a": "not_int", "b": "x"} + assert val_ex.contract_entity == "data_type" + + # fail in pipeline + @dlt.resource(columns=SimpleModel) + def some_data_extra() -> t.Iterator[TDataItems]: + # yield item that fails schema validation + items = [{"a": 1, "b": "z", "c": 1.3}, {"a": "not_int", "b": "x"}] + if yield_list: + yield items + else: + yield from items + + pipeline = dlt.pipeline() + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.extract(some_data_extra()) + assert isinstance(py_ex.value.__cause__, ResourceExtractionError) + assert isinstance(py_ex.value.__cause__.__cause__, DataValidationError) + val_ex = py_ex.value.__cause__.__cause__ + assert val_ex.table_name == "some_data_extra" + assert val_ex.contract_entity == "data_type" # extra field is the cause + assert val_ex.data_item == {"a": "not_int", "b": "x"} + + +@pytest.mark.parametrize("yield_list", [True, False]) +def test_validation_with_contracts(yield_list: bool) -> None: + + def some_data() -> t.Iterator[TDataItems]: + # yield item that fails schema validation + items = [{"a": 1, "b": "z"}, {"a": "not_int", "b": "x"}, {"c": "not_int"}] + if yield_list: + yield items + else: + yield from items + + # let it evolve + r: DltResource = dlt.resource(some_data(), schema_contract="evolve", columns=SimpleModel) + validator: PydanticValidator[SimpleModel] = r.validator # type: ignore[assignment] + assert validator.column_mode == "evolve" + assert validator.data_mode == "evolve" + assert validator.model.__name__.endswith("AnyExtraAllow") + items = list(r) + assert len(items) == 3 + # fully valid + assert items[0].a == 1 + assert items[0].b == "z" + # data type not valid + assert items[1].a == "not_int" + assert items[1].b == "x" + # extra attr and data invalid + assert items[2].a is None + assert items[2].b is None + assert items[2].c == "not_int" + + # let it drop + r = dlt.resource(some_data(), schema_contract="discard_row", columns=SimpleModel) + validator = r.validator # type: ignore[assignment] + assert validator.column_mode == "discard_row" + assert validator.data_mode == "discard_row" + assert validator.model.__name__.endswith("ExtraForbid") + items = list(r) + assert len(items) == 1 + assert items[0].a == 1 + assert items[0].b == "z" + + # filter just offending values + with pytest.raises(NotImplementedError): + # pydantic data_type cannot be discard_value + dlt.resource(some_data(), schema_contract="discard_value", columns=SimpleModel) + r = dlt.resource(some_data(), schema_contract={"columns": "discard_value", "data_type": "evolve"}, columns=SimpleModel) + validator = r.validator # type: ignore[assignment] + assert validator.column_mode == "discard_value" + assert validator.data_mode == "evolve" + # ignore is the default so no Extra in name + assert validator.model.__name__.endswith("Any") + items = list(r) + assert len(items) == 3 + # c is gone from the last model + assert not hasattr(items[2], "c") diff --git a/tests/extract/utils.py b/tests/extract/utils.py index b109cdbdd9..006816b5cd 100644 --- a/tests/extract/utils.py +++ b/tests/extract/utils.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, List, Literal, get_args +from typing import Any, Optional, List import pytest from itertools import zip_longest @@ -7,13 +7,7 @@ from dlt.extract.extract import ExtractorStorage from dlt.extract.typing import ItemTransform -import pandas as pd -from dlt.common.libs.pyarrow import pyarrow as pa - - -TItemFormat = Literal["json", "pandas", "arrow"] - -ALL_ITEM_FORMATS = get_args(TItemFormat) +from 
tests.utils import TDataItemFormat def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_name: str, content: str) -> None: @@ -35,7 +29,7 @@ def expect_extracted_file(storage: ExtractorStorage, schema_name: str, table_nam class AssertItems(ItemTransform[TDataItem]): - def __init__(self, expected_items: Any, item_type: TItemFormat = "json") -> None: + def __init__(self, expected_items: Any, item_type: TDataItemFormat = "json") -> None: self.expected_items = expected_items self.item_type = item_type @@ -44,22 +38,8 @@ def __call__(self, item: TDataItems, meta: Any = None) -> Optional[TDataItems]: return item -def data_to_item_format(item_format: TItemFormat, data: List[TDataItem]): - """Return the given data in the form of pandas, arrow table or json items""" - if item_format == "json": - return data - # Make dataframe from the data - df = pd.DataFrame(data) - if item_format == "pandas": - return [df] - elif item_format == "arrow": - return [pa.Table.from_pandas(df)] - else: - raise ValueError(f"Unknown item format: {item_format}") - - -def data_item_to_list(from_type: TItemFormat, values: List[TDataItem]): - if from_type == "arrow": +def data_item_to_list(from_type: TDataItemFormat, values: List[TDataItem]): + if from_type in ["arrow", "arrow-batch"]: return values[0].to_pylist() elif from_type == "pandas": return values[0].to_dict("records") diff --git a/tests/helpers/dbt_tests/local/test_dbt_utils.py b/tests/helpers/dbt_tests/local/test_dbt_utils.py index 71e570bd69..133ecf1617 100644 --- a/tests/helpers/dbt_tests/local/test_dbt_utils.py +++ b/tests/helpers/dbt_tests/local/test_dbt_utils.py @@ -7,7 +7,7 @@ from dlt.common.storages import FileStorage from dlt.common.utils import uniq_id -from dlt.destinations.postgres.configuration import PostgresCredentials +from dlt.destinations.impl.postgres.configuration import PostgresCredentials from dlt.helpers.dbt.dbt_utils import DBTProcessingError, initialize_dbt_logging, run_dbt_command, is_incremental_schema_out_of_sync_error from tests.utils import test_storage, preserve_environ diff --git a/tests/helpers/dbt_tests/test_runner_dbt_versions.py b/tests/helpers/dbt_tests/test_runner_dbt_versions.py index b418bf15b6..1037908e59 100644 --- a/tests/helpers/dbt_tests/test_runner_dbt_versions.py +++ b/tests/helpers/dbt_tests/test_runner_dbt_versions.py @@ -14,8 +14,8 @@ from dlt.common.runners.synth_pickle import decode_obj, encode_obj from dlt.common.typing import AnyFun -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.bigquery import BigQueryClientConfiguration +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.helpers.dbt.configuration import DBTRunnerConfiguration from dlt.helpers.dbt.exceptions import PrerequisitesException, DBTProcessingError from dlt.helpers.dbt import package_runner, create_venv, _create_dbt_deps, _default_profile_name, DEFAULT_DBT_VERSION diff --git a/tests/libs/__init__.py b/tests/libs/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/libs/test_buffered_writer_arrow,py b/tests/libs/test_buffered_writer_arrow,py new file mode 100644 index 0000000000..f0f0968942 --- /dev/null +++ b/tests/libs/test_buffered_writer_arrow,py @@ -0,0 +1,50 @@ +import pytest + +from dlt.common.destination import TLoaderFileFormat +from dlt.common.schema.utils import new_column + +from tests.common.data_writers.utils import get_writer, ALL_WRITERS + 
+ +@pytest.mark.parametrize("writer_format", ALL_WRITERS - {"arrow"}) +def test_writer_items_count(writer_format: TLoaderFileFormat) -> None: + c1 = {"col1": new_column("col1", "bigint")} + with get_writer(_format=writer_format) as writer: + assert writer._buffered_items_count == 0 + # single item + writer.write_data_item({"col1": 1}, columns=c1) + assert writer._buffered_items_count == 1 + # list + writer.write_data_item([{"col1": 1}, {"col1": 2}], columns=c1) + assert writer._buffered_items_count == 3 + writer._flush_items() + assert writer._buffered_items_count == 0 + assert writer._writer.items_count == 3 + + +def test_writer_items_count_arrow() -> None: + import pyarrow as pa + c1 = {"col1": new_column("col1", "bigint")} + with get_writer(_format="arrow") as writer: + assert writer._buffered_items_count == 0 + # single item + writer.write_data_item(pa.Table.from_pylist([{"col1": 1}]), columns=c1) + assert writer._buffered_items_count == 1 + # single item with many rows + writer.write_data_item(pa.Table.from_pylist([{"col1": 1}, {"col1": 2}]), columns=c1) + assert writer._buffered_items_count == 3 + # empty list + writer.write_data_item([], columns=c1) + assert writer._buffered_items_count == 3 + # list with one item + writer.write_data_item([pa.Table.from_pylist([{"col1": 1}])], columns=c1) + assert writer._buffered_items_count == 4 + # list with many items + writer.write_data_item( + [pa.Table.from_pylist([{"col1": 1}]), pa.Table.from_pylist([{"col1": 1}, {"col1": 2}])], + columns=c1 + ) + assert writer._buffered_items_count == 7 + writer._flush_items() + assert writer._buffered_items_count == 0 + assert writer._writer.items_count == 7 diff --git a/tests/common/data_writers/test_parquet_writer.py b/tests/libs/test_parquet_writer.py similarity index 100% rename from tests/common/data_writers/test_parquet_writer.py rename to tests/libs/test_parquet_writer.py diff --git a/tests/common/test_pyarrow.py b/tests/libs/test_pyarrow.py similarity index 100% rename from tests/common/test_pyarrow.py rename to tests/libs/test_pyarrow.py diff --git a/tests/libs/test_pydantic.py b/tests/libs/test_pydantic.py new file mode 100644 index 0000000000..5606bd25b2 --- /dev/null +++ b/tests/libs/test_pydantic.py @@ -0,0 +1,391 @@ +from copy import copy +import pytest +from typing import ClassVar, Sequence, Mapping, Dict, MutableMapping, MutableSequence, Union, Optional, List, Dict, Any +from enum import Enum + +from datetime import datetime, date, time # noqa: I251 +from dlt.common import Decimal +from dlt.common import json + +from dlt.common.libs.pydantic import DltConfig, pydantic_to_table_schema_columns, apply_schema_contract_to_model, validate_item, validate_items, create_list_model +from pydantic import BaseModel, Json, AnyHttpUrl, ConfigDict, ValidationError + +from dlt.common.schema.exceptions import DataValidationError + + +class StrEnum(str, Enum): + a = "a_value" + b = "b_value" + c = "c_value" + + +class IntEnum(int, Enum): + a = 0 + b = 1 + c = 2 + + +class MixedEnum(Enum): + a_int = 0 + b_str = "b_value" + c_int = 2 + + +class NestedModel(BaseModel): + nested_field: str + + +class Model(BaseModel): + bigint_field: int + text_field: str + timestamp_field: datetime + date_field: date + decimal_field: Decimal + double_field: float + time_field: time + + nested_field: NestedModel + list_field: List[str] + + union_field: Union[int, str] + + optional_field: Optional[float] + + blank_dict_field: dict # type: ignore[type-arg] + parametrized_dict_field: Dict[str, int] + + str_enum_field: StrEnum + 
int_enum_field: IntEnum
+    # Both of these should coerce to str
+    mixed_enum_int_field: MixedEnum
+    mixed_enum_str_field: MixedEnum
+
+    json_field: Json[List[str]]
+
+    url_field: AnyHttpUrl
+
+    any_field: Any
+    json_any_field: Json[Any]
+
+
+class ModelWithConfig(Model):
+    model_config = ConfigDict(frozen=True, extra="allow")
+
+
+TEST_MODEL_INSTANCE = Model(
+    bigint_field=1, text_field="text", timestamp_field=datetime.now(),
+    date_field=date.today(), decimal_field=Decimal(1.1), double_field=1.1,
+    time_field=time(1, 2, 3, 12345),
+    nested_field=NestedModel(nested_field="nested"),
+    list_field=["a", "b", "c"],
+    union_field=1,
+    optional_field=None,
+    blank_dict_field={},
+    parametrized_dict_field={"a": 1, "b": 2, "c": 3},
+    str_enum_field=StrEnum.a,
+    int_enum_field=IntEnum.a,
+    mixed_enum_int_field=MixedEnum.a_int,
+    mixed_enum_str_field=MixedEnum.b_str,
+    json_field=json.dumps(["a", "b", "c"]),  # type: ignore[arg-type]
+    url_field="https://example.com",  # type: ignore[arg-type]
+    any_field="any_string",
+    json_any_field=json.dumps("any_string"),
+)
+
+
+@pytest.mark.parametrize('instance', [True, False])
+def test_pydantic_model_to_columns(instance: bool) -> None:
+    if instance:
+        model = TEST_MODEL_INSTANCE
+    else:
+        model = Model  # type: ignore[assignment]
+
+    result = pydantic_to_table_schema_columns(model)
+
+    assert result["bigint_field"]["data_type"] == "bigint"
+    assert result["text_field"]["data_type"] == "text"
+    assert result["timestamp_field"]["data_type"] == "timestamp"
+    assert result["date_field"]["data_type"] == "date"
+    assert result["decimal_field"]["data_type"] == "decimal"
+    assert result["double_field"]["data_type"] == "double"
+    assert result["time_field"]["data_type"] == "time"
+    assert result["nested_field"]["data_type"] == "complex"
+    assert result['list_field']['data_type'] == 'complex'
+    assert result['union_field']['data_type'] == 'bigint'
+    assert result['optional_field']['data_type'] == 'double'
+    assert result['optional_field']['nullable'] is True
+    assert result['blank_dict_field']['data_type'] == 'complex'
+    assert result['parametrized_dict_field']['data_type'] == 'complex'
+    assert result['str_enum_field']['data_type'] == 'text'
+    assert result['int_enum_field']['data_type'] == 'bigint'
+    assert result['mixed_enum_int_field']['data_type'] == 'text'
+    assert result['mixed_enum_str_field']['data_type'] == 'text'
+    assert result['json_field']['data_type'] == 'complex'
+    assert result['url_field']['data_type'] == 'text'
+
+    # Any type fields are excluded from schema
+    assert 'any_field' not in result
+    assert 'json_any_field' not in result
+
+
+def test_pydantic_model_skip_complex_types() -> None:
+    class SkipNestedModel(Model):
+        dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True}
+
+    result = pydantic_to_table_schema_columns(SkipNestedModel)
+
+    assert result["bigint_field"]["data_type"] == "bigint"
+    assert "nested_field" not in result
+    assert "list_field" not in result
+    assert "blank_dict_field" not in result
+    assert "parametrized_dict_field" not in result
+    assert "json_field" not in result
+    assert result["bigint_field"]["data_type"] == "bigint"
+    assert result["text_field"]["data_type"] == "text"
+    assert result["timestamp_field"]["data_type"] == "timestamp"
+
+
+def test_model_for_column_mode() -> None:
+    # extra prop
+    instance_extra = TEST_MODEL_INSTANCE.dict()
+    instance_extra["extra_prop"] = "EXTRA"
+    # back to string
+    instance_extra["json_field"] = json.dumps(["a", "b", "c"])
+    instance_extra["json_any_field"] =
json.dumps("any_string") + + # evolve - allow extra fields + model_evolve = apply_schema_contract_to_model(ModelWithConfig, "evolve") + # assert "frozen" in model_evolve.model_config + extra_instance = model_evolve.parse_obj(instance_extra) + assert hasattr(extra_instance, "extra_prop") + assert extra_instance.extra_prop == "EXTRA" + model_evolve = apply_schema_contract_to_model(Model, "evolve") # type: ignore[arg-type] + extra_instance = model_evolve.parse_obj(instance_extra) + assert extra_instance.extra_prop == "EXTRA" # type: ignore[attr-defined] + + # freeze - validation error on extra fields + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "freeze") + # assert "frozen" in model_freeze.model_config + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + model_freeze = apply_schema_contract_to_model(Model, "freeze") # type: ignore[arg-type] + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + + # discard row - same as freeze + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "discard_row") + with pytest.raises(ValidationError) as py_ex: + model_freeze.parse_obj(instance_extra) + assert py_ex.value.errors()[0]["loc"] == ('extra_prop',) + + # discard value - ignore extra fields + model_discard = apply_schema_contract_to_model(ModelWithConfig, "discard_value") + extra_instance = model_discard.parse_obj(instance_extra) + assert not hasattr(extra_instance, "extra_prop") + model_evolve = apply_schema_contract_to_model(Model, "evolve") # type: ignore[arg-type] + extra_instance = model_discard.parse_obj(instance_extra) + assert not hasattr(extra_instance, "extra_prop") + + # evolve data but freeze new columns + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "evolve", "freeze") + instance_extra_2 = copy(instance_extra) + # should parse ok + model_discard.parse_obj(instance_extra_2) + # this must fail validation + instance_extra_2["bigint_field"] = "NOT INT" + with pytest.raises(ValidationError): + model_discard.parse_obj(instance_extra_2) + # let the datatypes evolve + model_freeze = apply_schema_contract_to_model(ModelWithConfig, "evolve", "evolve") + print(model_freeze.parse_obj(instance_extra_2).dict()) + + with pytest.raises(NotImplementedError): + apply_schema_contract_to_model(ModelWithConfig, "evolve", "discard_value") + + +def test_nested_model_config_propagation() -> None: + class UserLabel(BaseModel): + label: str + + class UserAddress(BaseModel): + street: str + zip_code: Sequence[int] + label: Optional[UserLabel] + ro_labels: Mapping[str, UserLabel] + wr_labels: MutableMapping[str, List[UserLabel]] + ro_list: Sequence[UserLabel] + wr_list: MutableSequence[Dict[str, UserLabel]] + + class User(BaseModel): + user_id: int + name: str + created_at: datetime + labels: List[str] + user_label: UserLabel + user_labels: List[UserLabel] + address: UserAddress + unity: Union[UserAddress, UserLabel, Dict[str, UserAddress]] + + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + model_freeze = apply_schema_contract_to_model(User, "evolve", "freeze") + from typing import get_type_hints + print(get_type_hints(model_freeze)) + print(get_type_hints(model_freeze.model_fields["address"].annotation)) + + + +def test_item_list_validation() -> None: + + class ItemModel(BaseModel): + b: bool + opt: Optional[int] = None + dlt_config: ClassVar[DltConfig] = 
{"skip_complex_types": False}
+
+    # non validating items removed from the list (both extra and declared)
+    discard_model = apply_schema_contract_to_model(ItemModel, "discard_row", "discard_row")
+    discard_list_model = create_list_model(discard_model)
+    # violate data type
+    items = validate_items(
+        "items",
+        discard_list_model,
+        [{"b": True}, {"b": 2, "opt": "not int", "extra": 1.2}, {"b": 3}, {"b": False}],
+        "discard_row", "discard_row"
+    )
+    # {"b": 2, "opt": "not int", "extra": 1.2} - note that this will generate 3 errors for the same item
+    # and is crucial in our tests when discarding rows
+    assert len(items) == 2
+    assert items[0].b is True
+    assert items[1].b is False
+    # violate extra field
+    items = validate_items("items", discard_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "discard_row")
+    assert len(items) == 1
+    assert items[0].b is True
+
+    # freeze on non validating items (both extra and declared)
+    freeze_model = apply_schema_contract_to_model(ItemModel, "freeze", "freeze")
+    freeze_list_model = create_list_model(freeze_model)
+    # violate data type
+    with pytest.raises(DataValidationError) as val_ex:
+        validate_items("items", freeze_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "freeze", "freeze")
+    assert val_ex.value.schema_name is None
+    assert val_ex.value.table_name == "items"
+    assert val_ex.value.column_name == str(("items", 1, 'b'))  # pydantic location
+    assert val_ex.value.contract_entity == "data_type"
+    assert val_ex.value.contract_mode == "freeze"
+    assert val_ex.value.table_schema is freeze_list_model
+    assert val_ex.value.data_item == {"b": 2}
+    # extra type
+    with pytest.raises(DataValidationError) as val_ex:
+        validate_items("items", freeze_list_model, [{"b": True}, {"a": 2, "b": False}, {"b": 3}, {"b": False}], "freeze", "freeze")
+    assert val_ex.value.schema_name is None
+    assert val_ex.value.table_name == "items"
+    assert val_ex.value.column_name == str(("items", 1, 'a'))  # pydantic location
+    assert val_ex.value.contract_entity == "columns"
+    assert val_ex.value.contract_mode == "freeze"
+    assert val_ex.value.table_schema is freeze_list_model
+    assert val_ex.value.data_item == {"a": 2, "b": False}
+
+    # discard values
+    discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze")
+    discard_list_model = create_list_model(discard_value_model)
+    # violate extra field
+    items = validate_items("items", discard_list_model, [{"b": True}, {"b": False, "a": False}], "discard_value", "freeze")
+    assert len(items) == 2
+    # "a" extra got removed
+    assert items[1].dict() == {"b": False, "opt": None}
+    # violate data type
+    with pytest.raises(NotImplementedError):
+        apply_schema_contract_to_model(ItemModel, "discard_value", "discard_value")
+
+    # evolve data types and extras
+    evolve_model = apply_schema_contract_to_model(ItemModel, "evolve", "evolve")
+    evolve_list_model = create_list_model(evolve_model)
+    # for data types a lenient model will be created that accepts any type
+    items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "evolve", "evolve")
+    assert len(items) == 4
+    assert items[0].b is True
+    assert items[1].b == 2
+    # extra fields allowed
+    items = validate_items("items", evolve_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "evolve", "evolve")
+    assert len(items) == 4
+    assert items[3].b is False
+    assert items[3].a is False  # type: ignore[attr-defined]
+
+    # accept new types but discard new
columns + mixed_model = apply_schema_contract_to_model(ItemModel, "discard_row", "evolve") + mixed_list_model = create_list_model(mixed_model) + # for data types a lenient model will be created that accepts any type + items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False}], "discard_row", "evolve") + assert len(items) == 4 + assert items[0].b is True + assert items[1].b == 2 + # extra fields forbidden - full rows discarded + items = validate_items("items", mixed_list_model, [{"b": True}, {"b": 2}, {"b": 3}, {"b": False, "a": False}], "discard_row", "evolve") + assert len(items) == 3 + + +def test_item_validation() -> None: + + class ItemModel(BaseModel): + b: bool + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": False} + + + + # non validating items removed from the list (both extra and declared) + discard_model = apply_schema_contract_to_model(ItemModel, "discard_row", "discard_row") + # violate data type + assert validate_item("items", discard_model, {"b": 2}, "discard_row", "discard_row") is None + # violate extra field + assert validate_item("items", discard_model, {"b": False, "a": False}, "discard_row", "discard_row") is None + + # freeze on non validating items (both extra and declared) + freeze_model = apply_schema_contract_to_model(ItemModel, "freeze", "freeze") + # violate data type + with pytest.raises(DataValidationError) as val_ex: + validate_item("items", freeze_model, {"b": 2}, "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(('b',)) # pydantic location + assert val_ex.value.contract_entity == "data_type" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_model + assert val_ex.value.data_item == {"b": 2} + # extra type + with pytest.raises(DataValidationError) as val_ex: + validate_item("items", freeze_model, {"a": 2, "b": False}, "freeze", "freeze") + assert val_ex.value.schema_name is None + assert val_ex.value.table_name == "items" + assert val_ex.value.column_name == str(('a',)) # pydantic location + assert val_ex.value.contract_entity == "columns" + assert val_ex.value.contract_mode == "freeze" + assert val_ex.value.table_schema is freeze_model + assert val_ex.value.data_item == {"a": 2, "b": False} + + # discard values + discard_value_model = apply_schema_contract_to_model(ItemModel, "discard_value", "freeze") + # violate extra field + item = validate_item("items", discard_value_model, {"b": False, "a": False}, "discard_value", "freeze") + # "a" extra got removed + assert item.dict() == {"b": False} + + # evolve data types and extras + evolve_model = apply_schema_contract_to_model(ItemModel, "evolve", "evolve") + # for data types a lenient model will be created that accepts any type + item = validate_item("items", evolve_model, {"b": 2}, "evolve", "evolve") + assert item.b == 2 + # extra fields allowed + item = validate_item("items", evolve_model, {"b": False, "a": False}, "evolve", "evolve") + assert item.b is False + assert item.a is False # type: ignore[attr-defined] + + # accept new types but discard new columns + mixed_model = apply_schema_contract_to_model(ItemModel, "discard_row", "evolve") + # for data types a lenient model will be created that accepts any type + item = validate_item("items", mixed_model, {"b": 3}, "discard_row", "evolve") + assert item.b == 3 + # extra fields forbidden - full rows discarded + assert validate_item("items", mixed_model, {"b": False, "a": 
False}, "discard_row", "evolve") is None diff --git a/tests/load/bigquery/test_bigquery_client.py b/tests/load/bigquery/test_bigquery_client.py index 145898cde3..abbaf8d414 100644 --- a/tests/load/bigquery/test_bigquery_client.py +++ b/tests/load/bigquery/test_bigquery_client.py @@ -14,7 +14,7 @@ from dlt.common.storages import FileStorage from dlt.common.utils import digest128, uniq_id, custom_environ -from dlt.destinations.bigquery.bigquery import BigQueryClient, BigQueryClientConfiguration +from dlt.destinations.impl.bigquery.bigquery import BigQueryClient, BigQueryClientConfiguration from dlt.destinations.exceptions import LoadJobNotExistsException, LoadJobTerminalException from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, preserve_environ @@ -242,7 +242,7 @@ def test_bigquery_job_errors(client: BigQueryClient, file_storage: FileStorage) @pytest.mark.parametrize('location', ["US", "EU"]) def test_bigquery_location(location: str, file_storage: FileStorage) -> None: - with cm_yield_client_with_storage("bigquery", default_config_values={"location": location}) as client: + with cm_yield_client_with_storage("bigquery", default_config_values={"credentials": {"location": location}}) as client: user_table_name = prepare_table(client) load_json = { "_dlt_id": uniq_id(), diff --git a/tests/load/bigquery/test_bigquery_table_builder.py b/tests/load/bigquery/test_bigquery_table_builder.py index a3222ba020..0d8ab1c8c2 100644 --- a/tests/load/bigquery/test_bigquery_table_builder.py +++ b/tests/load/bigquery/test_bigquery_table_builder.py @@ -8,8 +8,8 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.specs import GcpServiceAccountCredentialsWithoutDefaults -from dlt.destinations.bigquery.bigquery import BigQueryClient -from dlt.destinations.bigquery.configuration import BigQueryClientConfiguration +from dlt.destinations.impl.bigquery.bigquery import BigQueryClient +from dlt.destinations.impl.bigquery.configuration import BigQueryClientConfiguration from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/cases/fake_destination.py b/tests/load/cases/fake_destination.py index 152b2db918..016cc19020 100644 --- a/tests/load/cases/fake_destination.py +++ b/tests/load/cases/fake_destination.py @@ -1 +1,6 @@ -# module that is used to test wrong destination references \ No newline at end of file +# module that is used to test wrong destination references + + +class not_a_destination: + def __init__(self, **kwargs) -> None: + pass diff --git a/tests/load/duckdb/test_duckdb_client.py b/tests/load/duckdb/test_duckdb_client.py index 6c362a6b76..ddfc681a84 100644 --- a/tests/load/duckdb/test_duckdb_client.py +++ b/tests/load/duckdb/test_duckdb_client.py @@ -6,7 +6,8 @@ from dlt.common.configuration.resolve import resolve_configuration from dlt.common.configuration.utils import get_resolved_traces -from dlt.destinations.duckdb.configuration import DUCK_DB_NAME, DuckDbClientConfiguration, DuckDbCredentials, DEFAULT_DUCK_DB_NAME +from dlt.destinations.impl.duckdb.configuration import DUCK_DB_NAME, DuckDbClientConfiguration, DuckDbCredentials, DEFAULT_DUCK_DB_NAME +from dlt.destinations import duckdb from tests.load.pipeline.utils import drop_pipeline, assert_table from tests.utils import patch_home_dir, autouse_test_storage, preserve_environ, TEST_STORAGE_ROOT @@ -46,13 +47,13 @@ def test_duckdb_open_conn_default() -> None: def test_duckdb_database_path() -> None: # resolve without 
any path provided c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset")) - assert c.credentials.database.lower() == os.path.abspath("quack.duckdb").lower() + assert c.credentials._conn_str().lower() == os.path.abspath("quack.duckdb").lower() # resolve without any path but with pipeline context p = dlt.pipeline(pipeline_name="quack_pipeline") c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset")) # still cwd db_path = os.path.abspath(os.path.join(".", "quack_pipeline.duckdb")) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() # we do not keep default duckdb path in the local state with pytest.raises(KeyError): p.get_local_state_val("duckdb_database") @@ -69,7 +70,7 @@ def test_duckdb_database_path() -> None: # test special :pipeline: path to create in pipeline folder c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=":pipeline:")) db_path = os.path.abspath(os.path.join(p.working_dir, DEFAULT_DUCK_DB_NAME)) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() # connect conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) @@ -80,7 +81,7 @@ def test_duckdb_database_path() -> None: # provide relative path db_path = "_storage/test_quack.duckdb" c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials="duckdb:///_storage/test_quack.duckdb")) - assert c.credentials.database.lower() == os.path.abspath(db_path).lower() + assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -90,7 +91,7 @@ def test_duckdb_database_path() -> None: db_path = os.path.abspath("_storage/abs_test_quack.duckdb") c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=f"duckdb:///{db_path}")) assert os.path.isabs(c.credentials.database) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -99,7 +100,7 @@ def test_duckdb_database_path() -> None: # set just path as credentials db_path = "_storage/path_test_quack.duckdb" c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path)) - assert c.credentials.database.lower() == os.path.abspath(db_path).lower() + assert c.credentials._conn_str().lower() == os.path.abspath(db_path).lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -108,7 +109,7 @@ def test_duckdb_database_path() -> None: db_path = os.path.abspath("_storage/abs_path_test_quack.duckdb") c = resolve_configuration(DuckDbClientConfiguration(dataset_name="test_dataset", credentials=db_path)) assert os.path.isabs(c.credentials.database) - assert c.credentials.database.lower() == db_path.lower() + assert c.credentials._conn_str().lower() == db_path.lower() conn = c.credentials.borrow_conn(read_only=False) c.credentials.return_conn(conn) assert os.path.isfile(db_path) @@ -128,7 +129,7 @@ def test_keeps_initial_db_path() -> None: print(p.pipelines_dir) with p.sql_client() as conn: # still cwd - assert conn.credentials.database.lower() == 
os.path.abspath(db_path).lower() + assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower() # but it is kept in the local state assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower() @@ -138,7 +139,7 @@ def test_keeps_initial_db_path() -> None: with p.sql_client() as conn: # still cwd assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower() - assert conn.credentials.database.lower() == os.path.abspath(db_path).lower() + assert conn.credentials._conn_str().lower() == os.path.abspath(db_path).lower() # now create a new pipeline dlt.pipeline(pipeline_name="not_quack", destination="dummy") @@ -147,12 +148,12 @@ def test_keeps_initial_db_path() -> None: assert p.get_local_state_val("duckdb_database").lower() == os.path.abspath(db_path).lower() # new pipeline context took over # TODO: restore pipeline context on each call - assert conn.credentials.database.lower() != os.path.abspath(db_path).lower() + assert conn.credentials._conn_str().lower() != os.path.abspath(db_path).lower() def test_duckdb_database_delete() -> None: db_path = "_storage/path_test_quack.duckdb" - p = dlt.pipeline(pipeline_name="quack_pipeline", credentials=db_path, destination="duckdb") + p = dlt.pipeline(pipeline_name="quack_pipeline", destination=duckdb(credentials=db_path)) p.run([1, 2, 3], table_name="table", dataset_name="dataset") # attach the pipeline p = dlt.attach(pipeline_name="quack_pipeline") diff --git a/tests/load/duckdb/test_duckdb_table_builder.py b/tests/load/duckdb/test_duckdb_table_builder.py index 247d134b06..a5870763fc 100644 --- a/tests/load/duckdb/test_duckdb_table_builder.py +++ b/tests/load/duckdb/test_duckdb_table_builder.py @@ -5,8 +5,8 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations.duckdb.duck import DuckDbClient -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration +from dlt.destinations.impl.duckdb.duck import DuckDbClient +from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/duckdb/test_motherduck_client.py b/tests/load/duckdb/test_motherduck_client.py index 4a167fa016..582847bfa2 100644 --- a/tests/load/duckdb/test_motherduck_client.py +++ b/tests/load/duckdb/test_motherduck_client.py @@ -3,7 +3,7 @@ from dlt.common.configuration.resolve import resolve_configuration -from dlt.destinations.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration +from dlt.destinations.impl.motherduck.configuration import MotherDuckCredentials, MotherDuckClientConfiguration from tests.utils import patch_home_dir, preserve_environ, skip_if_not_active diff --git a/tests/load/filesystem/test_filesystem_client.py b/tests/load/filesystem/test_filesystem_client.py index f290892e18..0055f37716 100644 --- a/tests/load/filesystem/test_filesystem_client.py +++ b/tests/load/filesystem/test_filesystem_client.py @@ -6,7 +6,7 @@ from dlt.common.utils import digest128, uniq_id from dlt.common.storages import LoadStorage, FileStorage -from dlt.destinations.filesystem.filesystem import LoadFilesystemJob, FilesystemDestinationClientConfiguration +from dlt.destinations.impl.filesystem.filesystem import LoadFilesystemJob, FilesystemDestinationClientConfiguration from tests.load.filesystem.utils import perform_load from tests.utils import clean_test_storage, init_test_logging diff --git a/tests/load/filesystem/utils.py 
b/tests/load/filesystem/utils.py index eebfa6e87c..8186e82c3b 100644 --- a/tests/load/filesystem/utils.py +++ b/tests/load/filesystem/utils.py @@ -5,16 +5,16 @@ from dlt.load import Load from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.destination.reference import DestinationReference, LoadJob +from dlt.common.destination.reference import Destination, LoadJob, TDestination from dlt.destinations import filesystem -from dlt.destinations.filesystem.filesystem import FilesystemClient +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient from dlt.destinations.job_impl import EmptyLoadJob from tests.load.utils import prepare_load_package def setup_loader(dataset_name: str) -> Load: - destination: DestinationReference = filesystem # type: ignore[assignment] - config = filesystem.spec()(dataset_name=dataset_name) + destination: TDestination = filesystem() # type: ignore[assignment] + config = filesystem.spec(dataset_name=dataset_name) # setup loader with Container().injectable_context(ConfigSectionContext(sections=('filesystem',))): return Load( diff --git a/tests/load/mssql/test_mssql_credentials.py b/tests/load/mssql/test_mssql_credentials.py index 9b57692bb2..5428246247 100644 --- a/tests/load/mssql/test_mssql_credentials.py +++ b/tests/load/mssql/test_mssql_credentials.py @@ -1,6 +1,6 @@ from dlt.common.configuration import resolve_configuration -from dlt.destinations.mssql.configuration import MsSqlCredentials +from dlt.destinations.impl.mssql.configuration import MsSqlCredentials diff --git a/tests/load/mssql/test_mssql_table_builder.py b/tests/load/mssql/test_mssql_table_builder.py index 4f5a6637d6..114d94a20f 100644 --- a/tests/load/mssql/test_mssql_table_builder.py +++ b/tests/load/mssql/test_mssql_table_builder.py @@ -7,8 +7,8 @@ pytest.importorskip("dlt.destinations.mssql.mssql", reason="MSSQL ODBC driver not installed") -from dlt.destinations.mssql.mssql import MsSqlClient -from dlt.destinations.mssql.configuration import MsSqlClientConfiguration, MsSqlCredentials +from dlt.destinations.impl.mssql.mssql import MsSqlClient +from dlt.destinations.impl.mssql.configuration import MsSqlClientConfiguration, MsSqlCredentials from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/pipeline/test_arrow_loading.py b/tests/load/pipeline/test_arrow_loading.py index bd709e764d..9a72536329 100644 --- a/tests/load/pipeline/test_arrow_loading.py +++ b/tests/load/pipeline/test_arrow_loading.py @@ -138,4 +138,4 @@ def some_data(): result_tbl = pa.parquet.read_table(f) # Parquet schema is written with normalized column names - assert result_tbl.column_names == expected_column_names + assert result_tbl.schema.names == expected_column_names diff --git a/tests/load/pipeline/test_drop.py b/tests/load/pipeline/test_drop.py index a2714674be..4354460374 100644 --- a/tests/load/pipeline/test_drop.py +++ b/tests/load/pipeline/test_drop.py @@ -1,3 +1,4 @@ +import os from typing import Any, Iterator, Dict, Any, List from unittest import mock from itertools import chain @@ -5,11 +6,11 @@ import pytest import dlt -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.common.utils import uniq_id from dlt.pipeline import helpers, state_sync, Pipeline from dlt.load import Load -from dlt.pipeline.exceptions import PipelineStepFailed +from dlt.pipeline.exceptions import PipelineHasPendingDataException, PipelineNeverRan, PipelineStepFailed from 
dlt.destinations.job_client_impl import SqlJobClientBase from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration @@ -186,7 +187,7 @@ def test_fail_after_drop_tables(destination_config: DestinationTestConfiguration @pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) def test_load_step_fails(destination_config: DestinationTestConfiguration) -> None: - """Test idempotency. pipeline.load() fails. Command can be run again successfully""" + """Test idempotence. pipeline.load() fails. Command can be run again successfully""" source = droppable_source() pipeline = destination_config.setup_pipeline('drop_test_' + uniq_id(), full_refresh=True) pipeline.run(source) @@ -292,3 +293,15 @@ def test_drop_state_only(destination_config: DestinationTestConfiguration) -> No assert_dropped_resource_tables(attached, []) # No tables dropped assert_dropped_resource_states(attached, ['droppable_a', 'droppable_b']) assert_destination_state_loaded(attached) + + +def test_drop_first_run_and_pending_packages() -> None: + """Attempts to drop before pipeline runs and when partial loads happen""" + pipeline = dlt.pipeline('drop_test_' + uniq_id(), destination="dummy") + with pytest.raises(PipelineNeverRan): + helpers.drop(pipeline, "droppable_a") + os.environ["COMPLETED_PROB"] = "1.0" + pipeline.run(droppable_source().with_resources("droppable_a")) + pipeline.extract(droppable_source().with_resources("droppable_b")) + with pytest.raises(PipelineHasPendingDataException): + helpers.drop(pipeline, "droppable_a") \ No newline at end of file diff --git a/tests/load/pipeline/test_filesystem_pipeline.py b/tests/load/pipeline/test_filesystem_pipeline.py index 8e810015f2..dce65bc8d7 100644 --- a/tests/load/pipeline/test_filesystem_pipeline.py +++ b/tests/load/pipeline/test_filesystem_pipeline.py @@ -4,7 +4,7 @@ import dlt, os from dlt.common.utils import uniq_id from dlt.common.storages.load_storage import LoadJobInfo -from dlt.destinations.filesystem.filesystem import FilesystemClient, LoadFilesystemJob +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient, LoadFilesystemJob from dlt.common.schema.typing import LOADS_TABLE_NAME from tests.utils import skip_if_not_active diff --git a/tests/load/pipeline/test_merge_disposition.py b/tests/load/pipeline/test_merge_disposition.py index fbc5088ab2..4e8d1f9049 100644 --- a/tests/load/pipeline/test_merge_disposition.py +++ b/tests/load/pipeline/test_merge_disposition.py @@ -13,7 +13,7 @@ from dlt.common.pipeline import StateInjectableContext from dlt.common.typing import AnyFun, StrAny from dlt.common.utils import digest128 -from dlt.extract.source import DltResource +from dlt.extract import DltResource from dlt.sources.helpers.transform import skip_first, take_first from tests.pipeline.utils import assert_load_info diff --git a/tests/load/pipeline/test_pipelines.py b/tests/load/pipeline/test_pipelines.py index 99071a7ac6..004aac0285 100644 --- a/tests/load/pipeline/test_pipelines.py +++ b/tests/load/pipeline/test_pipelines.py @@ -8,13 +8,13 @@ from dlt.common.pipeline import SupportsPipeline from dlt.common import json, sleep -from dlt.common.destination.reference import DestinationReference +from dlt.common.destination import Destination from dlt.common.schema.schema import Schema from dlt.common.schema.typing import VERSION_TABLE_NAME from dlt.common.typing import TDataItem from dlt.common.utils import uniq_id from dlt.extract.exceptions import ResourceNameMissing 
-from dlt.extract.source import DltSource +from dlt.extract import DltSource from dlt.pipeline.exceptions import CannotRestorePipelineException, PipelineConfigMissing, PipelineStepFailed from dlt.common.schema.exceptions import CannotCoerceColumnException from dlt.common.exceptions import DestinationHasFailedJobs @@ -66,8 +66,8 @@ def data_fun() -> Iterator[Any]: # mock the correct destinations (never do that in normal code) with p.managed_state(): p._set_destinations( - DestinationReference.from_name(destination_config.destination), - DestinationReference.from_name(destination_config.staging) if destination_config.staging else None + Destination.from_reference(destination_config.destination), + Destination.from_reference(destination_config.staging) if destination_config.staging else None ) # does not reset the dataset name assert p.dataset_name in possible_dataset_names diff --git a/tests/load/pipeline/test_write_disposition_changes.py b/tests/load/pipeline/test_write_disposition_changes.py index 158993b7c8..c88fd79588 100644 --- a/tests/load/pipeline/test_write_disposition_changes.py +++ b/tests/load/pipeline/test_write_disposition_changes.py @@ -77,7 +77,7 @@ def source(): if with_root_key: assert pipeline.default_schema._normalizers_config["json"]["config"]["propagation"]["root"] == {'_dlt_id': '_dlt_root_id'} else: - assert "propagation" not in pipeline.default_schema._normalizers_config["json"]["config"] + assert "propagation" not in pipeline.default_schema._normalizers_config["json"].get("config", {}) # without a root key this will fail, it is expected if not with_root_key and destination_config.supports_merge: diff --git a/tests/load/pipeline/utils.py b/tests/load/pipeline/utils.py index 752571591c..94fbc80cf8 100644 --- a/tests/load/pipeline/utils.py +++ b/tests/load/pipeline/utils.py @@ -1,22 +1,19 @@ -import posixpath, os -from typing import Any, Iterator, List, Sequence, TYPE_CHECKING, Optional, Tuple, Dict, Callable +from typing import Any, Iterator, List, Sequence, TYPE_CHECKING, Callable import pytest import dlt from dlt.common.destination.reference import WithStagingDataset -from dlt.pipeline.pipeline import Pipeline -from dlt.common import json from dlt.common.configuration.container import Container from dlt.common.pipeline import LoadInfo, PipelineContext -from dlt.common.typing import DictStrAny -from dlt.pipeline.exceptions import SqlClientNotAvailable -from dlt.common.schema.typing import LOADS_TABLE_NAME +from tests.pipeline.utils import (load_table_counts, load_data_table_counts, assert_data_table_counts, load_file, + load_files, load_tables_to_dicts, load_table_distinct_counts) from tests.load.utils import DestinationTestConfiguration, destinations_configs if TYPE_CHECKING: - from dlt.destinations.filesystem.filesystem import FilesystemClient + from dlt.destinations.impl.filesystem.filesystem import FilesystemClient + @pytest.fixture(autouse=True) def drop_pipeline(request) -> Iterator[None]: @@ -67,7 +64,7 @@ def _drop_dataset(schema_name: str) -> None: def _is_filesystem(p: dlt.Pipeline) -> bool: if not p.destination: return False - return p.destination.__name__.rsplit('.', 1)[-1] == 'filesystem' + return p.destination.name == 'filesystem' def assert_table(p: dlt.Pipeline, table_name: str, table_data: List[Any], schema_name: str = None, info: LoadInfo = None) -> None: @@ -120,149 +117,3 @@ def assert_query_data(p: dlt.Pipeline, sql: str, table_data: List[Any], schema_n # the second is load id if info: assert row[1] in info.loads_ids - - -def load_file(path: str, 
file: str) -> Tuple[str, List[Dict[str, Any]]]: - """ - util function to load a filesystem destination file and return parsed content - values may not be cast to the right type, especially for insert_values, please - make sure to do conversions and casting if needed in your tests - """ - result: List[Dict[str, Any]] = [] - - # check if this is a file we want to read - file_name_items = file.split(".") - ext = file_name_items[-1] - if ext not in ["jsonl", "insert_values", "parquet"]: - return "skip", [] - - # table name will be last element of path - table_name = path.split("/")[-1] - - # skip loads table - if table_name == "_dlt_loads": - return table_name, [] - - full_path = posixpath.join(path, file) - - # load jsonl - if ext == "jsonl": - with open(full_path, "rU", encoding="utf-8") as f: - for line in f: - result.append(json.loads(line)) - - # load insert_values (this is a bit volatile if the exact format of the source file changes) - elif ext == "insert_values": - with open(full_path, "rU", encoding="utf-8") as f: - lines = f.readlines() - # extract col names - cols = lines[0][15:-2].split(",") - for line in lines[2:]: - values = line[1:-3].split(",") - result.append(dict(zip(cols, values))) - - # load parquet - elif ext == "parquet": - import pyarrow.parquet as pq - with open(full_path, "rb") as f: - table = pq.read_table(f) - cols = table.column_names - count = 0 - for column in table: - column_name = cols[count] - item_count = 0 - for item in column.to_pylist(): - if len(result) <= item_count: - result.append({column_name: item}) - else: - result[item_count][column_name] = item - item_count += 1 - count += 1 - - return table_name, result - - -def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" - client: FilesystemClient = p.destination_client() # type: ignore[assignment] - result: Dict[str, Any] = {} - for basedir, _dirs, files in client.fs_client.walk(client.dataset_path, detail=False, refresh=True): - for file in files: - table_name, items = load_file(basedir, file) - if table_name not in table_names: - continue - if table_name in result: - result[table_name] = result[table_name] + items - else: - result[table_name] = items - - # loads file is special case - if LOADS_TABLE_NAME in table_names and file.find(".{LOADS_TABLE_NAME}."): - result[LOADS_TABLE_NAME] = [] - - return result - - -def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: - """Returns row counts for `table_names` as dict""" - - # try sql, could be other destination though - try: - with p.sql_client() as c: - qualified_names = [c.make_qualified_table_name(name) for name in table_names] - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" for name, q_name in zip(table_names, qualified_names)]) - with c.execute_query(query) as cur: - rows = list(cur.fetchall()) - return {r[0]: r[1] for r in rows} - except SqlClientNotAvailable: - pass - - # try filesystem - file_tables = load_files(p, *table_names) - result = {} - for table_name, items in file_tables.items(): - result[table_name] = len(items) - return result - -def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: - tables = [table["name"] for table in p.default_schema.data_tables()] - return load_table_counts(p, *tables) - - -def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> None: - table_counts = 
load_data_table_counts(p) - assert table_counts == expected_counts, f"Table counts do not match, expected {expected_counts}, got {table_counts}" - - -def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: - - # try sql, could be other destination though - try: - result = {} - for table_name in table_names: - table_rows = [] - columns = p.default_schema.get_table_columns(table_name).keys() - query_columns = ",".join(columns) - - with p.sql_client() as c: - f_q_table_name = c.make_qualified_table_name(table_name) - query = f"SELECT {query_columns} FROM {f_q_table_name}" - with c.execute_query(query) as cur: - for row in list(cur.fetchall()): - table_rows.append(dict(zip(columns, row))) - result[table_name] = table_rows - return result - - except SqlClientNotAvailable: - pass - - # try files - return load_files(p, *table_names) - -def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_names: str) -> DictStrAny: - """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" - query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" for name in table_names]) - with p.sql_client() as c: - with c.execute_query(query) as cur: - rows = list(cur.fetchall()) - return {r[0]: r[1] for r in rows} diff --git a/tests/load/postgres/test_postgres_client.py b/tests/load/postgres/test_postgres_client.py index dcc242cf50..65ac61cfd4 100644 --- a/tests/load/postgres/test_postgres_client.py +++ b/tests/load/postgres/test_postgres_client.py @@ -7,9 +7,9 @@ from dlt.common.storages import FileStorage from dlt.common.utils import uniq_id -from dlt.destinations.postgres.configuration import PostgresCredentials -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.postgres.sql_client import psycopg2 +from dlt.destinations.impl.postgres.configuration import PostgresCredentials +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.postgres.sql_client import psycopg2 from tests.utils import TEST_STORAGE_ROOT, delete_test_storage, skipifpypy, preserve_environ from tests.load.utils import expect_load_file, prepare_table, yield_client_with_storage diff --git a/tests/load/postgres/test_postgres_table_builder.py b/tests/load/postgres/test_postgres_table_builder.py index 165c62a468..1d6965c0c0 100644 --- a/tests/load/postgres/test_postgres_table_builder.py +++ b/tests/load/postgres/test_postgres_table_builder.py @@ -5,8 +5,8 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.postgres.configuration import PostgresClientConfiguration, PostgresCredentials +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.postgres.configuration import PostgresClientConfiguration, PostgresCredentials from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/qdrant/test_pipeline.py b/tests/load/qdrant/test_pipeline.py index 303a5de69f..760eec4631 100644 --- a/tests/load/qdrant/test_pipeline.py +++ b/tests/load/qdrant/test_pipeline.py @@ -5,8 +5,8 @@ from dlt.common import json from dlt.common.utils import uniq_id -from dlt.destinations.qdrant.qdrant_adapter import qdrant_adapter, VECTORIZE_HINT -from dlt.destinations.qdrant.qdrant_client import QdrantClient +from dlt.destinations.impl.qdrant.qdrant_adapter import qdrant_adapter, VECTORIZE_HINT +from 
dlt.destinations.impl.qdrant.qdrant_client import QdrantClient from tests.pipeline.utils import assert_load_info from tests.load.qdrant.utils import drop_active_pipeline_data, assert_collection diff --git a/tests/load/qdrant/utils.py b/tests/load/qdrant/utils.py index 96b582a28e..1dfacbee7f 100644 --- a/tests/load/qdrant/utils.py +++ b/tests/load/qdrant/utils.py @@ -5,7 +5,7 @@ from dlt.common.pipeline import PipelineContext from dlt.common.configuration.container import Container -from dlt.destinations.qdrant.qdrant_client import QdrantClient +from dlt.destinations.impl.qdrant.qdrant_client import QdrantClient def assert_unordered_list_equal(list1: List[Any], list2: List[Any]) -> None: diff --git a/tests/load/redshift/test_redshift_client.py b/tests/load/redshift/test_redshift_client.py index 9839965b70..7f617024df 100644 --- a/tests/load/redshift/test_redshift_client.py +++ b/tests/load/redshift/test_redshift_client.py @@ -12,8 +12,8 @@ from dlt.common.utils import uniq_id from dlt.destinations.exceptions import DatabaseTerminalException -from dlt.destinations.redshift.configuration import RedshiftCredentials -from dlt.destinations.redshift.redshift import RedshiftClient, psycopg2 +from dlt.destinations.impl.redshift.configuration import RedshiftCredentials +from dlt.destinations.impl.redshift.redshift import RedshiftClient, psycopg2 from tests.common.utils import COMMON_TEST_CASES_PATH from tests.utils import TEST_STORAGE_ROOT, autouse_test_storage, skipifpypy diff --git a/tests/load/redshift/test_redshift_table_builder.py b/tests/load/redshift/test_redshift_table_builder.py index 8c61ccc1f2..2e0feb44e7 100644 --- a/tests/load/redshift/test_redshift_table_builder.py +++ b/tests/load/redshift/test_redshift_table_builder.py @@ -6,8 +6,8 @@ from dlt.common.schema import Schema from dlt.common.configuration import resolve_configuration -from dlt.destinations.redshift.redshift import RedshiftClient -from dlt.destinations.redshift.configuration import RedshiftClientConfiguration, RedshiftCredentials +from dlt.destinations.impl.redshift.redshift import RedshiftClient +from dlt.destinations.impl.redshift.configuration import RedshiftClientConfiguration, RedshiftCredentials from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/snowflake/test_snowflake_configuration.py b/tests/load/snowflake/test_snowflake_configuration.py index 7108ad06e5..abf80a1241 100644 --- a/tests/load/snowflake/test_snowflake_configuration.py +++ b/tests/load/snowflake/test_snowflake_configuration.py @@ -9,7 +9,7 @@ from dlt.common.configuration.exceptions import ConfigurationValueError from dlt.common.utils import digest128 -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials +from dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials from tests.common.configuration.utils import environment diff --git a/tests/load/snowflake/test_snowflake_table_builder.py b/tests/load/snowflake/test_snowflake_table_builder.py index 81164625f9..9ede1c8d13 100644 --- a/tests/load/snowflake/test_snowflake_table_builder.py +++ b/tests/load/snowflake/test_snowflake_table_builder.py @@ -5,8 +5,8 @@ from dlt.common.utils import uniq_id from dlt.common.schema import Schema -from dlt.destinations.snowflake.snowflake import SnowflakeClient -from dlt.destinations.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials +from dlt.destinations.impl.snowflake.snowflake import SnowflakeClient +from 
dlt.destinations.impl.snowflake.configuration import SnowflakeClientConfiguration, SnowflakeCredentials from dlt.destinations.exceptions import DestinationSchemaWillNotUpdate from tests.load.utils import TABLE_UPDATE diff --git a/tests/load/test_dummy_client.py b/tests/load/test_dummy_client.py index e7e0166177..9edc49a607 100644 --- a/tests/load/test_dummy_client.py +++ b/tests/load/test_dummy_client.py @@ -11,14 +11,14 @@ from dlt.common.storages import FileStorage, LoadStorage from dlt.common.storages.load_storage import JobWithUnsupportedWriterException from dlt.common.utils import uniq_id -from dlt.common.destination.reference import DestinationReference, LoadJob +from dlt.common.destination.reference import Destination, LoadJob, TDestination from dlt.load import Load from dlt.destinations.job_impl import EmptyLoadJob from dlt.destinations import dummy -from dlt.destinations.dummy import dummy as dummy_impl -from dlt.destinations.dummy.configuration import DummyClientConfiguration +from dlt.destinations.impl.dummy import dummy as dummy_impl +from dlt.destinations.impl.dummy.configuration import DummyClientConfiguration from dlt.load.exceptions import LoadClientJobFailed, LoadClientJobRetry from dlt.common.schema.utils import get_top_level_table @@ -184,7 +184,7 @@ def test_spool_job_failed_exception_init() -> None: def test_spool_job_failed_exception_complete() -> None: # this config fails job on start - os.environ["RAISE_ON_FAILED_JOBS"] = "true" + os.environ["LOAD__RAISE_ON_FAILED_JOBS"] = "true" os.environ["FAIL_IN_INIT"] = "false" load = setup_loader(client_config=DummyClientConfiguration(fail_prob=1.0)) load_id, _ = prepare_load_package( @@ -340,7 +340,7 @@ def test_retry_on_new_loop() -> None: assert len(files) == 0 # complete package load.run(pool) - assert not load.load_storage.storage.has_folder(load.load_storage.get_package_path(load_id)) + assert not load.load_storage.storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) # parse the completed job names completed_path = load.load_storage.get_completed_package_path(load_id) for fn in load.load_storage.storage.list_folder_files(os.path.join(completed_path, LoadStorage.COMPLETED_JOBS_FOLDER)): @@ -382,7 +382,7 @@ def test_load_single_thread() -> None: metrics = load.run(None) while metrics.pending_items > 0: metrics = load.run(None) - assert not load.load_storage.storage.has_folder(load.load_storage.get_package_path(load_id)) + assert not load.load_storage.storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) def test_wrong_writer_type() -> None: @@ -417,11 +417,11 @@ def assert_complete_job(load: Load, storage: FileStorage, should_delete_complete with ThreadPoolExecutor() as pool: load.run(pool) # did process schema update - assert storage.has_file(os.path.join(load.load_storage.get_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME)) + assert storage.has_file(os.path.join(load.load_storage.get_normalized_package_path(load_id), LoadStorage.APPLIED_SCHEMA_UPDATES_FILE_NAME)) # will finalize the whole package load.run(pool) # moved to loaded - assert not storage.has_folder(load.load_storage.get_package_path(load_id)) + assert not storage.has_folder(load.load_storage.get_normalized_package_path(load_id)) completed_path = load.load_storage._get_job_folder_completed_path(load_id, "completed_jobs") if should_delete_completed: # package was deleted @@ -445,7 +445,7 @@ def run_all(load: Load) -> None: def setup_loader(delete_completed_jobs: bool = False, client_config: 
DummyClientConfiguration = None) -> Load: # reset jobs for a test dummy_impl.JOBS = {} - destination: DestinationReference = dummy # type: ignore[assignment] + destination: TDestination = dummy() # type: ignore[assignment] client_config = client_config or DummyClientConfiguration(loader_file_format="jsonl") # patch destination to provide client_config # destination.client = lambda schema: dummy_impl.DummyClient(schema, client_config) diff --git a/tests/load/test_insert_job_client.py b/tests/load/test_insert_job_client.py index 95e63a79f2..86049b035a 100644 --- a/tests/load/test_insert_job_client.py +++ b/tests/load/test_insert_job_client.py @@ -52,7 +52,7 @@ def test_simple_load(client: InsertValuesJobClient, file_storage: FileStorage) - def test_loading_errors(client: InsertValuesJobClient, file_storage: FileStorage) -> None: # test expected dbiapi exceptions for supported destinations import duckdb - from dlt.destinations.postgres.sql_client import psycopg2 + from dlt.destinations.impl.postgres.sql_client import psycopg2 TNotNullViolation = psycopg2.errors.NotNullViolation TNumericValueOutOfRange = psycopg2.errors.NumericValueOutOfRange diff --git a/tests/load/test_job_client.py b/tests/load/test_job_client.py index 35394ed1c6..e08919424a 100644 --- a/tests/load/test_job_client.py +++ b/tests/load/test_job_client.py @@ -337,7 +337,7 @@ def test_preserve_column_order(client: SqlJobClientBase) -> None: import random columns = deepcopy(TABLE_UPDATE) random.shuffle(columns) - print(columns) + schema.update_table(new_table(table_name, columns=columns)) schema.bump_version() diff --git a/tests/load/utils.py b/tests/load/utils.py index be2097c879..f591f51585 100644 --- a/tests/load/utils.py +++ b/tests/load/utils.py @@ -12,8 +12,8 @@ from dlt.common.configuration import resolve_configuration from dlt.common.configuration.container import Container from dlt.common.configuration.specs.config_section_context import ConfigSectionContext -from dlt.common.destination.reference import DestinationClientDwhConfiguration, DestinationReference, JobClientBase, LoadJob, DestinationClientStagingConfiguration, WithStagingDataset, TDestinationReferenceArg -from dlt.common.destination import TLoaderFileFormat +from dlt.common.destination.reference import DestinationClientDwhConfiguration, JobClientBase, LoadJob, DestinationClientStagingConfiguration, WithStagingDataset, TDestinationReferenceArg +from dlt.common.destination import TLoaderFileFormat, Destination from dlt.common.data_writers import DataWriter from dlt.common.schema import TColumnSchema, TTableSchemaColumns, Schema from dlt.common.storages import SchemaStorage, FileStorage, SchemaStorageConfiguration @@ -229,15 +229,15 @@ def yield_client( ) -> Iterator[SqlJobClientBase]: os.environ.pop("DATASET_NAME", None) # import destination reference by name - destination = import_module(f"dlt.destinations.{destination_name}") + destination = Destination.from_reference(destination_name) # create initial config dest_config: DestinationClientDwhConfiguration = None - dest_config = destination.spec()() + dest_config = destination.spec() # type: ignore[assignment] dest_config.dataset_name = dataset_name # type: ignore[misc] # TODO: Why is dataset_name final? 
if default_config_values is not None: # apply the values to credentials, if dict is provided it will be used as default - dest_config.credentials = default_config_values # type: ignore[assignment] + # dest_config.credentials = default_config_values # type: ignore[assignment] # also apply to config dest_config.update(default_config_values) # get event default schema @@ -261,7 +261,7 @@ def yield_client( # lookup for credentials in the section that is destination name with Container().injectable_context(ConfigSectionContext(sections=("destination", destination_name,))): - with destination.client(schema, dest_config) as client: + with destination.client(schema, dest_config) as client: # type: ignore[assignment] yield client @contextlib.contextmanager diff --git a/tests/load/weaviate/test_naming.py b/tests/load/weaviate/test_naming.py index 488d66b725..850f70ee19 100644 --- a/tests/load/weaviate/test_naming.py +++ b/tests/load/weaviate/test_naming.py @@ -1,7 +1,7 @@ import dlt, pytest -from dlt.destinations.weaviate.naming import NamingConvention -from dlt.destinations.weaviate.ci_naming import NamingConvention as CINamingConvention +from dlt.destinations.impl.weaviate.naming import NamingConvention +from dlt.destinations.impl.weaviate.ci_naming import NamingConvention as CINamingConvention from tests.common.utils import load_yml_case diff --git a/tests/load/weaviate/test_pipeline.py b/tests/load/weaviate/test_pipeline.py index 339c94575e..691281c63e 100644 --- a/tests/load/weaviate/test_pipeline.py +++ b/tests/load/weaviate/test_pipeline.py @@ -6,10 +6,10 @@ from dlt.common.schema import Schema from dlt.common.utils import uniq_id -from dlt.destinations.weaviate import weaviate_adapter -from dlt.destinations.weaviate.exceptions import PropertyNameConflict -from dlt.destinations.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT -from dlt.destinations.weaviate.weaviate_client import WeaviateClient +from dlt.destinations.impl.weaviate import weaviate_adapter +from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict +from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT +from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient from dlt.pipeline.exceptions import PipelineStepFailed from tests.pipeline.utils import assert_load_info @@ -374,7 +374,7 @@ def test_vectorize_property_without_data() -> None: # set the naming convention to case insensitive # os.environ["SCHEMA__NAMING"] = "direct" - dlt.config["schema.naming"] = "dlt.destinations.weaviate.ci_naming" + dlt.config["schema.naming"] = "dlt.destinations.impl.weaviate.ci_naming" # create new schema with changed naming convention p = p.drop() info = p.run(weaviate_adapter(["there are", "no stop", "words in here"], vectorize="vAlue"), primary_key="vALue", columns={"vAlue": {"data_type": "text"}}) diff --git a/tests/load/weaviate/test_weaviate_client.py b/tests/load/weaviate/test_weaviate_client.py index d102610f68..ca9d853d98 100644 --- a/tests/load/weaviate/test_weaviate_client.py +++ b/tests/load/weaviate/test_weaviate_client.py @@ -9,8 +9,8 @@ from dlt.common.schema.typing import TWriteDisposition, TColumnSchema, TTableSchemaColumns from dlt.destinations import weaviate -from dlt.destinations.weaviate.exceptions import PropertyNameConflict -from dlt.destinations.weaviate.weaviate_client import WeaviateClient +from dlt.destinations.impl.weaviate.exceptions import PropertyNameConflict +from dlt.destinations.impl.weaviate.weaviate_client import 
WeaviateClient from dlt.common.storages.file_storage import FileStorage from dlt.common.schema.utils import new_table @@ -27,9 +27,10 @@ def drop_weaviate_schema() -> Iterator[None]: def get_client_instance(schema: Schema) -> WeaviateClient: - config = weaviate.spec()(dataset_name="ClientTest" + uniq_id()) - with Container().injectable_context(ConfigSectionContext(sections=('destination', 'weaviate'))): - return weaviate.client(schema, config) # type: ignore[return-value] + dest = weaviate(dataset_name="ClientTest" + uniq_id()) + return dest.client(schema, dest.spec()) + # with Container().injectable_context(ConfigSectionContext(sections=('destination', 'weaviate'))): + # return dest.client(schema, config) @pytest.fixture(scope='function') @@ -44,7 +45,7 @@ def ci_client() -> Iterator[WeaviateClient]: def make_client(naming_convention: str) -> Iterator[WeaviateClient]: schema = Schema('test_schema', { - 'names': f"dlt.destinations.weaviate.{naming_convention}", + 'names': f"dlt.destinations.impl.weaviate.{naming_convention}", 'json': None }) _client = get_client_instance(schema) diff --git a/tests/load/weaviate/utils.py b/tests/load/weaviate/utils.py index d5568b0598..ed378191e6 100644 --- a/tests/load/weaviate/utils.py +++ b/tests/load/weaviate/utils.py @@ -6,8 +6,8 @@ from dlt.common.configuration.container import Container from dlt.common.schema.utils import get_columns_names_with_prop -from dlt.destinations.weaviate.weaviate_client import WeaviateClient -from dlt.destinations.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT +from dlt.destinations.impl.weaviate.weaviate_client import WeaviateClient +from dlt.destinations.impl.weaviate.weaviate_adapter import VECTORIZE_HINT, TOKENIZATION_HINT def assert_unordered_list_equal(list1: List[Any], list2: List[Any]) -> None: diff --git a/tests/normalize/test_normalize.py b/tests/normalize/test_normalize.py index 2484b1ea61..12b6267a59 100644 --- a/tests/normalize/test_normalize.py +++ b/tests/normalize/test_normalize.py @@ -226,7 +226,7 @@ def test_normalize_many_schemas(caps: DestinationCapabilitiesContext, rasa_norma with ProcessPoolExecutor(max_workers=4) as p: rasa_normalize.run(p) # must have two loading groups with model and event schemas - loads = rasa_normalize.load_storage.list_packages() + loads = rasa_normalize.load_storage.list_normalized_packages() assert len(loads) == 2 schemas = [] # load all schemas @@ -247,7 +247,7 @@ def test_normalize_typed_json(caps: DestinationCapabilitiesContext, raw_normaliz extract_items(raw_normalize.normalize_storage, [JSON_TYPED_DICT], "special", "special") with ThreadPoolExecutor(max_workers=1) as pool: raw_normalize.run(pool) - loads = raw_normalize.load_storage.list_packages() + loads = raw_normalize.load_storage.list_normalized_packages() assert len(loads) == 1 # load all schemas schema = raw_normalize.load_storage.load_package_schema(loads[0]) @@ -438,7 +438,7 @@ def get_line_from_file(load_storage: LoadStorage, loaded_files: List[str], retur def assert_timestamp_data_type(load_storage: LoadStorage, data_type: TDataType) -> None: # load generated schema - loads = load_storage.list_packages() + loads = load_storage.list_normalized_packages() event_schema = load_storage.load_package_schema(loads[0]) # in raw normalize timestamp column must not be coerced to timestamp assert event_schema.get_table_columns("event")["timestamp"]["data_type"] == data_type diff --git a/tests/normalize/utils.py b/tests/normalize/utils.py index 3ee14948c1..0ce099d4b6 100644 --- a/tests/normalize/utils.py 
+++ b/tests/normalize/utils.py @@ -1,10 +1,10 @@ from typing import Mapping, cast -from dlt.destinations.duckdb import capabilities as duck_insert_caps -from dlt.destinations.redshift import capabilities as rd_insert_caps -from dlt.destinations.postgres import capabilities as pg_insert_caps -from dlt.destinations.bigquery import capabilities as jsonl_caps -from dlt.destinations.filesystem import capabilities as filesystem_caps +from dlt.destinations.impl.duckdb import capabilities as duck_insert_caps +from dlt.destinations.impl.redshift import capabilities as rd_insert_caps +from dlt.destinations.impl.postgres import capabilities as pg_insert_caps +from dlt.destinations.impl.bigquery import capabilities as jsonl_caps +from dlt.destinations.impl.filesystem import capabilities as filesystem_caps DEFAULT_CAPS = pg_insert_caps diff --git a/tests/pipeline/test_arrow_sources.py b/tests/pipeline/test_arrow_sources.py index 31d5d001df..686ad2ffd3 100644 --- a/tests/pipeline/test_arrow_sources.py +++ b/tests/pipeline/test_arrow_sources.py @@ -6,15 +6,17 @@ import os import io import pyarrow as pa -from typing import List import dlt +from dlt.common import json, Decimal from dlt.common.utils import uniq_id +from dlt.common.libs.pyarrow import NameNormalizationClash + from dlt.pipeline.exceptions import PipelineStepFailed + from tests.cases import arrow_table_all_data_types, TArrowFormat from tests.utils import preserve_environ -from dlt.common import json -from dlt.common import Decimal + @pytest.mark.parametrize( @@ -87,7 +89,6 @@ def some_data(): assert schema_columns['json']['data_type'] == 'complex' - @pytest.mark.parametrize( ("item_type", "is_list"), [("pandas", False), ("table", False), ("record_batch", False), ("pandas", True), ("table", True), ("record_batch", True)] ) @@ -181,6 +182,44 @@ def data_frames(): assert len(pipeline.get_load_package_info(load_id).jobs["new_jobs"]) == 10 +@pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) +def test_arrow_clashing_names(item_type: TArrowFormat) -> None: + # # use parquet for dummy + os.environ["DESTINATION__LOADER_FILE_FORMAT"] = "parquet" + pipeline_name = "arrow_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="dummy") + + item, _ = arrow_table_all_data_types(item_type, include_name_clash=True) + + @dlt.resource + def data_frames(): + for _ in range(10): + yield item + + with pytest.raises(PipelineStepFailed) as py_ex: + pipeline.extract(data_frames()) + assert isinstance(py_ex.value.__context__, NameNormalizationClash) + + +@pytest.mark.parametrize("item_type", ["table", "record_batch"]) +def test_load_arrow_vary_schema(item_type: TArrowFormat) -> None: + pipeline_name = "arrow_" + uniq_id() + pipeline = dlt.pipeline(pipeline_name=pipeline_name, destination="duckdb") + + item, _ = arrow_table_all_data_types(item_type, include_not_normalized_name=False) + pipeline.run(item, table_name="data").raise_on_failed_jobs() + + item, _ = arrow_table_all_data_types(item_type, include_not_normalized_name=False) + # remove int column + try: + item = item.drop("int") + except AttributeError: + names = item.schema.names + names.remove("int") + item = item.select(names) + pipeline.run(item, table_name="data").raise_on_failed_jobs() + + @pytest.mark.parametrize("item_type", ["pandas", "table", "record_batch"]) def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: os.environ["RESTORE_FROM_DESTINATION"] = "False" @@ -199,7 +238,7 @@ def test_arrow_as_data_loading(item_type: TArrowFormat) -> None: 
assert info.row_counts["items"] == len(rows) -@pytest.mark.parametrize("item_type", ["table", "pandas", "record_batch"]) +@pytest.mark.parametrize("item_type", ["table"]) # , "pandas", "record_batch" def test_normalize_with_dlt_columns(item_type: TArrowFormat): item, records = arrow_table_all_data_types(item_type, num_rows=5432) os.environ['NORMALIZE__PARQUET_NORMALIZER__ADD_DLT_LOAD_ID'] = "True" @@ -212,10 +251,10 @@ def test_normalize_with_dlt_columns(item_type: TArrowFormat): def some_data(): yield item - pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="filesystem") + pipeline = dlt.pipeline("arrow_" + uniq_id(), destination="duckdb") pipeline.extract(some_data()) - pipeline.normalize() + pipeline.normalize(loader_file_format="parquet") load_id = pipeline.list_normalized_load_packages()[0] storage = pipeline._get_load_storage() @@ -241,3 +280,26 @@ def some_data(): schema = pipeline.default_schema assert schema.tables['some_data']['columns']['_dlt_id']['data_type'] == 'text' assert schema.tables['some_data']['columns']['_dlt_load_id']['data_type'] == 'text' + + pipeline.load().raise_on_failed_jobs() + + # should be able to load again + pipeline.run(some_data()).raise_on_failed_jobs() + + # should be able to load arrow without a column + try: + item = item.drop("int") + except AttributeError: + names = item.schema.names + names.remove("int") + item = item.select(names) + pipeline.run(item, table_name="some_data").raise_on_failed_jobs() + + # should be able to load arrow with a new column + # TODO: uncomment when load_id fixed in normalizer + # item, records = arrow_table_all_data_types(item_type, num_rows=200) + # item = item.append_column("static_int", [[0] * 200]) + # pipeline.run(item, table_name="some_data").raise_on_failed_jobs() + + # schema = pipeline.default_schema + # assert schema.tables['some_data']['columns']['static_int']['data_type'] == 'bigint' diff --git a/tests/pipeline/test_dlt_versions.py b/tests/pipeline/test_dlt_versions.py index 3d79c22b14..7ac7dcbb34 100644 --- a/tests/pipeline/test_dlt_versions.py +++ b/tests/pipeline/test_dlt_versions.py @@ -10,8 +10,8 @@ from dlt.common.storages import FileStorage from dlt.common.schema.typing import LOADS_TABLE_NAME, VERSION_TABLE_NAME, TStoredSchema from dlt.common.configuration.resolve import resolve_configuration -from dlt.destinations.duckdb.configuration import DuckDbClientConfiguration -from dlt.destinations.duckdb.sql_client import DuckDbSqlClient +from dlt.destinations.impl.duckdb.configuration import DuckDbClientConfiguration +from dlt.destinations.impl.duckdb.sql_client import DuckDbSqlClient from tests.utils import TEST_STORAGE_ROOT, test_storage diff --git a/tests/pipeline/test_pipeline.py b/tests/pipeline/test_pipeline.py index 16cce0c7f6..e761186d3a 100644 --- a/tests/pipeline/test_pipeline.py +++ b/tests/pipeline/test_pipeline.py @@ -1,41 +1,39 @@ import itertools import logging import os -import random -from typing import Any, Optional, Iterator, Dict, Any, cast +from typing import Any, Any, cast from tenacity import retry_if_exception, Retrying, stop_after_attempt -from pydantic import BaseModel import pytest import dlt -from dlt.common import json, sleep, pendulum +from dlt.common import json, pendulum from dlt.common.configuration.container import Container from dlt.common.configuration.specs.aws_credentials import AwsCredentials from dlt.common.configuration.specs.exceptions import NativeValueError from dlt.common.configuration.specs.gcp_credentials import GcpOAuthCredentials from 
dlt.common.destination import DestinationCapabilitiesContext -from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.destination.reference import WithStateSync from dlt.common.exceptions import DestinationHasFailedJobs, DestinationTerminalException, PipelineStateNotAvailable, UnknownDestinationModule from dlt.common.pipeline import PipelineContext -from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector +from dlt.common.runtime.collector import LogCollector from dlt.common.schema.utils import new_column, new_table from dlt.common.utils import uniq_id +from dlt.common.schema import Schema +from dlt.destinations import filesystem, redshift, dummy from dlt.extract.exceptions import InvalidResourceDataTypeBasic, PipeGenInvalid, SourceExhausted from dlt.extract.extract import ExtractorStorage -from dlt.extract.source import DltResource, DltSource +from dlt.extract import DltResource, DltSource from dlt.load.exceptions import LoadClientJobFailed from dlt.pipeline.exceptions import InvalidPipelineName, PipelineNotActive, PipelineStepFailed from dlt.pipeline.helpers import retry_load -from dlt.pipeline import TCollectorArg from tests.common.utils import TEST_SENTRY_DSN -from tests.load.pipeline.utils import destinations_configs, DestinationTestConfiguration -from tests.utils import TEST_STORAGE_ROOT from tests.common.configuration.utils import environment +from tests.utils import TEST_STORAGE_ROOT from tests.extract.utils import expect_extracted_file -from tests.pipeline.utils import assert_load_info, airtable_emojis +from tests.pipeline.utils import assert_load_info, airtable_emojis, many_delayed def test_default_pipeline() -> None: @@ -173,7 +171,7 @@ def test_configured_destination(environment) -> None: p = dlt.pipeline() assert p.destination is not None - assert p.destination.__name__.endswith("postgres") + assert p.destination.name.endswith("postgres") assert p.pipeline_name == "postgres_pipe" @@ -188,22 +186,6 @@ def test_deterministic_salt(environment) -> None: assert p.pipeline_salt != p3.pipeline_salt -@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) -def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: - # create pipelines, extract and normalize. 
that should be possible without installing any dependencies
-    p = dlt.pipeline(pipeline_name=destination_config.destination + "_pipeline", destination=destination_config.destination, staging=destination_config.staging)
-    # are capabilities injected
-    caps = p._container[DestinationCapabilitiesContext]
-    print(caps.naming_convention)
-    # are right naming conventions created
-    assert p._default_naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length)
-    p.extract([1, "2", 3], table_name="data")
-    # is default schema with right naming convention
-    assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length)
-    p.normalize()
-    assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length)
-
-
 def test_destination_explicit_credentials(environment: Any) -> None:
     # test redshift
     p = dlt.pipeline(pipeline_name="postgres_pipeline", destination="redshift", credentials="redshift://loader:loader@localhost:5432/dlt_data")
@@ -228,6 +210,56 @@ def test_destination_explicit_credentials(environment: Any) -> None:
     assert config.credentials.is_resolved()
+
+def test_destination_staging_config(environment: Any) -> None:
+    fs_dest = filesystem("file:///testing-bucket")
+    p = dlt.pipeline(
+        pipeline_name="staging_pipeline",
+        destination=redshift(credentials="redshift://loader:loader@localhost:5432/dlt_data"),
+        staging=fs_dest
+    )
+    schema = Schema("foo")
+    p._inject_schema(schema)
+    initial_config = p._get_destination_client_initial_config(p.staging, as_staging=True)
+    staging_config = fs_dest.configuration(initial_config)  # type: ignore[arg-type]
+
+    # Ensure that the as_staging flag is set in the final resolved config
+    assert staging_config.as_staging is True
+
+
+def test_destination_factory_defaults_resolve_from_config(environment: Any) -> None:
+    """Params passed explicitly to destination supersede config values.
+    Env config values supersede default values.
+    """
+    environment["FAIL_PROB"] = "0.3"
+    environment["RETRY_PROB"] = "0.8"
+    p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=dummy(retry_prob=0.5))
+
+    client = p.destination_client()
+
+    assert client.config.fail_prob == 0.3  # type: ignore[attr-defined]
+    assert client.config.retry_prob == 0.5  # type: ignore[attr-defined]
+
+
+def test_destination_credentials_in_factory(environment: Any) -> None:
+    os.environ['DESTINATION__REDSHIFT__CREDENTIALS'] = "redshift://abc:123@localhost:5432/some_db"
+
+    redshift_dest = redshift("redshift://abc:123@localhost:5432/other_db")
+
+    p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=redshift_dest)
+
+    initial_config = p._get_destination_client_initial_config(p.destination)
+    dest_config = redshift_dest.configuration(initial_config)  # type: ignore[arg-type]
+    # Explicit factory arg supersedes config
+    assert dest_config.credentials.database == "other_db"
+
+    redshift_dest = redshift()
+    p = dlt.pipeline(pipeline_name="dummy_pipeline", destination=redshift_dest)
+
+    initial_config = p._get_destination_client_initial_config(p.destination)
+    dest_config = redshift_dest.configuration(initial_config)  # type: ignore[arg-type]
+    assert dest_config.credentials.database == "some_db"
+
+
 @pytest.mark.skip(reason="does not work on CI. 
probably takes right credentials from somewhere....") def test_destination_explicit_invalid_credentials_filesystem(environment: Any) -> None: # if string cannot be parsed @@ -297,7 +329,8 @@ def i_fail(): s4 = DltSource("module", dlt.Schema("default_4"), [dlt.resource([6, 7, 8], name="resource_3"), i_fail]) with pytest.raises(PipelineStepFailed): - p.extract([s3, s4]) + # NOTE: if you swap s3 and s4 the test on list_schemas will fail: s3 will extract normally and update live schemas, s4 will break exec later + p.extract([s4, s3]) # nothing to normalize assert len(storage.list_files_to_normalize_sorted()) == 0 @@ -444,13 +477,16 @@ def data_piece_2(): # first run didn't really happen assert p.first_run is True assert p.has_data is False - assert p._schema_storage.list_schemas() == [] assert p.default_schema_name is None + # one of the schemas is in memory + # TODO: we may want to fix that + assert len(p._schema_storage.list_schemas()) == 1 # restore the pipeline p = dlt.attach(pipeline_name) assert p.first_run is True assert p.has_data is False + # no schema was saved to storage, the one above was only in memory assert p._schema_storage.list_schemas() == [] assert p.default_schema_name is None @@ -478,12 +514,14 @@ def data_schema_3(): # first run didn't really happen assert p.first_run is True assert p.has_data is False - assert p._schema_storage.list_schemas() == [] + # schemas from two sources are in memory + # TODO: we may want to fix that + assert len(p._schema_storage.list_schemas()) == 2 assert p.default_schema_name is None os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p.run([data_schema_1(), data_schema_2()], write_disposition="replace") - assert p.schema_names == p._schema_storage.list_schemas() + assert set(p.schema_names) == set(p._schema_storage.list_schemas()) def test_run_with_table_name_exceeding_path_length() -> None: @@ -670,6 +708,8 @@ def resource_1(): assert p.default_schema.get_table("resource_1")["write_disposition"] == "append" p.run(resource_1, write_disposition="replace") + print(list(p._schema_storage.live_schemas.values())[0].to_pretty_yaml()) + assert p.schemas[p.default_schema_name].get_table("resource_1")["write_disposition"] == "replace" assert p.default_schema.get_table("resource_1")["write_disposition"] == "replace" @@ -774,52 +814,6 @@ def reverse_order(item): assert list(p.default_schema.tables["order_2"]["columns"].keys()) == ["col_3", "col_2", "col_1", '_dlt_load_id', '_dlt_id'] -def run_deferred(iters): - - @dlt.defer - def item(n): - sleep(random.random() / 2) - return n - - for n in range(iters): - yield item(n) - - -@dlt.source -def many_delayed(many, iters): - for n in range(many): - yield dlt.resource(run_deferred(iters), name="resource_" + str(n)) - - -@pytest.mark.parametrize("progress", ["tqdm", "enlighten", "log", "alive_progress"]) -def test_pipeline_progress(progress: TCollectorArg) -> None: - - os.environ["TIMEOUT"] = "3.0" - - p = dlt.pipeline(destination="dummy", progress=progress) - p.extract(many_delayed(5, 10)) - p.normalize() - - collector = p.collector - - # attach pipeline - p = dlt.attach(progress=collector) - p.extract(many_delayed(5, 10)) - p.run(dataset_name="dummy") - - assert collector == p.drop().collector - - # make sure a valid logger was used - if progress == "tqdm": - assert isinstance(collector, TqdmCollector) - if progress == "enlighten": - assert isinstance(collector, EnlightenCollector) - if progress == "alive_progress": - assert isinstance(collector, AliveCollector) - if progress == "log": - 
assert isinstance(collector, LogCollector) - - def test_pipeline_log_progress() -> None: os.environ["TIMEOUT"] = "3.0" @@ -1051,50 +1045,6 @@ def res_return_yield(): assert "dlt.resource" in str(pip_ex.value) -@pytest.mark.parametrize('method', ('extract', 'run')) -def test_column_argument_pydantic(method: str) -> None: - """Test columns schema is created from pydantic model""" - p = dlt.pipeline(destination='duckdb') - - @dlt.resource - def some_data() -> Iterator[Dict[str, Any]]: - yield {} - - class Columns(BaseModel): - a: Optional[int] - b: Optional[str] - - if method == 'run': - p.run(some_data(), columns=Columns) - else: - p.extract(some_data(), columns=Columns) - - assert p.default_schema.tables['some_data']['columns']['a']['data_type'] == 'bigint' - assert p.default_schema.tables['some_data']['columns']['a']['nullable'] is True - assert p.default_schema.tables['some_data']['columns']['b']['data_type'] == 'text' - assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True - - -def test_extract_pydantic_models() -> None: - pipeline = dlt.pipeline(destination='duckdb') - - class User(BaseModel): - user_id: int - name: str - - @dlt.resource - def users() -> Iterator[User]: - yield User(user_id=1, name="a") - yield User(user_id=2, name="b") - - pipeline.extract(users()) - - storage = ExtractorStorage(pipeline._normalize_storage_config) - expect_extracted_file( - storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) - ) - - def test_resource_rename_same_table(): @dlt.resource(write_disposition="replace") def generic(start): @@ -1133,17 +1083,6 @@ def generic(start): assert pipeline.default_schema.get_table("single_table")["resource"] == "state1" -@pytest.mark.parametrize("file_format", ("parquet", "insert_values", "jsonl")) -def test_columns_hint_with_file_formats(file_format: TLoaderFileFormat) -> None: - - @dlt.resource(write_disposition="replace", columns=[{"name": "text", "data_type": "text"}]) - def generic(start=8): - yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] - - pipeline = dlt.pipeline(destination='duckdb') - pipeline.run(generic(), loader_file_format=file_format) - - def test_remove_autodetect() -> None: now = pendulum.now() @@ -1219,3 +1158,46 @@ def test_empty_rows_are_included() -> None: values = [r[0] for r in rows] assert values == [1, None, None, None, None, None, None, None] + + +def test_resource_state_name_not_normalized() -> None: + pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb") + peacock_s = airtable_emojis().with_resources("🦚Peacock") + pipeline.extract(peacock_s) + assert peacock_s.resources["🦚Peacock"].state == {"🦚🦚🦚": "🦚"} + pipeline.normalize() + pipeline.load() + + # get state from destination + from dlt.pipeline.state_sync import load_state_from_destination + client: WithStateSync + with pipeline.destination_client() as client: # type: ignore[assignment] + state = load_state_from_destination(pipeline.pipeline_name, client) + assert "airtable_emojis" in state["sources"] + assert state["sources"]["airtable_emojis"]["resources"] == {"🦚Peacock": {"🦚🦚🦚": "🦚"}} + + +def test_remove_pending_packages() -> None: + pipeline = dlt.pipeline(pipeline_name="emojis", destination="dummy") + pipeline.extract(airtable_emojis()) + assert pipeline.has_pending_data + pipeline.drop_pending_packages() + assert pipeline.has_pending_data is False + pipeline.extract(airtable_emojis()) + pipeline.normalize() + pipeline.extract(airtable_emojis()) + 
assert pipeline.has_pending_data + pipeline.drop_pending_packages() + assert pipeline.has_pending_data is False + # partial load + os.environ["EXCEPTION_PROB"] = "1.0" + os.environ["FAIL_IN_INIT"] = "False" + os.environ["TIMEOUT"] = "1.0" + # should produce partial loads + with pytest.raises(PipelineStepFailed): + pipeline.run(airtable_emojis()) + assert pipeline.has_pending_data + pipeline.drop_pending_packages(with_partial_loads=False) + assert pipeline.has_pending_data + pipeline.drop_pending_packages() + assert pipeline.has_pending_data is False diff --git a/tests/pipeline/test_pipeline_extra.py b/tests/pipeline/test_pipeline_extra.py new file mode 100644 index 0000000000..d29bac13f2 --- /dev/null +++ b/tests/pipeline/test_pipeline_extra.py @@ -0,0 +1,176 @@ +import os +from typing import Any, ClassVar, Dict, Iterator, List, Optional +import pytest +from pydantic import BaseModel + +import dlt +from dlt.common import json, pendulum +from dlt.common.destination import DestinationCapabilitiesContext +from dlt.common.destination.capabilities import TLoaderFileFormat +from dlt.common.libs.pydantic import DltConfig +from dlt.common.runtime.collector import AliveCollector, EnlightenCollector, LogCollector, TqdmCollector +from dlt.extract.storage import ExtractorStorage +from dlt.extract.validation import PydanticValidator + +from dlt.pipeline import TCollectorArg + +from tests.extract.utils import expect_extracted_file +from tests.load.utils import DestinationTestConfiguration, destinations_configs +from tests.pipeline.utils import assert_load_info, load_data_table_counts, many_delayed + + +@pytest.mark.parametrize("destination_config", destinations_configs(default_sql_configs=True), ids=lambda x: x.name) +def test_create_pipeline_all_destinations(destination_config: DestinationTestConfiguration) -> None: + # create pipelines, extract and normalize. 
that should be possible without installing any dependencies + p = dlt.pipeline(pipeline_name=destination_config.destination + "_pipeline", destination=destination_config.destination, staging=destination_config.staging) + # are capabilities injected + caps = p._container[DestinationCapabilitiesContext] + print(caps.naming_convention) + # are right naming conventions created + assert p._default_naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + p.extract([1, "2", 3], table_name="data") + # is default schema with right naming convention + assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + p.normalize() + assert p.default_schema.naming.max_length == min(caps.max_column_identifier_length, caps.max_identifier_length) + + +@pytest.mark.parametrize("progress", ["tqdm", "enlighten", "log", "alive_progress"]) +def test_pipeline_progress(progress: TCollectorArg) -> None: + + os.environ["TIMEOUT"] = "3.0" + + p = dlt.pipeline(destination="dummy", progress=progress) + p.extract(many_delayed(5, 10)) + p.normalize() + + collector = p.collector + + # attach pipeline + p = dlt.attach(progress=collector) + p.extract(many_delayed(5, 10)) + p.run(dataset_name="dummy") + + assert collector == p.drop().collector + + # make sure a valid logger was used + if progress == "tqdm": + assert isinstance(collector, TqdmCollector) + if progress == "enlighten": + assert isinstance(collector, EnlightenCollector) + if progress == "alive_progress": + assert isinstance(collector, AliveCollector) + if progress == "log": + assert isinstance(collector, LogCollector) + + +@pytest.mark.parametrize('method', ('extract', 'run')) +def test_column_argument_pydantic(method: str) -> None: + """Test columns schema is created from pydantic model""" + p = dlt.pipeline(destination='duckdb') + + @dlt.resource + def some_data() -> Iterator[Dict[str, Any]]: + yield {} + + class Columns(BaseModel): + a: Optional[int] = None + b: Optional[str] = None + + if method == 'run': + p.run(some_data(), columns=Columns) + else: + p.extract(some_data(), columns=Columns) + + assert p.default_schema.tables['some_data']['columns']['a']['data_type'] == 'bigint' + assert p.default_schema.tables['some_data']['columns']['a']['nullable'] is True + assert p.default_schema.tables['some_data']['columns']['b']['data_type'] == 'text' + assert p.default_schema.tables['some_data']['columns']['b']['nullable'] is True + + +@pytest.mark.parametrize("yield_list", [True, False]) +def test_pydantic_columns_with_contracts(yield_list: bool) -> None: + from datetime import datetime # noqa + + class UserLabel(BaseModel): + label: str + + class User(BaseModel): + user_id: int + name: str + created_at: datetime + labels: List[str] + user_label: UserLabel + user_labels: List[UserLabel] + + dlt_config: ClassVar[DltConfig] = {"skip_complex_types": True} + + user = User( + user_id=1, + name="u1", + created_at=pendulum.now(), + labels=["l1", "l2"], + user_label=UserLabel(label="in_l1"), + user_labels=[UserLabel(label="l_l1"), UserLabel(label="l_l1")] + ) + + @dlt.resource(columns=User) + def users(users_list: List[Any]) -> Iterator[Any]: + if yield_list: + yield users_list + else: + yield from users_list + + pipeline = dlt.pipeline(destination='duckdb') + info = pipeline.run(users([user.dict(), user.dict()])) + assert_load_info(info) + print(pipeline.last_trace.last_normalize_info) + # data is passing validation, all filled in + assert load_data_table_counts(pipeline) == {"users": 2, 
"users__labels": 4, "users__user_labels": 4} + + # produce two users with extra attrs in the child model but set the rows to discard so nothing is loaded + u1 = user.dict() + u1["user_labels"][0]["extra_1"] = "extra" + u1["user_labels"][1]["extra_1"] = "extra" + u2 = user.dict() + u2["user_labels"][0]["is_extra"] = True + + r = users([u1, u2]) + r.apply_hints(schema_contract="discard_row") + validator: PydanticValidator[User] = r.validator # type: ignore[assignment] + assert validator.data_mode == "discard_row" + assert validator.column_mode == "discard_row" + pipeline.run(r) + assert load_data_table_counts(pipeline) == {"users": 2, "users__labels": 4, "users__user_labels": 4} + print(pipeline.last_trace.last_normalize_info) + + +def test_extract_pydantic_models() -> None: + pipeline = dlt.pipeline(destination='duckdb') + + class User(BaseModel): + user_id: int + name: str + + @dlt.resource + def users() -> Iterator[User]: + yield User(user_id=1, name="a") + yield User(user_id=2, name="b") + + pipeline.extract(users()) + + storage = ExtractorStorage(pipeline._normalize_storage_config) + expect_extracted_file( + storage, pipeline.default_schema_name, "users", json.dumps([{"user_id": 1, "name": "a"}, {"user_id": 2, "name": "b"}]) + ) + + +@pytest.mark.parametrize("file_format", ("parquet", "insert_values", "jsonl")) +def test_columns_hint_with_file_formats(file_format: TLoaderFileFormat) -> None: + + @dlt.resource(write_disposition="replace", columns=[{"name": "text", "data_type": "text"}]) + def generic(start=8): + yield [{"id": idx, "text": "A"*idx} for idx in range(start, start + 10)] + + pipeline = dlt.pipeline(destination='duckdb') + pipeline.run(generic(), loader_file_format=file_format) diff --git a/tests/pipeline/test_pipeline_state.py b/tests/pipeline/test_pipeline_state.py index 14b881eedc..019997ef6e 100644 --- a/tests/pipeline/test_pipeline_state.py +++ b/tests/pipeline/test_pipeline_state.py @@ -10,14 +10,13 @@ from dlt.common.storages import FileStorage from dlt.common import pipeline as state_module from dlt.common.utils import uniq_id -from dlt.destinations.job_client_impl import SqlJobClientBase from dlt.pipeline.exceptions import PipelineStateEngineNoUpgradePathException, PipelineStepFailed from dlt.pipeline.pipeline import Pipeline from dlt.pipeline.state_sync import migrate_state, STATE_ENGINE_VERSION from tests.utils import test_storage -from tests.pipeline.utils import json_case_path, load_json_case, airtable_emojis +from tests.pipeline.utils import json_case_path, load_json_case @dlt.resource() @@ -48,8 +47,8 @@ def test_restore_state_props() -> None: assert state["destination"].endswith("redshift") assert state["staging"].endswith("filesystem") # also instances are restored - assert p.destination.__name__.endswith("redshift") - assert p.staging.__name__.endswith("filesystem") + assert p.destination.name.endswith("redshift") + assert p.staging.name.endswith("filesystem") def test_managed_state() -> None: @@ -427,20 +426,3 @@ def test_migrate_state(test_storage: FileStorage) -> None: p = dlt.attach(pipeline_name="debug_pipeline", pipelines_dir=test_storage.storage_path) assert p.dataset_name == "debug_pipeline_data" assert p.default_schema_name == "example_source" - - -def test_resource_state_name_not_normalized() -> None: - pipeline = dlt.pipeline(pipeline_name="emojis", destination="duckdb") - peacock_s = airtable_emojis().with_resources("🦚Peacock") - pipeline.extract(peacock_s) - assert peacock_s.resources["🦚Peacock"].state == {"🦚🦚🦚": "🦚"} - pipeline.normalize() - 
pipeline.load()
-
-    # get state from destination
-    from dlt.pipeline.state_sync import load_state_from_destination
-    client: SqlJobClientBase
-    with pipeline.destination_client() as client:  # type: ignore[assignment]
-        state = load_state_from_destination(pipeline.pipeline_name, client)
-    assert "airtable_emojis" in state["sources"]
-    assert state["sources"]["airtable_emojis"]["resources"] == {"🦚Peacock": {"🦚🦚🦚": "🦚"}}
diff --git a/tests/pipeline/test_pipeline_trace.py b/tests/pipeline/test_pipeline_trace.py
index 4e84e14425..793e51b909 100644
--- a/tests/pipeline/test_pipeline_trace.py
+++ b/tests/pipeline/test_pipeline_trace.py
@@ -22,7 +22,7 @@ from dlt.pipeline.pipeline import Pipeline
 from dlt.pipeline.trace import PipelineTrace, SerializableResolvedValueTrace, describe_extract_data, load_trace
 from dlt.pipeline.track import slack_notify_load_success
-from dlt.extract.source import DltResource, DltSource
+from dlt.extract import DltResource, DltSource
 from dlt.extract.pipe import Pipe
 from tests.utils import start_test_telemetry
diff --git a/tests/pipeline/test_schema_contracts.py b/tests/pipeline/test_schema_contracts.py
new file mode 100644
index 0000000000..93a5abf44c
--- /dev/null
+++ b/tests/pipeline/test_schema_contracts.py
@@ -0,0 +1,601 @@
+import dlt, os, pytest
+import contextlib
+from typing import Any, Callable, Iterator, Union, Optional
+
+from dlt.common.schema.typing import TSchemaContract
+from dlt.common.utils import uniq_id
+from dlt.common.schema.exceptions import DataValidationError
+
+from dlt.extract import DltResource
+from dlt.pipeline.pipeline import Pipeline
+from dlt.pipeline.exceptions import PipelineStepFailed
+
+from tests.load.pipeline.utils import load_table_counts
+from tests.utils import TDataItemFormat, skip_if_not_active, data_to_item_format, ALL_DATA_ITEM_FORMATS
+
+skip_if_not_active("duckdb")
+
+schema_contract = ["evolve", "discard_value", "discard_row", "freeze"]
+LOCATIONS = ["source", "resource", "override"]
+SCHEMA_ELEMENTS = ["tables", "columns", "data_type"]
+
+
+@contextlib.contextmanager
+def raises_frozen_exception(check_raise: bool = True) -> Any:
+    if not check_raise:
+        yield
+        return
+    with pytest.raises(PipelineStepFailed) as py_exc:
+        yield
+    assert isinstance(py_exc.value.__context__, DataValidationError)
+
+
+def items(settings: TSchemaContract) -> Any:
+
+    # NOTE: names must be normalized
+    @dlt.resource(name="Items", write_disposition="append", schema_contract=settings)
+    def load_items():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "SomeInt": 1,
+                "name": f"item {index}"
+            }
+
+    return load_items
+
+
+def items_with_variant(settings: TSchemaContract) -> Any:
+
+    @dlt.resource(name="Items", write_disposition="append", schema_contract=settings)
+    def load_items():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "name": f"item {index}",
+                "SomeInt": "hello"
+            }
+
+    return load_items
+
+
+def items_with_new_column(settings: TSchemaContract) -> Any:
+
+    @dlt.resource(name="Items", write_disposition="append", schema_contract=settings)
+    def load_items():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "name": f"item {index}",
+                "New^Col": "hello"
+            }
+
+    return load_items
+
+
+def items_with_subtable(settings: TSchemaContract) -> Any:
+
+    @dlt.resource(name="Items", write_disposition="append", schema_contract=settings)
+    def load_items():
+        for _, index in enumerate(range(0, 10), 1):
+            yield {
+                "id": index,
+                "name": f"item {index}",
+                "sub_items": [{
+
"id": index + 1000, + "name": f"sub item {index + 1000}" + }] + } + + return load_items + +def new_items(settings: TSchemaContract) -> Any: + + @dlt.resource(name="new_items", write_disposition="append", schema_contract=settings) + def load_items(): + for _, index in enumerate(range(0, 10), 1): + yield { + "id": index, + "some_int": 1, + "name": f"item {index}" + } + + return load_items + +OLD_COLUMN_NAME = "name" +NEW_COLUMN_NAME = "new_col" +VARIANT_COLUMN_NAME = "some_int__v_text" +SUBITEMS_TABLE = "items__sub_items" +NEW_ITEMS_TABLE = "new_items" + + +def run_resource(pipeline: Pipeline, resource_fun: Callable[..., DltResource], settings: Any, item_format: TDataItemFormat = "json", duplicates: int = 1) -> None: + + for item in settings.keys(): + assert item in LOCATIONS + ev_settings = settings[item] + if ev_settings in schema_contract: + continue + for key, val in ev_settings.items(): + assert val in schema_contract + assert key in SCHEMA_ELEMENTS + + @dlt.source(name="freeze_tests", schema_contract=settings.get("source")) + def source() -> Iterator[DltResource]: + for idx in range(duplicates): + resource: DltResource = resource_fun(settings.get("resource")) + if item_format != "json": + resource._pipe.replace_gen(data_to_item_format(item_format, resource._pipe.gen())) # type: ignore + resource.table_name = resource.name + yield resource.with_name(resource.name + str(idx)) + + # run pipeline + pipeline.run(source(), schema_contract=settings.get("override")) + + # check global settings + assert pipeline.default_schema._settings.get("schema_contract", None) == (settings.get("override") or settings.get("source")) + + # check items table settings + # assert pipeline.default_schema.tables["items"].get("schema_contract", {}) == (settings.get("resource") or {}) + + # check effective table settings + # assert resolve_contract_settings_for_table(None, "items", pipeline.default_schema) == expand_schema_contract_settings(settings.get("resource") or settings.get("override") or "evolve") + +def get_pipeline(): + import duckdb + return dlt.pipeline(pipeline_name=uniq_id(), destination='duckdb', credentials=duckdb.connect(':memory:'), full_refresh=True) + + +@pytest.mark.parametrize("contract_setting", schema_contract) +@pytest.mark.parametrize("setting_location", LOCATIONS) +@pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS) +def test_new_tables(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None: + + pipeline = get_pipeline() + + full_settings = { + setting_location: { + "tables": contract_setting + }} + run_resource(pipeline, items, {}, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + run_resource(pipeline, items_with_new_column, full_settings, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # test adding new table + with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, new_items, full_settings, item_format) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts.get("new_items", 0) == (10 if contract_setting in ["evolve"] else 0) + # delete extracted files if left after 
exception
+    pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources())
+
+    # NOTE: arrow / pandas do not support variants and subtables so we must skip
+    if item_format == "json":
+        # run add variant column
+        run_resource(pipeline, items_with_variant, full_settings)
+        table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+        assert table_counts["items"] == 30
+        assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"]
+
+        # test adding new subtable
+        with raises_frozen_exception(contract_setting == "freeze"):
+            run_resource(pipeline, items_with_subtable, full_settings)
+
+        table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+        assert table_counts["items"] == (30 if contract_setting in ["freeze"] else 40)
+        assert table_counts.get(SUBITEMS_TABLE, 0) == (10 if contract_setting in ["evolve"] else 0)
+
+
+@pytest.mark.parametrize("contract_setting", schema_contract)
+@pytest.mark.parametrize("setting_location", LOCATIONS)
+@pytest.mark.parametrize("item_format", ALL_DATA_ITEM_FORMATS)
+def test_new_columns(contract_setting: str, setting_location: str, item_format: TDataItemFormat) -> None:
+
+    full_settings = {
+        setting_location: {
+            "columns": contract_setting
+        }}
+
+    pipeline = get_pipeline()
+    run_resource(pipeline, items, {}, item_format)
+    table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+    assert table_counts["items"] == 10
+    assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"]
+
+    # new should work
+    run_resource(pipeline, new_items, full_settings, item_format)
+    table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+    expected_items_count = 10
+    assert table_counts["items"] == expected_items_count
+    assert table_counts[NEW_ITEMS_TABLE] == 10
+
+    # test adding new column twice: filter will try to catch it before it is added for the second time
+    with raises_frozen_exception(contract_setting == "freeze"):
+        run_resource(pipeline, items_with_new_column, full_settings, item_format, duplicates=2)
+    # delete extracted files if left after exception
+    pipeline._get_normalize_storage().delete_extracted_files(pipeline.list_extracted_resources())
+
+    if contract_setting == "evolve":
+        assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"]
+    else:
+        assert NEW_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"]
+    table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+    expected_items_count += (20 if contract_setting in ["evolve", "discard_value"] else 0)
+    assert table_counts["items"] == expected_items_count
+
+    # NOTE: arrow / pandas do not support variants and subtables so we must skip
+    if item_format == "json":
+        # subtable should work
+        run_resource(pipeline, items_with_subtable, full_settings)
+        table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()])
+        expected_items_count += 10
+        assert table_counts["items"] == expected_items_count
+        assert table_counts[SUBITEMS_TABLE] == 10
+
+        # test adding variant column
+        run_resource(pipeline, items_with_variant, full_settings)
+        # variants are not new columns and should be able to always evolve
+        assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"]
+        table_counts = load_table_counts(pipeline, *[t["name"] for t
in pipeline.default_schema.data_tables()]) + expected_items_count += 10 + assert table_counts["items"] == expected_items_count + + +@pytest.mark.parametrize("contract_setting", schema_contract) +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_freeze_variants(contract_setting: str, setting_location: str) -> None: + + full_settings = { + setting_location: { + "data_type": contract_setting + }} + pipeline = get_pipeline() + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert OLD_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # subtable should work + run_resource(pipeline, items_with_subtable, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[SUBITEMS_TABLE] == 10 + + # new should work + run_resource(pipeline, new_items, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + assert table_counts[NEW_ITEMS_TABLE] == 10 + + # test adding new column + run_resource(pipeline, items_with_new_column, full_settings) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + assert NEW_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + + # test adding variant column + with raises_frozen_exception(contract_setting == "freeze"): + run_resource(pipeline, items_with_variant, full_settings) + + if contract_setting == "evolve": + assert VARIANT_COLUMN_NAME in pipeline.default_schema.tables["items"]["columns"] + else: + assert VARIANT_COLUMN_NAME not in pipeline.default_schema.tables["items"]["columns"] + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == (40 if contract_setting in ["evolve", "discard_value"] else 30) + + +def test_settings_precedence() -> None: + pipeline = get_pipeline() + + # load some data + run_resource(pipeline, items, {}) + + # trying to add new column when forbidden on resource will fail + run_resource(pipeline, items_with_new_column, {"resource": { + "columns": "discard_row" + }}) + + # when allowed on override it will work + run_resource(pipeline, items_with_new_column, { + "resource": {"columns": "freeze"}, + "override": {"columns": "evolve"} + }) + + +def test_settings_precedence_2() -> None: + pipeline = get_pipeline() + + # load some data + run_resource(pipeline, items, {"source": { + "data_type": "discard_row" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden on source will fail + run_resource(pipeline, items_with_variant, {"source": { + "data_type": "discard_row" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # if allowed on resource it will pass + run_resource(pipeline, items_with_variant, { + "resource": {"data_type": "evolve"}, + "source": {"data_type": "discard_row"} + }) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + + # if allowed on override 
it will also pass + run_resource(pipeline, items_with_variant, { + "resource": {"data_type": "discard_row"}, + "source": {"data_type": "discard_row"}, + "override": {"data_type": "evolve"}, + }) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 30 + + +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_change_mode(setting_location: str) -> None: + pipeline = get_pipeline() + + # load some data + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden will fail + run_resource(pipeline, items_with_variant, {setting_location: { + "data_type": "discard_row" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # now allow + run_resource(pipeline, items_with_variant, {setting_location: { + "data_type": "evolve" + }}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 20 + + +@pytest.mark.parametrize("setting_location", LOCATIONS) +def test_single_settings_value(setting_location: str) -> None: + pipeline = get_pipeline() + + run_resource(pipeline, items, {}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add variant when forbidden will fail + run_resource(pipeline, items_with_variant, {setting_location: "discard_row"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add new column will fail + run_resource(pipeline, items_with_new_column, {setting_location: "discard_row"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + + # trying to add new table will fail + run_resource(pipeline, new_items, {setting_location: "discard_row"}) + table_counts = load_table_counts(pipeline, *[t["name"] for t in pipeline.default_schema.data_tables()]) + assert table_counts["items"] == 10 + assert "new_items" not in table_counts + + +def test_data_contract_interaction() -> None: + """ + ensure data contracts with pydantic are enforced properly + """ + from pydantic import BaseModel, Extra + + class Items(BaseModel): + id: int # noqa: A003 + name: Optional[str] + amount: Union[int, str, None] + class Config: + extra = Extra.forbid + + @dlt.resource(name="items") + def get_items(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + }] + + @dlt.resource(name="items", columns=Items) + def get_items_with_model(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + }] + + @dlt.resource(name="items") + def get_items_new_col(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + "new_col": "hello" + }] + + @dlt.resource(name="items") + def get_items_subtable(): + yield from [{ + "id": 5, + "name": "dave", + "amount": 6, + "sub": [{"hello": "dave"}] + }] + + # test valid object + pipeline = get_pipeline() + # items with model work + pipeline.run([get_items_with_model()]) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + # loading once with pydantic will freeze the cols + pipeline = get_pipeline() + 
pipeline.run([get_items_with_model()]) + with raises_frozen_exception(True): + pipeline.run([get_items_new_col()]) + + # it is possible to override contract when there are new columns + # items with model alone does not work, since contract is set to freeze + pipeline = get_pipeline() + pipeline.run([get_items_with_model()]) + pipeline.run([get_items_new_col()], schema_contract="evolve") + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + +def test_different_objects_in_one_load() -> None: + + pipeline = get_pipeline() + + @dlt.resource(name="items") + def get_items(): + yield { + "id": 1, + "name": "dave", + "amount": 50 + } + yield { + "id": 2, + "name": "dave", + "amount": 50, + "new_column": "some val" + } + + pipeline.run([get_items()], schema_contract={"columns": "freeze", "tables":"evolve"}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 + + +@pytest.mark.parametrize("table_mode", ["discard_row", "evolve", "freeze"]) +def test_dynamic_tables(table_mode: str) -> None: + + pipeline = get_pipeline() + + # adding columns with a data type makes this columns complete which makes this table complete -> it fails in the normalize because + # the tables is NOT new according to normalizer so the row is not discarded + # remove that and it will pass because the table contains just one incomplete column so it is incomplete so it is treated as new + # if you uncomment update code in the extract the problem probably goes away + @dlt.resource(name="items", table_name=lambda i: i["tables"], columns={"id": {}}) + def get_items(): + yield { + "id": 1, + "tables": "one", + } + yield { + "id": 2, + "tables": "two", + "new_column": "some val" + } + with raises_frozen_exception(table_mode == "freeze"): + pipeline.run([get_items()], schema_contract={"tables": table_mode}) + + if table_mode != "freeze": + assert pipeline.last_trace.last_normalize_info.row_counts.get("one", 0) == (1 if table_mode == "evolve" else 0) + assert pipeline.last_trace.last_normalize_info.row_counts.get("two", 0) == (1 if table_mode == "evolve" else 0) + + +@pytest.mark.parametrize("column_mode", ["discard_row", "evolve", "freeze"]) +def test_defined_column_in_new_table(column_mode: str) -> None: + pipeline = get_pipeline() + + @dlt.resource(name="items", columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) + def get_items(): + yield { + "id": 1, + "key": "value", + } + pipeline.run([get_items()], schema_contract={"columns": column_mode}) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_new_column_from_hint_and_data(column_mode: str) -> None: + + pipeline = get_pipeline() + + # we define complete column on id, this creates a complete table + # normalizer does not know that it is a new table and discards the row + # and it also excepts on column freeze + + @dlt.resource( + name="items", + columns=[{"name": "id", "data_type": "bigint", "nullable": False}]) + def get_items(): + yield { + "id": 1, + "key": "value", + } + + pipeline.run([get_items()], schema_contract={"columns": column_mode}) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 1 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_two_new_columns_from_two_rows(column_mode: str) -> None: + + pipeline = get_pipeline() + + # this creates a complete table in first row + # and adds a new column to complete tables in 
2nd row + # the test does not fail only because you clone schema in normalize + + @dlt.resource() + def items(): + yield { + "id": 1, + } + yield { + "id": 1, + "key": "value", + } + pipeline.run([items()], schema_contract={"columns": column_mode}) + assert pipeline.last_trace.last_normalize_info.row_counts["items"] == 2 + + +@pytest.mark.parametrize("column_mode", ["freeze", "discard_row", "evolve"]) +def test_dynamic_new_columns(column_mode: str) -> None: + + pipeline = get_pipeline() + + # fails because dlt is not able to add _dlt_load_id to tables. I think we should do an exception for those + # 1. schema.dlt_tables() - everything evolve + # 2. is_dlt_column (I hope we have helper) - column evolve, data_type freeze + + def dynamic_columns(item): + if item["id"] == 1: + return [{"name": "key", "data_type": "text", "nullable": True}] + if item["id"] == 2: + return [{"name": "id", "data_type": "bigint", "nullable": True}] + + @dlt.resource(name="items", table_name=lambda i: "items", schema_contract={"columns": column_mode}) # type: ignore + def get_items(): + yield { + "id": 1, + "key": "value", + } + yield { + "id": 2, + "key": "value", + } + + items = get_items() + items.apply_hints(columns=dynamic_columns) + # apply hints apply to `items` not the original resource, so doing get_items() below removed them completely + pipeline.run(items) + assert pipeline.last_trace.last_normalize_info.row_counts.get("items", 0) == 2 diff --git a/tests/pipeline/test_schema_updates.py b/tests/pipeline/test_schema_updates.py index 97345061e3..b88c1a7773 100644 --- a/tests/pipeline/test_schema_updates.py +++ b/tests/pipeline/test_schema_updates.py @@ -1,8 +1,10 @@ +import os import dlt def test_schema_updates() -> None: + os.environ["COMPLETED_PROB"] = "1.0" # make it complete immediately p = dlt.pipeline(pipeline_name="test_schema_updates", full_refresh=True, destination="dummy") @dlt.source() @@ -15,7 +17,7 @@ def resource(): # test without normalizer attributes s = source() p.run(s, table_name="items", write_disposition="append") - assert p.default_schema._normalizers_config["json"]["config"] == {} + assert "config" not in p.default_schema._normalizers_config["json"] # add table propagation s = source() @@ -45,12 +47,12 @@ def resource(): s = source() s.root_key = False p.run(s, table_name="items", write_disposition="merge") + # source schema overwrites normalizer settings so `root` propagation is gone assert p.default_schema._normalizers_config["json"]["config"] == { "propagation": { "tables": { "items": {'_dlt_id': '_dlt_root_id'} - }, - "root": {'_dlt_id': '_dlt_root_id'} + } } } @@ -62,8 +64,7 @@ def resource(): "propagation": { "tables": { "items": {'_dlt_id': '_dlt_root_id'} - }, - "root": {'_dlt_id': '_dlt_root_id'} + } }, "max_nesting": 5 } @@ -77,8 +78,7 @@ def resource(): "tables": { "items": {'_dlt_id': '_dlt_root_id'}, "items2": {'_dlt_id': '_dlt_root_id'}, - }, - "root": {'_dlt_id': '_dlt_root_id'} + } }, "max_nesting": 50 } \ No newline at end of file diff --git a/tests/pipeline/utils.py b/tests/pipeline/utils.py index 3e61c9510c..0d36ff3021 100644 --- a/tests/pipeline/utils.py +++ b/tests/pipeline/utils.py @@ -1,10 +1,16 @@ +import posixpath +from typing import Any, Dict, List, Tuple import pytest +import random from os import environ import dlt -from dlt.common import json -from dlt.common.pipeline import LoadInfo, PipelineContext +from dlt.common import json, sleep +from dlt.common.pipeline import LoadInfo +from dlt.common.schema.typing import LOADS_TABLE_NAME from dlt.common.typing 
import DictStrAny +from dlt.destinations.impl.filesystem.filesystem import FilesystemClient +from dlt.pipeline.exceptions import SqlClientNotAvailable from tests.utils import TEST_STORAGE_ROOT @@ -35,6 +41,154 @@ def load_json_case(name: str) -> DictStrAny: return json.load(f) +def load_table_counts(p: dlt.Pipeline, *table_names: str) -> DictStrAny: + """Returns row counts for `table_names` as dict""" + + # try sql, could be other destination though + try: + with p.sql_client() as c: + qualified_names = [c.make_qualified_table_name(name) for name in table_names] + query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(1) as c FROM {q_name}" for name, q_name in zip(table_names, qualified_names)]) + with c.execute_query(query) as cur: + rows = list(cur.fetchall()) + return {r[0]: r[1] for r in rows} + except SqlClientNotAvailable: + pass + + # try filesystem + file_tables = load_files(p, *table_names) + result = {} + for table_name, items in file_tables.items(): + result[table_name] = len(items) + return result + +def load_data_table_counts(p: dlt.Pipeline) -> DictStrAny: + tables = [table["name"] for table in p.default_schema.data_tables()] + return load_table_counts(p, *tables) + + +def assert_data_table_counts(p: dlt.Pipeline, expected_counts: DictStrAny) -> None: + table_counts = load_data_table_counts(p) + assert table_counts == expected_counts, f"Table counts do not match, expected {expected_counts}, got {table_counts}" + + +def load_file(path: str, file: str) -> Tuple[str, List[Dict[str, Any]]]: + """ + util function to load a filesystem destination file and return parsed content + values may not be cast to the right type, especially for insert_values, please + make sure to do conversions and casting if needed in your tests + """ + result: List[Dict[str, Any]] = [] + + # check if this is a file we want to read + file_name_items = file.split(".") + ext = file_name_items[-1] + if ext not in ["jsonl", "insert_values", "parquet"]: + return "skip", [] + + # table name will be last element of path + table_name = path.split("/")[-1] + + # skip loads table + if table_name == "_dlt_loads": + return table_name, [] + + full_path = posixpath.join(path, file) + + # load jsonl + if ext == "jsonl": + with open(full_path, "rU", encoding="utf-8") as f: + for line in f: + result.append(json.loads(line)) + + # load insert_values (this is a bit volatile if the exact format of the source file changes) + elif ext == "insert_values": + with open(full_path, "rU", encoding="utf-8") as f: + lines = f.readlines() + # extract col names + cols = lines[0][15:-2].split(",") + for line in lines[2:]: + values = line[1:-3].split(",") + result.append(dict(zip(cols, values))) + + # load parquet + elif ext == "parquet": + import pyarrow.parquet as pq + with open(full_path, "rb") as f: + table = pq.read_table(f) + cols = table.column_names + count = 0 + for column in table: + column_name = cols[count] + item_count = 0 + for item in column.to_pylist(): + if len(result) <= item_count: + result.append({column_name: item}) + else: + result[item_count][column_name] = item + item_count += 1 + count += 1 + + return table_name, result + + +def load_files(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: + """For now this will expect the standard layout in the filesystem destination, if changed the results will not be correct""" + client: FilesystemClient = p.destination_client() # type: ignore[assignment] + result: Dict[str, Any] = {} + for basedir, _dirs, files in 
client.fs_client.walk(client.dataset_path, detail=False, refresh=True): + for file in files: + table_name, items = load_file(basedir, file) + if table_name not in table_names: + continue + if table_name in result: + result[table_name] = result[table_name] + items + else: + result[table_name] = items + + # loads file is special case + if LOADS_TABLE_NAME in table_names and file.find(".{LOADS_TABLE_NAME}."): + result[LOADS_TABLE_NAME] = [] + + return result + + + +def load_tables_to_dicts(p: dlt.Pipeline, *table_names: str) -> Dict[str, List[Dict[str, Any]]]: + + # try sql, could be other destination though + try: + result = {} + for table_name in table_names: + table_rows = [] + columns = p.default_schema.get_table_columns(table_name).keys() + query_columns = ",".join(columns) + + with p.sql_client() as c: + f_q_table_name = c.make_qualified_table_name(table_name) + query = f"SELECT {query_columns} FROM {f_q_table_name}" + with c.execute_query(query) as cur: + for row in list(cur.fetchall()): + table_rows.append(dict(zip(columns, row))) + result[table_name] = table_rows + return result + + except SqlClientNotAvailable: + pass + + # try files + return load_files(p, *table_names) + + +def load_table_distinct_counts(p: dlt.Pipeline, distinct_column: str, *table_names: str) -> DictStrAny: + """Returns counts of distinct values for column `distinct_column` for `table_names` as dict""" + query = "\nUNION ALL\n".join([f"SELECT '{name}' as name, COUNT(DISTINCT {distinct_column}) as c FROM {name}" for name in table_names]) + with p.sql_client() as c: + with c.execute_query(query) as cur: + rows = list(cur.fetchall()) + return {r[0]: r[1] for r in rows} + + @dlt.source def airtable_emojis(): @@ -59,3 +213,20 @@ def wide_peacock(): return budget, schedule, peacock, wide_peacock + + +def run_deferred(iters): + + @dlt.defer + def item(n): + sleep(random.random() / 2) + return n + + for n in range(iters): + yield item(n) + + +@dlt.source +def many_delayed(many, iters): + for n in range(many): + yield dlt.resource(run_deferred(iters), name="resource_" + str(n)) diff --git a/tests/tools/clean_redshift.py b/tests/tools/clean_redshift.py index 7444d69685..27680b26cd 100644 --- a/tests/tools/clean_redshift.py +++ b/tests/tools/clean_redshift.py @@ -1,5 +1,5 @@ -from dlt.destinations.postgres.postgres import PostgresClient -from dlt.destinations.postgres.sql_client import psycopg2 +from dlt.destinations.impl.postgres.postgres import PostgresClient +from dlt.destinations.impl.postgres.sql_client import psycopg2 from psycopg2.errors import InsufficientPrivilege, InternalError_, SyntaxError CONNECTION_STRING = "" diff --git a/tests/utils.py b/tests/utils.py index 823b1cca83..8ec15a20ad 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -5,7 +5,7 @@ import requests import pytest from os import environ -from typing import Iterator, List +from typing import Any, Iterable, Iterator, List, Literal, Union, get_args from unittest.mock import patch from requests import Response @@ -21,7 +21,7 @@ from dlt.common.storages import FileStorage from dlt.common.schema import Schema from dlt.common.storages.versioned_storage import VersionedStorage -from dlt.common.typing import StrAny +from dlt.common.typing import StrAny, TDataItem from dlt.common.utils import custom_environ, uniq_id from dlt.common.pipeline import PipelineContext @@ -55,6 +55,13 @@ for destination in ACTIVE_DESTINATIONS: assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown active destination {destination}" + +# possible TDataItem types +TDataItemFormat 
= Literal["json", "pandas", "arrow", "arrow-batch"] +ALL_DATA_ITEM_FORMATS = get_args(TDataItemFormat) +"""List with TDataItem formats: json, arrow table/batch / pandas""" + + def TEST_DICT_CONFIG_PROVIDER(): # add test dictionary provider providers_context = Container()[ConfigProvidersContext] @@ -136,6 +143,7 @@ def unload_modules() -> Iterator[None]: @pytest.fixture(autouse=True) def wipe_pipeline() -> Iterator[None]: + """Wipes pipeline local state and deactivates it""" container = Container() if container[PipelineContext].is_active(): container[PipelineContext].deactivate() @@ -148,6 +156,26 @@ def wipe_pipeline() -> Iterator[None]: container[PipelineContext].deactivate() +def data_to_item_format(item_format: TDataItemFormat, data: Union[Iterator[TDataItem], Iterable[TDataItem]]) -> Any: + """Return the given data in the form of pandas, arrow table/batch or json items""" + if item_format == "json": + return data + + import pandas as pd + from dlt.common.libs.pyarrow import pyarrow as pa + + # Make dataframe from the data + df = pd.DataFrame(list(data)) + if item_format == "pandas": + return [df] + elif item_format == "arrow": + return [pa.Table.from_pandas(df)] + elif item_format == "arrow-batch": + return [pa.RecordBatch.from_pandas(df)] + else: + raise ValueError(f"Unknown item format: {item_format}") + + def init_test_logging(c: RunConfiguration = None) -> None: if not c: c = resolve_configuration(RunConfiguration()) @@ -182,6 +210,7 @@ def create_schema_with_name(schema_name) -> Schema: def assert_no_dict_key_starts_with(d: StrAny, key_prefix: str) -> None: assert all(not key.startswith(key_prefix) for key in d.keys()) + def skip_if_not_active(destination: str) -> None: assert destination in IMPLEMENTED_DESTINATIONS, f"Unknown skipped destination {destination}" if destination not in ACTIVE_DESTINATIONS:
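
For readers scanning the new test_schema_contracts.py suite above: the contract modes it parametrizes ("evolve", "discard_value", "discard_row", "freeze") can be set on a resource, on a source, or passed to pipeline.run() as an override, and a "freeze" violation surfaces as a DataValidationError wrapped in PipelineStepFailed. The minimal sketch below shows that pattern in isolation; the resource names and the plain duckdb destination are illustrative assumptions, not part of this diff.

import dlt
from dlt.common.schema.exceptions import DataValidationError
from dlt.pipeline.exceptions import PipelineStepFailed

# illustrative resources, not taken from the diff
@dlt.resource(name="items")
def items():
    yield {"id": 1, "name": "one"}

@dlt.resource(name="items")
def items_with_new_column():
    yield {"id": 2, "name": "two", "new_col": "surprise"}

pipeline = dlt.pipeline(pipeline_name="contract_demo", destination="duckdb", full_refresh=True)
pipeline.run([items()])  # the first load creates the `items` table and its columns

try:
    # "freeze" forbids new columns on the now-existing table; the violation is raised
    # as a DataValidationError wrapped in PipelineStepFailed, which is what
    # raises_frozen_exception() asserts on in the tests above
    pipeline.run([items_with_new_column()], schema_contract={"columns": "freeze"})
except PipelineStepFailed as step_failed:
    assert isinstance(step_failed.__context__, DataValidationError)

Swapping "freeze" for "discard_row", "discard_value" or "evolve" drops the offending row, drops only the new value, or lets the schema grow, and the same mapping also accepts "tables" and "data_type" keys; those are exactly the behaviors the parametrized row counts in the suite verify.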